- OpenDevin{" "}
-
- is an autonomous software engineer that can solve software engineering
- and web-browsing tasks end-to-end. It can perform data science queries, such
- as "Find the number of pull requests to the OpenDevin repository in the last
- month," and software engineering tasks, such as "Please add tests to this
- file and verify that all the tests pass. If they don't, fix the file."
-
-
-
-
- At the same time, OpenDevin is a platform and community for agent developers
- to test out and evaluate new agents.
-
-
-
-
-
- Support
-
-
-
- {`Please file a bug on {githubLink} if you notice a problem that likely affects others. If you're having trouble installing, or have general questions, reach out on {discordLink} or {slackLink}.`}
-
-
-
-
-
- How to fix a GitHub issue with OpenDevin?
-
-
-
- To fix an issue on GitHub using OpenDevin, send a prompt to OpenDevin asking it to follow
- steps like the following:
-
-
-
Read the issue https://github.com/OpenDevin/OpenDevin/issues/1611
-
Clone the repository and check out a new branch
-
Based on the instructions in the issue description, modify files to fix the issue
-
Push the resulting output to GitHub using the GITHUB_TOKEN environment variable
-
Tell me the link that I need to go to to send a pull request
-
-
- Before you run OpenDevin, you can do:
-
-
- export SANDBOX_ENV_GITHUB_TOKEN=XXX
-
-
- where XXX is a GitHub token you created with permission to push to the OpenDevin repo. If you don't have write permission to the OpenDevin repo, you might need to change that step to:
-
-
- Push the resulting output to my fork at https://github.com/USERNAME/OpenDevin/ using the GITHUB_TOKEN environment variable
-
-
- where USERNAME is your GitHub username.
-
-
-
-
-
- How is OpenDevin different from Devin?
-
-
- Devin
-
- is a commercial product from Cognition Inc. that served as the initial
- inspiration for OpenDevin. Both aim to solve software engineering tasks well,
- but you can download, use, and modify OpenDevin, while Devin is available only
- through the Cognition site. In addition, OpenDevin has evolved beyond its initial
- inspiration and now serves as a community-driven ecosystem for agent
- development in general, and we'd love to have you join and
-
- contribute!
-
-
-
-
- How is OpenDevin different from ChatGPT?
-
-
-
- You can access ChatGPT online, but it does not interface with local files, and
- its ability to execute code is limited: it can write code, but testing and
- executing that code is not easy.
-
-
-
-
-
- );
-}
diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx
index 7a2b9f0cc032..8f1605b0c6ee 100644
--- a/docs/src/pages/index.tsx
+++ b/docs/src/pages/index.tsx
@@ -4,12 +4,11 @@ import { HomepageHeader } from "../components/HomepageHeader/HomepageHeader";
import { Welcome } from "../components/Welcome/Welcome";
import { translate } from '@docusaurus/Translate';
-export function Header({ title, summary, description }): JSX.Element {
+export function Header({ title, summary }): JSX.Element {
return (
)}
{
expect(confirmationModeInput).toHaveAttribute("data-selected", "true");
});
- it("should display the existing values if it they are present", () => {
+ it("should display the existing values if they are present", () => {
renderSettingsForm({
LLM_MODEL: "model2",
AGENT: "agent2",
@@ -119,17 +119,18 @@ describe("SettingsForm", () => {
});
it("should call the onAgentChange handler when the agent changes", async () => {
+ const user = userEvent.setup();
renderSettingsForm();
+ // We need to enable the agent select
+ const agentSwitch = screen.getByTestId("enableagentselect");
+ await user.click(agentSwitch);
+
const agentInput = screen.getByRole("combobox", { name: "agent" });
- await act(async () => {
- await userEvent.click(agentInput);
- });
+ await user.click(agentInput);
const agent3 = screen.getByText("agent3");
- await act(async () => {
- await userEvent.click(agent3);
- });
+ await user.click(agent3);
expect(onAgentChangeMock).toHaveBeenCalledWith("agent3");
});
diff --git a/frontend/src/components/modals/settings/SettingsForm.tsx b/frontend/src/components/modals/settings/SettingsForm.tsx
index 927db98883c2..4a3ab2c87b9c 100644
--- a/frontend/src/components/modals/settings/SettingsForm.tsx
+++ b/frontend/src/components/modals/settings/SettingsForm.tsx
@@ -33,17 +33,10 @@ function SettingsForm({
}: SettingsFormProps) {
const { t } = useTranslation();
const { isOpen: isVisible, onOpenChange: onVisibleChange } = useDisclosure();
+ const [isAgentSelectEnabled, setIsAgentSelectEnabled] = React.useState(false);
return (
<>
- ({ value: agent, label: agent }))}
- defaultKey={settings.AGENT}
- onChange={onAgentChange}
- tooltip={t(I18nKey.SETTINGS$AGENT_TOOLTIP)}
- disabled={disabled}
- />
({ value: model, label: model }))}
@@ -88,6 +81,23 @@ function SettingsForm({
tooltip={t(I18nKey.SETTINGS$LANGUAGE_TOOLTIP)}
disabled={disabled}
/>
+ ({ value: agent, label: agent }))}
+ defaultKey={settings.AGENT}
+ onChange={onAgentChange}
+ tooltip={t(I18nKey.SETTINGS$AGENT_TOOLTIP)}
+ disabled={disabled || !isAgentSelectEnabled}
+ />
+
+ {t(I18nKey.SETTINGS$AGENT_SELECT_ENABLED)}
+ {
),
);
+ // We need to enable the agent select first
+ const agentSwitch = screen.getByTestId("enableagentselect");
+ await user.click(agentSwitch);
+
const resetButton = screen.getByRole("button", {
name: /MODAL_RESET_BUTTON_LABEL/i,
});
diff --git a/frontend/src/components/terminal/Terminal.test.tsx b/frontend/src/components/terminal/Terminal.test.tsx
index c8a583dca45c..254bb6290cca 100644
--- a/frontend/src/components/terminal/Terminal.test.tsx
+++ b/frontend/src/components/terminal/Terminal.test.tsx
@@ -44,7 +44,9 @@ describe("Terminal", () => {
expect(screen.getByText("Terminal")).toBeInTheDocument();
expect(mockTerminal.open).toHaveBeenCalledTimes(1);
- expect(mockTerminal.write).toHaveBeenCalledWith("$ ");
+ expect(mockTerminal.write).toHaveBeenCalledWith(
+ "opendevin@docker-desktop:/workspace $ ",
+ );
});
it("should load commands to the terminal", () => {
@@ -54,7 +56,7 @@ describe("Terminal", () => {
]);
expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "INPUT");
- expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "OUTPUT");
+ expect(mockTerminal.write).toHaveBeenNthCalledWith(2, "OUTPUT");
});
it("should write commands to the terminal", () => {
@@ -66,13 +68,13 @@ describe("Terminal", () => {
});
expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo Hello");
- expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");
+ expect(mockTerminal.write).toHaveBeenNthCalledWith(2, "Hello");
act(() => {
store.dispatch(appendInput("echo World"));
});
- expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo World");
+ expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "echo World");
});
it("should load and write commands to the terminal", () => {
@@ -82,13 +84,13 @@ describe("Terminal", () => {
]);
expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo Hello");
- expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "Hello");
+ expect(mockTerminal.write).toHaveBeenNthCalledWith(2, "Hello");
act(() => {
store.dispatch(appendInput("echo Hello"));
});
- expect(mockTerminal.writeln).toHaveBeenNthCalledWith(3, "echo Hello");
+ expect(mockTerminal.writeln).toHaveBeenNthCalledWith(2, "echo Hello");
});
it("should end the line with a dollar sign after writing a command", () => {
@@ -99,7 +101,9 @@ describe("Terminal", () => {
});
expect(mockTerminal.writeln).toHaveBeenCalledWith("echo Hello");
- expect(mockTerminal.write).toHaveBeenCalledWith("$ ");
+ expect(mockTerminal.write).toHaveBeenCalledWith(
+ "opendevin@docker-desktop:/workspace $ ",
+ );
});
// This test fails because it expects `disposeMock` to have been called before the component is unmounted.
diff --git a/frontend/src/hooks/useTerminal.ts b/frontend/src/hooks/useTerminal.ts
index d2665d33b543..587055fe1d06 100644
--- a/frontend/src/hooks/useTerminal.ts
+++ b/frontend/src/hooks/useTerminal.ts
@@ -34,7 +34,7 @@ export const useTerminal = (commands: Command[] = []) => {
terminal.current.loadAddon(fitAddon.current);
terminal.current.open(ref.current);
- terminal.current.write("$ ");
+ terminal.current.write("opendevin@docker-desktop:/workspace $ ");
terminal.current.onKey(({ key, domEvent }) => {
if (domEvent.key === "Enter") {
terminal.current?.write("\r\n");
@@ -93,15 +93,15 @@ export const useTerminal = (commands: Command[] = []) => {
// Start writing commands from the last command index
for (let i = lastCommandIndex.current; i < commands.length; i += 1) {
const command = commands[i];
- const lines = command.content.split("\n");
+ const lines = command.content.split("\r\n");
- lines.forEach((line: string) => {
- terminal.current?.writeln(line);
+ lines.forEach((line, index) => {
+ if (index < lines.length - 1 || command.type === "input") {
+ terminal.current?.writeln(line);
+ } else {
+ terminal.current?.write(line);
+ }
});
-
- if (command.type === "output") {
- terminal.current.write("\n$ ");
- }
}
lastCommandIndex.current = commands.length; // Update the position of the last command
diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json
index 4ec0d05db7e9..46c1dd46954d 100644
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -601,6 +601,12 @@
"zh-CN": "็ปง็ปญ",
"de": "Fortfahren"
},
+ "CHAT_INTERFACE$AUTO_MESSAGE": {
+ "en": "Please continue working on the task on whatever approach you think is suitable.\nIf you think you have solved the task, you can give to end the interaction.\nIMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n"
+ },
+ "CHAT_INTERFACE$INPUT_AUTO_MESSAGE":{
+ "en": "Sent the default Auto Msg ๐ค"
+ },
"CHAT_INTERFACE$USER_ASK_CONFIRMATION": {
"en": "Do you want to continue with this action?",
"de": "Mรถchten Sie mit dieser Aktion fortfahren?",
@@ -649,6 +655,11 @@
"zh-CN": "ๅ้ๆถๆฏ",
"de": "Nachricht senden"
},
+ "CHAT_INTERFACE$TOOLTIP_UPLOAD_IMAGE": {
+ "en": "Upload image",
+ "zh-CN": "ไธไผ ๅพ็",
+ "de": "Bild hochladen"
+ },
"CHAT_INTERFACE$INITIAL_MESSAGE": {
"en": "Hi! I'm OpenDevin, an AI Software Engineer. What would you like to build with me today?",
"zh-CN": "ไฝ ๅฅฝ๏ผๆๆฏ OpenDevin๏ผไธๅ AI ่ฝฏไปถๅทฅ็จๅธใไปๅคฉๆณๅๆไธ่ตท็ผๅไปไน็จๅบๅข?",
@@ -721,6 +732,9 @@
"de": "Wartet auf die Bestรคtigung des Benutzers, bevor der Code ausgefรผhrt wird.",
"zh-CN": "ๅจๆง่กไปฃ็ ไนๅ็ญๅพ ็จๆท็กฎ่ฎคใ"
},
+ "SETTINGS$AGENT_SELECT_ENABLED": {
+ "en": "Enable Agent Selection - Advanced Users"
+ },
"BROWSER$EMPTY_MESSAGE": {
"en": "No page loaded.",
"zh-CN": "้กต้ขๆชๅ ่ฝฝ",
diff --git a/frontend/src/index.css b/frontend/src/index.css
index 6e3a605ba0ce..a981d6b3411b 100644
--- a/frontend/src/index.css
+++ b/frontend/src/index.css
@@ -40,3 +40,12 @@ code {
padding: 0;
background-color: inherit;
}
+
+.markdown-body {
+ white-space: pre-wrap; /* Handles line breaks */
+}
+
+.markdown-body ul {
+ list-style-type: disc; /* Handles bullet points */
+ margin-left: 20px;
+}
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
index b1d91f5485fc..c55b52b3258b 100644
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -28,7 +28,9 @@ const messageActions = {
},
[ActionType.MESSAGE]: (message: ActionMessage) => {
if (message.source === "user") {
- store.dispatch(addUserMessage(message.args.content));
+ store.dispatch(
+ addUserMessage({ content: message.args.content, imageUrls: [] }),
+ );
} else {
store.dispatch(addAssistantMessage(message.args.content));
}
diff --git a/frontend/src/services/browseService.ts b/frontend/src/services/browseService.ts
new file mode 100644
index 000000000000..41525f1d30e7
--- /dev/null
+++ b/frontend/src/services/browseService.ts
@@ -0,0 +1,8 @@
+import ActionType from "#/types/ActionType";
+import Session from "./session";
+
+export function updateBrowserTabUrl(newUrl: string): void {
+ const event = { action: ActionType.BROWSE, args: { url: newUrl } };
+ const eventString = JSON.stringify(event);
+ Session.send(eventString);
+}
diff --git a/frontend/src/services/chatService.ts b/frontend/src/services/chatService.ts
index af1ab45ce86b..d857fb603f44 100644
--- a/frontend/src/services/chatService.ts
+++ b/frontend/src/services/chatService.ts
@@ -1,8 +1,11 @@
import ActionType from "#/types/ActionType";
import Session from "./session";
-export function sendChatMessage(message: string): void {
- const event = { action: ActionType.MESSAGE, args: { content: message } };
+export function sendChatMessage(message: string, images_urls: string[]): void {
+ const event = {
+ action: ActionType.MESSAGE,
+ args: { content: message, images_urls },
+ };
const eventString = JSON.stringify(event);
Session.send(eventString);
}
diff --git a/frontend/src/services/fileService.ts b/frontend/src/services/fileService.ts
index bdbae8991ebf..77751c43f411 100644
--- a/frontend/src/services/fileService.ts
+++ b/frontend/src/services/fileService.ts
@@ -67,10 +67,14 @@ export async function uploadFiles(files: FileList): Promise {
};
}
-export async function listFiles(path: string = "/"): Promise {
- const data = await request(
- `/api/list-files?path=${encodeURIComponent(path)}`,
- );
+export async function listFiles(
+ path: string | undefined = undefined,
+): Promise {
+ let url = "/api/list-files";
+ if (path) {
+ url = `/api/list-files?path=${encodeURIComponent(path)}`;
+ }
+ const data = await request(url);
if (!Array.isArray(data)) {
throw new Error("Invalid response format: data is not an array");
}
diff --git a/frontend/src/state/chatSlice.ts b/frontend/src/state/chatSlice.ts
index 757806e1e048..a1b01fa77682 100644
--- a/frontend/src/state/chatSlice.ts
+++ b/frontend/src/state/chatSlice.ts
@@ -10,12 +10,15 @@ export const chatSlice = createSlice({
name: "chat",
initialState,
reducers: {
- addUserMessage(state, action: PayloadAction) {
+ addUserMessage(
+ state,
+ action: PayloadAction<{ content: string; imageUrls: string[] }>,
+ ) {
const message: Message = {
sender: "user",
- content: action.payload,
+ content: action.payload.content,
+ imageUrls: action.payload.imageUrls,
};
-
state.messages.push(message);
},
@@ -23,8 +26,8 @@ export const chatSlice = createSlice({
const message: Message = {
sender: "assistant",
content: action.payload,
+ imageUrls: [],
};
-
state.messages.push(message);
},
diff --git a/opendevin/README.md b/opendevin/README.md
index f977abbdeb10..11c52dcab2e8 100644
--- a/opendevin/README.md
+++ b/opendevin/README.md
@@ -3,11 +3,7 @@
This directory contains the core components of OpenDevin.
This diagram provides an overview of the roles of each component and how they communicate and collaborate.
-
-
-
-
OpenDevin System Architecture Diagram (July 4, 2024)
-
+
## Classes
The key classes in OpenDevin are:
diff --git a/opendevin/condenser/condenser.py b/opendevin/condenser/condenser.py
new file mode 100644
index 000000000000..d1fef59117fb
--- /dev/null
+++ b/opendevin/condenser/condenser.py
@@ -0,0 +1,147 @@
+from opendevin.core.exceptions import (
+ SummarizeError,
+)
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.message import Message, TextContent
+from opendevin.events.action import (
+ AgentSummarizeAction,
+)
+
+from .prompts import (
+ MESSAGE_SUMMARY_WARNING_FRAC,
+ SUMMARY_PROMPT_SYSTEM,
+ parse_summary_response,
+)
+
+
+class CondenserMixin:
+ """Condenses a group of condensable messages as done by MemGPT."""
+
+ def condense(
+ self,
+ messages: list[Message],
+ ):
+ # Start past the system message and the example messages,
+ # and collect messages for summarization until we reach the desired truncation token fraction (e.g., 50%)
+ # Do not allow truncation for in-context examples of function calling
+ token_counts = [
+ self.get_token_count([message]) # type: ignore
+ for message in messages
+ if message.condensable
+ ]
+ message_buffer_token_count = sum(token_counts) # no system and example message
+
+ desired_token_count_to_summarize = int(
+ message_buffer_token_count * self.config.message_summary_trunc_tokens_frac # type: ignore
+ )
+
+ candidate_messages_to_summarize = []
+ tokens_so_far = 0
+ for message in messages:
+ if message.condensable:
+ candidate_messages_to_summarize.append(message)
+ tokens_so_far += self.get_token_count([message]) # type: ignore
+ if tokens_so_far > desired_token_count_to_summarize:
+ last_summarized_event_id = message.event_id
+ break
+
+ # TODO: Add functionality for preserving last N messages
+ # MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST = 3
+ # if preserve_last_N_messages:
+ # candidate_messages_to_summarize = candidate_messages_to_summarize[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
+ # token_counts = token_counts[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
+
+ logger.debug(
+ f'message_summary_trunc_tokens_frac={self.config.message_summary_trunc_tokens_frac}' # type: ignore
+ )
+ # logger.debug(f'MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST={MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST}')
+ logger.debug(f'token_counts={token_counts}')
+ logger.debug(f'message_buffer_token_count={message_buffer_token_count}')
+ logger.debug(
+ f'desired_token_count_to_summarize={desired_token_count_to_summarize}'
+ )
+ logger.debug(
+ f'len(candidate_messages_to_summarize)={len(candidate_messages_to_summarize)}'
+ )
+
+ if len(candidate_messages_to_summarize) == 0:
+ raise SummarizeError(
+ f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(messages)}]"
+ )
+
+ # TODO: Try to make an assistant message come after the cutoff
+
+ message_sequence_to_summarize = candidate_messages_to_summarize
+
+ if len(message_sequence_to_summarize) <= 1:
+ # This prevents a potential infinite loop of summarizing the same message over and over
+ raise SummarizeError(
+ f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(message_sequence_to_summarize)} <= 1]"
+ )
+ else:
+ print(
+ f'Attempting to summarize with last summarized event id = {last_summarized_event_id}'
+ )
+
+ action_response = self.summarize_messages(
+ message_sequence_to_summarize=message_sequence_to_summarize
+ )
+ summary_action: AgentSummarizeAction = parse_summary_response(action_response)
+ summary_action.last_summarized_event_id = (
+ last_summarized_event_id if last_summarized_event_id else -1
+ )
+ return summary_action
+
+ def _format_summary_history(self, message_history: list[dict]) -> str:
+ # TODO use existing prompt formatters for this (eg ChatML)
+ return '\n'.join([f'{m["role"]}: {m["content"]}' for m in message_history])
+
+ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
+ """Summarize a message sequence using LLM"""
+ context_window = self.config.max_input_tokens # type: ignore
+ summary_prompt = SUMMARY_PROMPT_SYSTEM
+ summary_input = self._format_summary_history(
+ self.get_text_messages(message_sequence_to_summarize) # type: ignore
+ )
+ summary_input_tkns = self.get_token_count(text=summary_input) # type: ignore
+ if context_window is None:
+ raise ValueError('context_window should not be None')
+ if summary_input_tkns > MESSAGE_SUMMARY_WARNING_FRAC * context_window:
+ trunc_ratio = (
+ MESSAGE_SUMMARY_WARNING_FRAC * context_window / summary_input_tkns
+ ) * 0.8 # For good measure...
+ cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
+ curr_summary = self.summarize_messages(
+ message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
+ )
+ curr_summary = parse_summary_response(curr_summary)
+ curr_summary_message = (
+ 'Summary of all Action and Observations till now. \n'
+ f'Action: {curr_summary.summarized_actions}\n'
+ f'Observation: {curr_summary.summarized_observations}'
+ )
+ input = [
+ Message(
+ role='assistant', content=[TextContent(text=curr_summary_message)]
+ )
+ ] + message_sequence_to_summarize[cutoff:]
+ summary_input = self._format_summary_history(self.get_text_messages(input)) # type: ignore
+
+ message_sequence = []
+ message_sequence.append(
+ Message(role='system', content=[TextContent(text=summary_prompt)])
+ )
+ message_sequence.append(
+ Message(role='user', content=[TextContent(text=summary_input)])
+ )
+
+ response = self.completion( # type: ignore
+ messages=message_sequence,
+ temperature=0.0,
+ condense=True,
+ )
+
+ print(f'summarize_messages gpt reply: {response.choices[0].message.content}')
+
+ action_response = response['choices'][0]['message']['content']
+ return action_response
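For readers following the patch, here is a minimal, standalone sketch of just the selection step in `CondenserMixin.condense` above: walk the condensable messages in order and stop once roughly the configured fraction (`message_summary_trunc_tokens_frac`, defaulting to 0.75 in this diff) of the buffer's tokens has been collected. The `Msg` class and the word-count "tokenizer" are illustrative stand-ins, not part of the patch.

```python
from dataclasses import dataclass


@dataclass
class Msg:
    text: str
    condensable: bool = True


def token_count(m: Msg) -> int:
    # crude word-count stand-in for LLM.get_token_count
    return len(m.text.split())


def pick_messages_to_summarize(messages: list[Msg], frac: float = 0.75) -> list[Msg]:
    """Collect the oldest condensable messages until ~frac of the buffer's tokens is reached."""
    condensable = [m for m in messages if m.condensable]
    budget = int(sum(token_count(m) for m in condensable) * frac)
    picked, so_far = [], 0
    for m in condensable:
        picked.append(m)
        so_far += token_count(m)
        if so_far > budget:
            break
    return picked


history = [Msg('system prompt', condensable=False)] + [
    Msg('word ' * n) for n in (5, 20, 40, 10)
]
print(len(pick_messages_to_summarize(history)))  # 3: the oldest messages crossing the budget
```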
diff --git a/opendevin/condenser/prompts.py b/opendevin/condenser/prompts.py
new file mode 100644
index 000000000000..a1de3ae9fb30
--- /dev/null
+++ b/opendevin/condenser/prompts.py
@@ -0,0 +1,74 @@
+import re
+
+from opendevin.core.exceptions import (
+ InvalidSummaryResponseError,
+ LLMMalformedActionError,
+ LLMResponseError,
+)
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.utils import json
+from opendevin.events.action.agent import AgentSummarizeAction
+from opendevin.events.event import EventSource
+from opendevin.events.serialization.action import action_from_dict
+
+WORD_LIMIT = 200
+MESSAGE_SUMMARY_WARNING_FRAC = 0.75
+
+SUMMARY_PROMPT_SYSTEM = """
+Your job is to summarize a history of previous messages in a conversation between an AI persona and a human. The conversation you are given is from a fixed context window and may not be complete. Keep your summary less than {WORD_LIMIT} words, do NOT exceed this word limit.
+Only output the summary, do NOT include anything else in your output.
+Given the following actions and observations, create a JSON response with:
+ - "action": "summarize"
+ - args:
+ - "summarized_actions": A precise sentence summarizing all the provided actions, written in the first person.
+ - "summarized_observations": A few precise sentences summarizing all the provided observations, written in the third person.
+Example:
+{
+ "action": "summarize",
+ "args": {
+ "summarized_actions": "I located the UML specification PDF, parsed its content, and searched for information about sequence diagrams.",
+ "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
+ }
+}
+Make sure to include in observations any relevant information that the agent needs to remember.
+%(events)s
+"""
+
+
+def parse_summary_response(response: str) -> AgentSummarizeAction:
+ """
+ Parses a JSON summary of events.
+
+ Parameters:
+ - response: The response string to be parsed
+ Returns:
+ - The summary action output by the model
+ """
+ try:
+ try:
+ action_dict = json.loads(response)
+ except LLMResponseError:
+ pattern = r'(?<=Action:\s).+?(?=\s*Observation:)|(?<=Observation:\s).+'
+ matches = re.findall(pattern, response.replace('**', ''), re.DOTALL)
+ action_summary = matches[0].strip() if matches else None
+ observation_summary = matches[1].strip() if len(matches) > 1 else None
+ action_dict = {
+ 'action': 'summarize',
+ 'args': {
+ 'summarized_actions': action_summary,
+ 'summarized_observations': observation_summary,
+ },
+ }
+
+ action = action_from_dict(action_dict)
+ if action is None or not isinstance(action, AgentSummarizeAction):
+ error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
+ logger.error(error_message)
+ raise InvalidSummaryResponseError(error_message)
+ action._source = EventSource.AGENT # type: ignore
+ except (LLMResponseError, LLMMalformedActionError) as e:
+ logger.error(f'Failed to parse summary response: {str(e)}')
+ raise InvalidSummaryResponseError(
+ f'Failed to parse the response: {str(e)}'
+ ) from e
+ return action
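As a hedged illustration of the fallback path in `parse_summary_response` above: when the model replies in plain "Action: ... Observation: ..." prose instead of JSON, the regex pulls out the two summary fields (after stripping `**` markers, as the code does). The sample response below is invented for demonstration.

```python
import re

# An invented plain-text reply that the JSON parse would fail on:
response = (
    'Action: I ran the test suite and fixed the failing assertion. '
    'Observation: All tests now pass.'
)
pattern = r'(?<=Action:\s).+?(?=\s*Observation:)|(?<=Observation:\s).+'
matches = re.findall(pattern, response.replace('**', ''), re.DOTALL)
summarized_actions = matches[0].strip() if matches else None
summarized_observations = matches[1].strip() if len(matches) > 1 else None
print(summarized_actions)       # "I ran the test suite and fixed the failing assertion."
print(summarized_observations)  # "All tests now pass."
```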
diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py
index bbb0e11cf9fd..533322d5ff8d 100644
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -21,6 +21,7 @@
AgentDelegateAction,
AgentFinishAction,
AgentRejectAction,
+ AgentSummarizeAction,
ChangeAgentStateAction,
CmdRunAction,
IPythonRunCellAction,
@@ -36,6 +37,7 @@
ErrorObservation,
Observation,
)
+from opendevin.events.observation.browse import BrowserOutputObservation
from opendevin.llm.llm import LLM
# note: RESUME is only available on web GUI
@@ -131,9 +133,9 @@ async def report_error(self, message: str, exception: Exception | None = None):
- a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
- an ErrorObservation that can be sent to the LLM by the agent, with the exception message, so it can self-correct next time.
"""
- self.state.last_error = message
if exception:
- self.state.last_error += f': {exception}'
+ message += f': {exception}'
+ self.state.last_error = message
self.event_stream.add_event(ErrorObservation(message), EventSource.AGENT)
async def _start_step_loop(self):
@@ -145,7 +147,6 @@ async def _start_step_loop(self):
logger.info('AgentController task was cancelled')
break
except Exception as e:
- traceback.print_exc()
logger.error(f'Error while running the agent: {e}')
logger.error(traceback.format_exc())
await self.report_error(
@@ -161,6 +162,10 @@ async def on_event(self, event: Event):
await self.set_agent_state_to(event.agent_state) # type: ignore
elif isinstance(event, MessageAction):
if event.source == EventSource.USER:
+ logger.info(
+ event,
+ extra={'msg_type': 'ACTION', 'event_source': EventSource.USER},
+ )
if self.get_agent_state() != AgentState.RUNNING:
await self.set_agent_state_to(AgentState.RUNNING)
elif event.source == EventSource.AGENT and event.wait_for_response:
@@ -196,6 +201,8 @@ async def on_event(self, event: Event):
logger.info(event, extra={'msg_type': 'OBSERVATION'})
elif isinstance(event, CmdOutputObservation):
logger.info(event, extra={'msg_type': 'OBSERVATION'})
+ elif isinstance(event, BrowserOutputObservation):
+ logger.info(event, extra={'msg_type': 'OBSERVATION'})
elif isinstance(event, AgentDelegateObservation):
self.state.history.on_event(event)
logger.info(event, extra={'msg_type': 'OBSERVATION'})
@@ -376,16 +383,27 @@ async def _step(self) -> None:
self.state.traffic_control_state = TrafficControlState.NORMAL
else:
self.state.traffic_control_state = TrafficControlState.THROTTLING
- await self.report_error(
- f'Task budget exceeded. Current cost: {current_cost:.2f}, Max budget: {self.max_budget_per_task:.2f}, task paused. {TRAFFIC_CONTROL_REMINDER}'
- )
- await self.set_agent_state_to(AgentState.PAUSED)
+ if self.headless_mode:
+ # set to ERROR state if running in headless mode
+ # there is no way to resume
+ await self.report_error(
+ f'Task budget exceeded. Current cost: {current_cost:.2f}, max budget: {self.max_budget_per_task:.2f}, task stopped.'
+ )
+ await self.set_agent_state_to(AgentState.ERROR)
+ else:
+ await self.report_error(
+ f'Task budget exceeded. Current cost: {current_cost:.2f}, Max budget: {self.max_budget_per_task:.2f}, task paused. {TRAFFIC_CONTROL_REMINDER}'
+ )
+ await self.set_agent_state_to(AgentState.PAUSED)
return
self.update_state_before_step()
action: Action = NullAction()
try:
action = self.agent.step(self.state)
+ if isinstance(action, AgentSummarizeAction):
+ self.state.history.add_summary(action)
+ return
if action is None:
raise LLMNoActionError('No action was returned')
except (LLMMalformedActionError, LLMNoActionError, LLMResponseError) as e:
diff --git a/opendevin/controller/state/state.py b/opendevin/controller/state/state.py
index a8dd2ed33372..372a66767930 100644
--- a/opendevin/controller/state/state.py
+++ b/opendevin/controller/state/state.py
@@ -2,6 +2,7 @@
import pickle
from dataclasses import dataclass, field
from enum import Enum
+from typing import Any
from opendevin.controller.state.task import RootTask
from opendevin.core.logger import opendevin_logger as logger
@@ -106,6 +107,9 @@ class State:
start_id: int = -1
end_id: int = -1
almost_stuck: int = 0
+ # NOTE: This will never be used by the controller, but it can be used by different
+ # evaluation tasks to store extra data needed to track the progress/state of the task.
+ extra_data: dict[str, Any] = field(default_factory=dict)
def save_to_session(self, sid: str, file_store: FileStore):
pickled = pickle.dumps(self)
@@ -126,10 +130,17 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
except Exception as e:
logger.error(f'Failed to restore state from session: {e}')
raise e
+
+ # update state
if state.agent_state in RESUMABLE_STATES:
state.resume_state = state.agent_state
else:
state.resume_state = None
+
+ # don't carry last_error anymore after restore
+ state.last_error = None
+
+ # first state after restore
state.agent_state = AgentState.LOADING
return state
@@ -160,13 +171,15 @@ def __setstate__(self, state):
# remove the restored data from the state if any
def get_current_user_intent(self):
- """Returns the latest user message that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
+ """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
last_user_message = None
+ last_user_message_image_urls: list[str] | None = []
for event in self.history.get_events(reverse=True):
if isinstance(event, MessageAction) and event.source == 'user':
last_user_message = event.content
+ last_user_message_image_urls = event.images_urls
elif isinstance(event, AgentFinishAction):
if last_user_message is not None:
return last_user_message
- return last_user_message
+ return last_user_message, last_user_message_image_urls
diff --git a/opendevin/core/config.py b/opendevin/core/config.py
index 4fff99216a99..70cd269528b1 100644
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@@ -78,6 +78,8 @@ class LLMConfig:
input_cost_per_token: float | None = None
output_cost_per_token: float | None = None
ollama_base_url: str | None = None
+ message_summary_trunc_tokens_frac: float = 0.75
+ attempts_to_condense: int = 2
drop_params: bool | None = None
def defaults_to_dict(self) -> dict:
@@ -111,6 +113,12 @@ def to_safe_dict(self):
ret[k] = '******' if v else None
return ret
+ def set_missing_attributes(self):
+ """Set any missing attributes to their default values."""
+ for field_name, field_obj in self.__dataclass_fields__.items():
+ if not hasattr(self, field_name):
+ setattr(self, field_name, field_obj.default)
+
@dataclass
class AgentConfig:
@@ -139,23 +147,31 @@ class SandboxConfig(metaclass=Singleton):
"""Configuration for the sandbox.
Attributes:
- box_type: The type of sandbox to use. Options are: ssh, e2b, local.
+ api_hostname: The hostname for the EventStream Runtime API.
container_image: The container image to use for the sandbox.
user_id: The user ID for the sandbox.
timeout: The timeout for the sandbox.
enable_auto_lint: Whether to enable auto-lint.
use_host_network: Whether to use the host network.
initialize_plugins: Whether to initialize plugins.
- update_source_code: Whether to update the source code in the EventStreamRuntime.
- Used for development of EventStreamRuntime.
+ od_runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
+ This will be rendered into the end of the Dockerfile that builds the runtime image.
+ It can contain any valid shell commands (e.g., pip install numpy).
+ The path to the interpreter is available as $OD_INTERPRETER_PATH,
+ which can be used to install dependencies for the OD-specific Python interpreter.
+ od_runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
+ This is a dictionary of key-value pairs.
+ This is useful for setting environment variables that are needed by the runtime.
+ For example, specifying the base URL of the website used for BrowserGym evaluation.
+ browsergym_eval_env: The BrowserGym environment to use for evaluation.
+ Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
+ persist_sandbox: Whether to persist the sandbox after the task is done.
+ fast_boot: Whether to use a fast boot mode for the sandbox.
+ port: The port to use for the sandbox.
"""
- box_type: str = 'ssh'
- container_image: str = 'ghcr.io/opendevin/sandbox' + (
- f':{os.getenv("OPEN_DEVIN_BUILD_VERSION")}'
- if os.getenv('OPEN_DEVIN_BUILD_VERSION')
- else ':main'
- )
+ api_hostname: str = 'localhost'
+ container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22' # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
timeout: int = 120
enable_auto_lint: bool = (
@@ -163,7 +179,12 @@ class SandboxConfig(metaclass=Singleton):
)
use_host_network: bool = False
initialize_plugins: bool = True
- update_source_code: bool = False
+ od_runtime_extra_deps: str | None = None
+ od_runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
+ browsergym_eval_env: str | None = None
+ persist_sandbox: bool = False
+ fast_boot: bool = False
+ port: int = 63710
def defaults_to_dict(self) -> dict:
"""Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
@@ -211,7 +232,6 @@ class AppConfig(metaclass=Singleton):
max_iterations: The maximum number of iterations.
max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
e2b_api_key: The E2B API key.
- ssh_hostname: The SSH hostname.
disable_color: Whether to disable color. For terminals that don't support color.
debug: Whether to enable debugging.
enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
@@ -224,13 +244,14 @@ class AppConfig(metaclass=Singleton):
agents: dict = field(default_factory=dict)
default_agent: str = _DEFAULT_AGENT
sandbox: SandboxConfig = field(default_factory=SandboxConfig)
- runtime: str = 'server'
+ runtime: str = 'eventstream'
file_store: str = 'memory'
file_store_path: str = '/tmp/file_store'
+ # TODO: clean up workspace path after the removal of ServerRuntime
workspace_base: str = os.path.join(os.getcwd(), 'workspace')
- workspace_mount_path: str = (
+ workspace_mount_path: str | None = (
UndefinedString.UNDEFINED # this path should always be set when config is fully loaded
- )
+ ) # when set to None, do not mount the workspace
workspace_mount_path_in_sandbox: str = '/workspace'
workspace_mount_rewrite: str | None = None
cache_dir: str = '/tmp/cache'
@@ -239,11 +260,7 @@ class AppConfig(metaclass=Singleton):
max_iterations: int = _MAX_ITERATIONS
max_budget_per_task: float | None = None
e2b_api_key: str = ''
- ssh_hostname: str = 'localhost'
disable_color: bool = False
- persist_sandbox: bool = False
- ssh_port: int = 63710
- ssh_password: str | None = None
jwt_secret: str = uuid.uuid4().hex
debug: bool = False
enable_cli_session: bool = False
@@ -315,7 +332,6 @@ def __str__(self):
'e2b_api_key',
'github_token',
'jwt_secret',
- 'ssh_password',
]:
attr_value = '******' if attr_value else None
@@ -390,6 +406,11 @@ def set_attr_from_env(sub_config: Any, prefix=''):
elif env_var_name in env_or_toml_dict:
# convert the env var to the correct type and set it
value = env_or_toml_dict[env_var_name]
+
+ # skip empty config values (fall back to default)
+ if not value:
+ continue
+
try:
# if it's an optional type, get the non-None type
if get_origin(field_type) is UnionType:
@@ -406,11 +427,6 @@ def set_attr_from_env(sub_config: Any, prefix=''):
f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
)
- if 'SANDBOX_TYPE' in env_or_toml_dict:
- logger.opendevin_logger.error(
- 'SANDBOX_TYPE is deprecated. Please use SANDBOX_BOX_TYPE instead.'
- )
- env_or_toml_dict['SANDBOX_BOX_TYPE'] = env_or_toml_dict.pop('SANDBOX_TYPE')
# Start processing from the root of the config object
set_attr_from_env(cfg)
@@ -433,8 +449,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
try:
with open(toml_file, 'r', encoding='utf-8') as toml_contents:
toml_config = toml.load(toml_contents)
- except FileNotFoundError as e:
- logger.opendevin_logger.info(f'Config file not found: {e}')
+ except FileNotFoundError:
return
except toml.TomlDecodeError as e:
logger.opendevin_logger.warning(
@@ -501,8 +516,6 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
for key in keys_to_migrate:
new_key = key.replace('sandbox_', '')
- if new_key == 'type':
- new_key = 'box_type'
if new_key in sandbox_config.__annotations__:
# read the key in sandbox and remove it from core
setattr(sandbox_config, new_key, core_config.pop(key))
@@ -529,10 +542,6 @@ def finalize_config(cfg: AppConfig):
cfg.workspace_mount_path = os.path.abspath(cfg.workspace_base)
cfg.workspace_base = os.path.abspath(cfg.workspace_base)
- # In local there is no sandbox, the workspace will have the same pwd as the host
- if cfg.sandbox.box_type == 'local':
- cfg.workspace_mount_path_in_sandbox = cfg.workspace_mount_path
-
if cfg.workspace_mount_rewrite: # and not config.workspace_mount_path:
# TODO why do we need to check if workspace_mount_path is None?
base = cfg.workspace_base or os.getcwd()
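A minimal sketch of how the new `SandboxConfig` fields documented above might be set for an evaluation run, based on the dataclass fields in this diff; the concrete values (extra deps, env vars, URL) are placeholders, not recommendations.

```python
from opendevin.core.config import AppConfig, SandboxConfig

config = AppConfig(
    runtime='eventstream',  # the new default runtime in this diff
    sandbox=SandboxConfig(
        container_image='nikolaik/python-nodejs:python3.11-nodejs22',
        # rendered into the end of the Dockerfile that builds the runtime image
        od_runtime_extra_deps='$OD_INTERPRETER_PATH -m pip install numpy',
        # placeholder env vars passed to the runtime at launch
        od_runtime_startup_env_vars={'BASE_URL': 'http://localhost:3000'},
        browsergym_eval_env=None,  # set to a BrowserGym task id for browsing evals
    ),
)
print(config.sandbox.container_image)
```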
diff --git a/opendevin/core/exceptions.py b/opendevin/core/exceptions.py
index fe973083962c..ca130dbb9d13 100644
--- a/opendevin/core/exceptions.py
+++ b/opendevin/core/exceptions.py
@@ -67,3 +67,34 @@ def __init__(self, message='Agent must return an action'):
class LLMResponseError(Exception):
def __init__(self, message='Failed to retrieve action from LLM response'):
super().__init__(message)
+
+
+class TokenLimitExceededError(Exception):
+ """Exception raised when the user-defined max_input_tokens limit is exceeded."""
+
+ def __init__(self, message='User-defined token limit exceeded. Condensing memory.'):
+ super().__init__(message)
+
+
+class ContextWindowLimitExceededError(Exception):
+ def __init__(
+ self, message='Context window limit exceeded. Unable to condense memory.'
+ ):
+ super().__init__(message)
+
+
+class SummarizeError(Exception):
+ """Exception raised when message can't be Summarized."""
+
+ def __init__(self, message='Error Summarizing The Memory'):
+ super().__init__(message)
+
+
+class InvalidSummaryResponseError(Exception):
+ def __init__(self, message='Invalid summary response'):
+ super().__init__(message)
+
+
+class UserCancelledError(Exception):
+ def __init__(self, message='User cancelled the request'):
+ super().__init__(message)
diff --git a/opendevin/core/logger.py b/opendevin/core/logger.py
index 5bde01847504..f765cdff9958 100644
--- a/opendevin/core/logger.py
+++ b/opendevin/core/logger.py
@@ -9,7 +9,7 @@
from termcolor import colored
DISABLE_COLOR_PRINTING = False
-DEBUG = False
+DEBUG = os.getenv('DEBUG', 'False').lower() in ['true', '1', 'yes']
ColorType = Literal[
'red',
@@ -87,7 +87,6 @@ def filter(self, record):
'e2b_api_key',
'github_token',
'jwt_secret',
- 'ssh_password',
]
# add env var names
@@ -123,9 +122,8 @@ def get_console_handler():
return console_handler
-def get_file_handler(log_dir=None):
+def get_file_handler(log_dir):
"""Returns a file handler for logging."""
- log_dir = os.path.join(os.getcwd(), 'logs') if log_dir is None else log_dir
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime('%Y-%m-%d')
file_name = f'opendevin_{timestamp}.log'
@@ -159,16 +157,21 @@ def log_uncaught_exceptions(ex_cls, ex, tb):
opendevin_logger = logging.getLogger('opendevin')
opendevin_logger.setLevel(logging.INFO)
+LOG_DIR = os.path.join(
+ # parent dir of opendevin/core (i.e., root of the repo)
+ os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+ 'logs',
+)
if DEBUG:
opendevin_logger.setLevel(logging.DEBUG)
-opendevin_logger.addHandler(get_file_handler())
+ # default log to project root
+ opendevin_logger.info('DEBUG logging is enabled. Logging to %s', LOG_DIR)
+opendevin_logger.addHandler(get_file_handler(LOG_DIR))
opendevin_logger.addHandler(get_console_handler())
opendevin_logger.addFilter(SensitiveDataFilter(opendevin_logger.name))
opendevin_logger.propagate = False
opendevin_logger.debug('Logging initialized')
-opendevin_logger.debug(
- 'Logging to %s', os.path.join(os.getcwd(), 'logs', 'opendevin.log')
-)
+
# Exclude LiteLLM from logging output
logging.getLogger('LiteLLM').disabled = True
@@ -194,7 +197,7 @@ def __init__(self, filename, mode='a', encoding='utf-8', delay=False):
self.session = datetime.now().strftime('%y-%m-%d_%H-%M')
else:
self.session = 'default'
- self.log_directory = os.path.join(os.getcwd(), 'logs', 'llm', self.session)
+ self.log_directory = os.path.join(LOG_DIR, 'llm', self.session)
os.makedirs(self.log_directory, exist_ok=True)
if not DEBUG:
# Clear the log directory if not in debug mode
diff --git a/opendevin/core/main.py b/opendevin/core/main.py
index 34f58082979c..9e89d5385bad 100644
--- a/opendevin/core/main.py
+++ b/opendevin/core/main.py
@@ -1,13 +1,18 @@
import asyncio
-import os
import sys
+import uuid
from typing import Callable, Type
import agenthub # noqa F401 (we import this to get the agents registered)
from opendevin.controller import AgentController
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
-from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
+from opendevin.core.config import (
+ AppConfig,
+ get_llm_config_arg,
+ load_app_config,
+ parse_arguments,
+)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.schema import AgentState
from opendevin.events import EventSource, EventStream, EventStreamSubscriber
@@ -16,11 +21,9 @@
from opendevin.events.observation import AgentStateChangedObservation
from opendevin.llm.llm import LLM
from opendevin.runtime import get_runtime_cls
-from opendevin.runtime.sandbox import Sandbox
+from opendevin.runtime.runtime import Runtime
from opendevin.storage import get_file_store
-config = load_app_config()
-
def read_task_from_file(file_path: str) -> str:
"""Read task from the specified file."""
@@ -33,79 +36,99 @@ def read_task_from_stdin() -> str:
return sys.stdin.read()
-async def run_agent_controller(
- agent: Agent,
+async def create_runtime(
+ config: AppConfig,
+ sid: str | None = None,
+ runtime_tools_config: dict | None = None,
+) -> Runtime:
+ """Create a runtime for the agent to run on.
+
+ config: The app config.
+ sid: The session id.
+ runtime_tools_config: (will be deprecated) The runtime tools config.
+ """
+ # set up the event stream
+ file_store = get_file_store(config.file_store, config.file_store_path)
+ session_id = 'main' + ('_' + sid if sid else str(uuid.uuid4()))
+ event_stream = EventStream(session_id, file_store)
+
+ # agent class
+ agent_cls = agenthub.Agent.get_cls(config.default_agent)
+
+ # runtime and tools
+ runtime_cls = get_runtime_cls(config.runtime)
+ logger.info(f'Initializing runtime: {runtime_cls}')
+ runtime: Runtime = runtime_cls(
+ config=config,
+ event_stream=event_stream,
+ sid=session_id,
+ plugins=agent_cls.sandbox_plugins,
+ )
+ await runtime.ainit()
+
+ return runtime
+
+
+async def run_controller(
+ config: AppConfig,
task_str: str,
- max_iterations: int,
- max_budget_per_task: float | None = None,
+ runtime: Runtime | None = None,
+ agent: Agent | None = None,
exit_on_message: bool = False,
fake_user_response_fn: Callable[[State | None], str] | None = None,
- sandbox: Sandbox | None = None,
- runtime_tools_config: dict | None = None,
- sid: str | None = None,
headless_mode: bool = True,
) -> State | None:
"""Main coroutine to run the agent controller with task input flexibility.
It's only used when you launch opendevin backend directly via cmdline.
Args:
- task_str: The task to run.
+ config: The app config.
+ task_str: The task to run, as a string.
+ runtime: (optional) A runtime for the agent to run on.
+ agent: (optional) An agent to run.
exit_on_message: quit if agent asks for a message from user (optional)
fake_user_response_fn: An optional function that receives the current state (could be None) and returns a fake user response.
- sandbox: An optional sandbox to run the agent in.
headless_mode: Whether the agent is run in headless mode.
"""
- # Logging
- logger.info(
- f'Running agent {agent.name}, model {agent.llm.config.model}, with task: "{task_str}"'
- )
+ # Create the agent
+ if agent is None:
+ agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
+ agent = agent_cls(
+ llm=LLM(config=config.get_llm_config_from_agent(config.default_agent))
+ )
- # set up the event stream
- file_store = get_file_store(config.file_store, config.file_store_path)
- cli_session = 'main' + ('_' + sid if sid else '')
- event_stream = EventStream(cli_session, file_store)
+ if runtime is None:
+ runtime = await create_runtime(config)
+ event_stream = runtime.event_stream
# restore cli session if enabled
initial_state = None
if config.enable_cli_session:
try:
logger.info('Restoring agent state from cli session')
- initial_state = State.restore_from_session(cli_session, file_store)
+ initial_state = State.restore_from_session(
+ event_stream.sid, event_stream.file_store
+ )
except Exception as e:
- print('Error restoring state', e)
+ logger.info('Error restoring state', e)
# init controller with this initial state
controller = AgentController(
agent=agent,
- max_iterations=max_iterations,
- max_budget_per_task=max_budget_per_task,
+ max_iterations=config.max_iterations,
+ max_budget_per_task=config.max_budget_per_task,
agent_to_llm_config=config.get_agent_to_llm_config_map(),
event_stream=event_stream,
initial_state=initial_state,
headless_mode=headless_mode,
)
- # runtime and tools
- runtime_cls = get_runtime_cls(config.runtime)
- runtime = runtime_cls(config=config, event_stream=event_stream, sandbox=sandbox)
- await runtime.ainit()
- runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
- runtime.init_runtime_tools(
- controller.agent.runtime_tools,
- is_async=False,
- runtime_tools_config=runtime_tools_config,
+ assert isinstance(task_str, str), f'task_str must be a string, got {type(task_str)}'
+ # Logging
+ logger.info(
+ f'Agent Controller Initialized: Running agent {agent.name}, model {agent.llm.config.model}, with task: "{task_str}"'
)
- # browser eval specific
- # TODO: move to a better place
- if runtime.browser and runtime.browser.eval_dir:
- logger.info(f'Evaluation directory: {runtime.browser.eval_dir}')
- with open(
- os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
- ) as f:
- task_str = f.read()
- logger.info(f'Dynamic Eval task: {task_str}')
-
# start event is a MessageAction with the task, either resumed or new
if config.enable_cli_session and initial_state is not None:
# we're resuming the previous session
@@ -144,12 +167,13 @@ async def on_event(event: Event):
# save session when we're about to close
if config.enable_cli_session:
end_state = controller.get_state()
- end_state.save_to_session(cli_session, file_store)
+ end_state.save_to_session(event_stream.sid, event_stream.file_store)
# close when done
await controller.close()
- await runtime.close()
- return controller.get_state()
+ state = controller.get_state()
+
+ return state
if __name__ == '__main__':
@@ -165,23 +189,30 @@ async def on_event(event: Event):
else:
raise ValueError('No task provided. Please specify a task through -t, -f.')
+ # Load the app config
+ # this will load config from config.toml in the current directory
+ # as well as from the environment variables
+ config = load_app_config()
+
# Override default LLM configs ([llm] section in config.toml)
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
config.set_llm_config(llm_config)
- llm = LLM(config=config.get_llm_config_from_agent(args.agent_cls))
- # Create the agent
- AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
- agent = AgentCls(llm=llm)
+ # Set default agent
+ config.default_agent = args.agent_cls
+
+ # if max budget per task is not sent on the command line, use the config value
+ if args.max_budget_per_task is not None:
+ config.max_budget_per_task = args.max_budget_per_task
+ if args.max_iterations is not None:
+ config.max_iterations = args.max_iterations
asyncio.run(
- run_agent_controller(
- agent=agent,
+ run_controller(
+ config=config,
task_str=task_str,
- max_iterations=args.max_iterations,
- max_budget_per_task=args.max_budget_per_task,
)
)
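For reference, a hedged usage sketch of the new entry points (`create_runtime` / `run_controller`) defined above, mirroring the `__main__` path: the config comes from `config.toml` plus environment variables, and the task is passed as a plain string. The task text and session id are placeholders.

```python
import asyncio

from opendevin.core.config import load_app_config
from opendevin.core.main import create_runtime, run_controller


async def main() -> None:
    config = load_app_config()  # reads config.toml plus environment variables
    runtime = await create_runtime(config, sid='demo')  # optional; run_controller can create one
    state = await run_controller(
        config=config,
        task_str='Add a unit test for parse_summary_response and make it pass.',
        runtime=runtime,
        exit_on_message=True,  # quit if the agent asks the user a question
    )
    print(state.agent_state if state else 'no final state')


asyncio.run(main())
```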
diff --git a/opendevin/core/message.py b/opendevin/core/message.py
new file mode 100644
index 000000000000..6c8a81653a43
--- /dev/null
+++ b/opendevin/core/message.py
@@ -0,0 +1,69 @@
+from enum import Enum
+
+from pydantic import BaseModel, Field, model_serializer
+from typing_extensions import Literal
+
+from opendevin.core.config import load_app_config
+
+config = load_app_config()
+
+
+class ContentType(Enum):
+ TEXT = 'text'
+ IMAGE_URL = 'image_url'
+
+
+class Content(BaseModel):
+ type: ContentType
+
+ @model_serializer
+ def serialize_model(self):
+ raise NotImplementedError('Subclasses should implement this method.')
+
+
+class TextContent(Content):
+ text: str
+ type: ContentType = ContentType.TEXT
+
+ @model_serializer
+ def serialize_model(self):
+ return {'type': self.type.value, 'text': self.text}
+
+
+class ImageContent(Content):
+ image_urls: list[str]
+ type: ContentType = ContentType.IMAGE_URL
+
+ @model_serializer
+ def serialize_model(self):
+ images: list[dict[str, str | dict[str, str]]] = []
+ for url in self.image_urls:
+ images.append({'type': self.type.value, 'image_url': {'url': url}})
+ return images
+
+
+class Message(BaseModel):
+ role: Literal['user', 'system', 'assistant']
+ content: list[TextContent | ImageContent] = Field(default=list)
+ condensable: bool = True
+ event_id: int = -1
+
+ @property
+ def contains_image(self) -> bool:
+ return any(isinstance(content, ImageContent) for content in self.content)
+
+ @model_serializer
+ def serialize_model(self) -> dict:
+ content: list[dict[str, str | dict[str, str]]] = []
+ # check model provider is groq
+ if 'groq/' in config.get_llm_config().model:
+ if self.role in ['system', 'assistant']:
+ return {'role': self.role, 'content': self.content[0].text}
+
+ for item in self.content:
+ if isinstance(item, TextContent):
+ content.append(item.model_dump())
+ elif isinstance(item, ImageContent):
+ content.extend(item.model_dump())
+
+ return {'role': self.role, 'content': content}
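A short, hedged example of the `Message` model above: a user turn carrying both text and an image URL serializes through the custom `model_serializer` into the multi-part content list. The URL is a placeholder, and the printed shape is approximate.

```python
from opendevin.core.message import ImageContent, Message, TextContent

msg = Message(
    role='user',
    content=[
        TextContent(text='What does this screenshot show?'),
        ImageContent(image_urls=['https://example.com/screenshot.png']),  # placeholder URL
    ],
)
print(msg.contains_image)  # True
print(msg.model_dump())
# roughly: {'role': 'user', 'content': [{'type': 'text', 'text': ...},
#                                       {'type': 'image_url', 'image_url': {'url': ...}}]}
```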
diff --git a/opendevin/core/schema/config.py b/opendevin/core/schema/config.py
index b10ebe7ad069..f7c4f25b55d3 100644
--- a/opendevin/core/schema/config.py
+++ b/opendevin/core/schema/config.py
@@ -36,11 +36,9 @@ class ConfigType(str, Enum):
MAX_ITERATIONS = 'MAX_ITERATIONS'
AGENT = 'AGENT'
E2B_API_KEY = 'E2B_API_KEY'
- SANDBOX_BOX_TYPE = 'SANDBOX_BOX_TYPE'
SANDBOX_USER_ID = 'SANDBOX_USER_ID'
SANDBOX_TIMEOUT = 'SANDBOX_TIMEOUT'
USE_HOST_NETWORK = 'USE_HOST_NETWORK'
- SSH_HOSTNAME = 'SSH_HOSTNAME'
DISABLE_COLOR = 'DISABLE_COLOR'
DEBUG = 'DEBUG'
FILE_UPLOADS_MAX_FILE_SIZE_MB = 'FILE_UPLOADS_MAX_FILE_SIZE_MB'
diff --git a/opendevin/events/action/agent.py b/opendevin/events/action/agent.py
index c9d4ec6a3edd..fd5462bb7a88 100644
--- a/opendevin/events/action/agent.py
+++ b/opendevin/events/action/agent.py
@@ -20,16 +20,27 @@ def message(self) -> str:
@dataclass
class AgentSummarizeAction(Action):
- summary: str
+ """
+ Action to summarize a list of events.
+
+ Attributes:
+ - summarized_actions: A sentence summarizing all the actions.
+ - summarized_observations: A few sentences summarizing all the observations.
+ """
+
+ summarized_actions: str = ''
+ summarized_observations: str = ''
action: str = ActionType.SUMMARIZE
+ last_summarized_event_id = -1
@property
def message(self) -> str:
- return self.summary
+ return self.summarized_observations
def __str__(self) -> str:
ret = '**AgentSummarizeAction**\n'
- ret += f'SUMMARY: {self.summary}'
+ ret += f'SUMMARIZED ACTIONS: {self.summarized_actions}\n'
+ ret += f'SUMMARIZED OBSERVATIONS: {self.summarized_observations}\n'
return ret
diff --git a/opendevin/events/action/commands.py b/opendevin/events/action/commands.py
index 0edccf82982e..e1381c7f0944 100644
--- a/opendevin/events/action/commands.py
+++ b/opendevin/events/action/commands.py
@@ -10,6 +10,14 @@
class CmdRunAction(Action):
command: str
thought: str = ''
+ keep_prompt: bool = True
+ # if True, the command prompt will be kept in the command output observation
+ # Example of command output:
+ # root@sandbox:~# ls
+ # file1.txt
+ # file2.txt
+ # root@sandbox:~# <-- this is the command prompt
+
action: str = ActionType.RUN
runnable: ClassVar[bool] = True
is_confirmed: ActionConfirmationStatus = ActionConfirmationStatus.CONFIRMED
@@ -33,7 +41,6 @@ class IPythonRunCellAction(Action):
action: str = ActionType.RUN_IPYTHON
runnable: ClassVar[bool] = True
is_confirmed: ActionConfirmationStatus = ActionConfirmationStatus.CONFIRMED
- kernel_init_code: str = '' # code to run in the kernel (if the kernel is restarted)
def __str__(self) -> str:
ret = '**IPythonRunCellAction**\n'
diff --git a/opendevin/events/action/message.py b/opendevin/events/action/message.py
index 724821c24337..b235dd8687b6 100644
--- a/opendevin/events/action/message.py
+++ b/opendevin/events/action/message.py
@@ -8,6 +8,7 @@
@dataclass
class MessageAction(Action):
content: str
+ images_urls: list | None = None
wait_for_response: bool = False
action: str = ActionType.MESSAGE
@@ -18,4 +19,7 @@ def message(self) -> str:
def __str__(self) -> str:
ret = f'**MessageAction** (source={self.source})\n'
ret += f'CONTENT: {self.content}'
+ if self.images_urls:
+ for url in self.images_urls:
+ ret += f'\nIMAGE_URL: {url}'
return ret
diff --git a/opendevin/events/event.py b/opendevin/events/event.py
index 7cf6d4accd3a..d4a7f915e02f 100644
--- a/opendevin/events/event.py
+++ b/opendevin/events/event.py
@@ -39,3 +39,13 @@ def cause(self) -> int | None:
if hasattr(self, '_cause'):
return self._cause # type: ignore[attr-defined]
return None
+
+ @property
+ def timeout(self) -> int | None:
+ if hasattr(self, '_timeout'):
+ return self._timeout # type: ignore[attr-defined]
+ return None
+
+ @timeout.setter
+ def timeout(self, value: int | None) -> None:
+ self._timeout = value
diff --git a/opendevin/events/observation/__init__.py b/opendevin/events/observation/__init__.py
index a6c7405ec3c0..fb498fcc3825 100644
--- a/opendevin/events/observation/__init__.py
+++ b/opendevin/events/observation/__init__.py
@@ -6,7 +6,7 @@
from .error import ErrorObservation
from .files import FileReadObservation, FileWriteObservation
from .observation import Observation
-from .reject import RejectObservation
+from .reject import UserRejectObservation
from .success import SuccessObservation
__all__ = [
@@ -21,5 +21,5 @@
'AgentStateChangedObservation',
'AgentDelegateObservation',
'SuccessObservation',
- 'RejectObservation',
+ 'UserRejectObservation',
]
diff --git a/opendevin/events/observation/browse.py b/opendevin/events/observation/browse.py
index eaf44ac101ed..3ac00c8539b7 100644
--- a/opendevin/events/observation/browse.py
+++ b/opendevin/events/observation/browse.py
@@ -11,17 +11,12 @@ class BrowserOutputObservation(Observation):
url: str
screenshot: str = field(repr=False) # don't show in repr
- status_code: int = 200
error: bool = False
observation: str = ObservationType.BROWSE
# do not include in the memory
open_pages_urls: list = field(default_factory=list)
active_page_index: int = -1
- dom_object: dict = field(default_factory=dict, repr=False) # don't show in repr
- axtree_object: dict = field(default_factory=dict, repr=False) # don't show in repr
- extra_element_properties: dict = field(
- default_factory=dict, repr=False
- ) # don't show in repr
+ axtree_txt: str = ''
last_browser_action: str = ''
last_browser_action_error: str = ''
focused_element_bid: str = ''
@@ -34,7 +29,6 @@ def __str__(self) -> str:
return (
'**BrowserOutputObservation**\n'
f'URL: {self.url}\n'
- f'Status code: {self.status_code}\n'
f'Error: {self.error}\n'
f'Open pages: {self.open_pages_urls}\n'
f'Active page index: {self.active_page_index}\n'
diff --git a/opendevin/events/observation/reject.py b/opendevin/events/observation/reject.py
index d337889f7de6..2bb34ba48fa3 100644
--- a/opendevin/events/observation/reject.py
+++ b/opendevin/events/observation/reject.py
@@ -6,7 +6,7 @@
@dataclass
-class RejectObservation(Observation):
+class UserRejectObservation(Observation):
"""This data class represents the result of a successful action."""
observation: str = ObservationType.USER_REJECTED
diff --git a/opendevin/events/serialization/action.py b/opendevin/events/serialization/action.py
index f8051842eeb2..3f7a8265af3c 100644
--- a/opendevin/events/serialization/action.py
+++ b/opendevin/events/serialization/action.py
@@ -4,6 +4,7 @@
AgentDelegateAction,
AgentFinishAction,
AgentRejectAction,
+ AgentSummarizeAction,
ChangeAgentStateAction,
)
from opendevin.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -31,6 +32,7 @@
ModifyTaskAction,
ChangeAgentStateAction,
MessageAction,
+ AgentSummarizeAction,
)
ACTION_TYPE_TO_CLASS = {action_class.action: action_class for action_class in actions} # type: ignore[attr-defined]
@@ -54,6 +56,10 @@ def action_from_dict(action: dict) -> Action:
args = action.get('args', {})
try:
decoded_action = action_class(**args)
- except TypeError:
- raise LLMMalformedActionError(f'action={action} has the wrong arguments')
+ if 'timeout' in action:
+ decoded_action.timeout = action['timeout']
+ except TypeError as e:
+ raise LLMMalformedActionError(
+ f'Error creating {action_class} from {action=}: {e}'
+ )
return decoded_action
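The deserialization change above keeps `timeout` out of the constructor arguments: it sits next to `action` and `args` in the wire format and is applied to the instance after construction, while any `TypeError` is re-raised with the offending payload in the message. A hedged sketch of that flow with hypothetical stand-in classes (the real registry is built from the imported action classes):

```python
from dataclasses import dataclass


class DemoMalformedActionError(Exception):
    """Hypothetical stand-in for LLMMalformedActionError."""


@dataclass
class DemoRunAction:
    command: str = ''
    action: str = 'run'
    timeout: int | None = None


DEMO_ACTION_TYPE_TO_CLASS = {'run': DemoRunAction}


def demo_action_from_dict(action: dict):
    action_class = DEMO_ACTION_TYPE_TO_CLASS[action['action']]
    args = action.get('args', {})
    try:
        decoded_action = action_class(**args)
        # 'timeout' travels next to 'action'/'args' in the wire format,
        # so it is applied after construction rather than passed as an arg
        if 'timeout' in action:
            decoded_action.timeout = action['timeout']
    except TypeError as e:
        raise DemoMalformedActionError(
            f'Error creating {action_class} from {action=}: {e}'
        )
    return decoded_action


print(demo_action_from_dict({'action': 'run', 'args': {'command': 'ls'}, 'timeout': 30}))
```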
diff --git a/opendevin/events/serialization/event.py b/opendevin/events/serialization/event.py
index d50c1f06eb4c..aaa9f4b23572 100644
--- a/opendevin/events/serialization/event.py
+++ b/opendevin/events/serialization/event.py
@@ -14,14 +14,12 @@
DELETE_FROM_MEMORY_EXTRAS = {
'screenshot',
- 'dom_object',
- 'axtree_object',
+ 'axtree_txt',
'open_pages_urls',
'active_page_index',
'last_browser_action',
'last_browser_action_error',
'focused_element_bid',
- 'extra_element_properties',
}
@@ -61,6 +59,8 @@ def event_to_dict(event: 'Event') -> dict:
props.pop(key, None)
if 'action' in d:
d['args'] = props
+ if event.timeout is not None:
+ d['timeout'] = event.timeout
elif 'observation' in d:
d['content'] = props.pop('content', '')
d['extras'] = props
@@ -75,6 +75,7 @@ def event_to_memory(event: 'Event', max_message_chars: int) -> dict:
d.pop('cause', None)
d.pop('timestamp', None)
d.pop('message', None)
+ d.pop('images_urls', None)
if 'extras' in d:
remove_fields(d['extras'], DELETE_FROM_MEMORY_EXTRAS)
if isinstance(event, Observation) and 'content' in d:
diff --git a/opendevin/events/serialization/observation.py b/opendevin/events/serialization/observation.py
index 1ff31f37fdc8..37183e9a21e3 100644
--- a/opendevin/events/serialization/observation.py
+++ b/opendevin/events/serialization/observation.py
@@ -9,7 +9,7 @@
from opendevin.events.observation.error import ErrorObservation
from opendevin.events.observation.files import FileReadObservation, FileWriteObservation
from opendevin.events.observation.observation import Observation
-from opendevin.events.observation.reject import RejectObservation
+from opendevin.events.observation.reject import UserRejectObservation
from opendevin.events.observation.success import SuccessObservation
observations = (
@@ -23,7 +23,7 @@
SuccessObservation,
ErrorObservation,
AgentStateChangedObservation,
- RejectObservation,
+ UserRejectObservation,
)
OBSERVATION_TYPE_TO_CLASS = {
diff --git a/opendevin/events/stream.py b/opendevin/events/stream.py
index 5a6f90998b98..054ca40af288 100644
--- a/opendevin/events/stream.py
+++ b/opendevin/events/stream.py
@@ -22,16 +22,16 @@ class EventStreamSubscriber(str, Enum):
class EventStream:
sid: str
+ file_store: FileStore
# For each subscriber ID, there is a stack of callback functions - useful
# when there are agent delegates
_subscribers: dict[str, list[Callable]]
_cur_id: int
_lock: threading.Lock
- _file_store: FileStore
def __init__(self, sid: str, file_store: FileStore):
self.sid = sid
- self._file_store = file_store
+ self.file_store = file_store
self._subscribers = {}
self._cur_id = 0
self._lock = threading.Lock()
@@ -39,7 +39,7 @@ def __init__(self, sid: str, file_store: FileStore):
def _reinitialize_from_file_store(self) -> None:
try:
- events = self._file_store.list(f'sessions/{self.sid}/events')
+ events = self.file_store.list(f'sessions/{self.sid}/events')
except FileNotFoundError:
logger.debug(f'No events found for session {self.sid}')
self._cur_id = 0
@@ -100,7 +100,7 @@ def get_events(
def get_event(self, id: int) -> Event:
filename = self._get_filename_for_id(id)
- content = self._file_store.read(filename)
+ content = self.file_store.read(filename)
data = json.loads(content)
return event_from_dict(data)
@@ -136,9 +136,7 @@ def add_event(self, event: Event, source: EventSource):
event._source = source # type: ignore [attr-defined]
data = event_to_dict(event)
if event.id is not None:
- self._file_store.write(
- self._get_filename_for_id(event.id), json.dumps(data)
- )
+ self.file_store.write(self._get_filename_for_id(event.id), json.dumps(data))
for stack in self._subscribers.values():
callback = stack[-1]
asyncio.create_task(callback(event))
@@ -149,7 +147,7 @@ def filtered_events_by_source(self, source: EventSource):
yield event
def clear(self):
- self._file_store.delete(f'sessions/{self.sid}')
+ self.file_store.delete(f'sessions/{self.sid}')
self._cur_id = 0
# self._subscribers = {}
self._reinitialize_from_file_store()
diff --git a/opendevin/llm/llm.py b/opendevin/llm/llm.py
index 5d4e8b55ab0d..d54244538b67 100644
--- a/opendevin/llm/llm.py
+++ b/opendevin/llm/llm.py
@@ -1,8 +1,11 @@
+import asyncio
import copy
import warnings
from functools import partial
+from typing import Optional
from opendevin.core.config import LLMConfig
+from opendevin.core.message import Message
with warnings.catch_warnings():
warnings.simplefilter('ignore')
@@ -13,6 +16,7 @@
APIConnectionError,
ContentPolicyViolationError,
InternalServerError,
+ OpenAIError,
RateLimitError,
ServiceUnavailableError,
)
@@ -24,16 +28,20 @@
wait_random_exponential,
)
+from opendevin.condenser.condenser import CondenserMixin
+from opendevin.core.exceptions import (
+ ContextWindowLimitExceededError,
+ TokenLimitExceededError,
+ UserCancelledError,
+)
from opendevin.core.logger import llm_prompt_logger, llm_response_logger
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.metrics import Metrics
-__all__ = ['LLM']
-
message_separator = '\n\n----------\n\n'
-class LLM:
+class LLM(CondenserMixin):
"""The LLM class represents a Language Model instance.
Attributes:
@@ -56,6 +64,9 @@ def __init__(
self.metrics = metrics if metrics is not None else Metrics()
self.cost_metric_supported = True
+ # Set up config attributes with default values to prevent AttributeError
+ LLMConfig.set_missing_attributes(self.config)
+
# litellm actually uses base Exception here for unknown model
self.model_info = None
try:
@@ -80,7 +91,6 @@ def __init__(
else:
# Max input tokens for gpt3.5, so this is a safe fallback for any potentially viable model
self.config.max_input_tokens = 4096
-
if config.max_output_tokens is None:
if (
self.model_info is not None
@@ -95,6 +105,12 @@ def __init__(
if self.config.drop_params:
litellm.drop_params = self.config.drop_params
+ if self.config.model.startswith('ollama'):
+ max_input_tokens = self.config.max_input_tokens
+ max_output_tokens = self.config.max_output_tokens
+ if max_input_tokens and max_output_tokens:
+ litellm.OllamaConfig.num_ctx = max_input_tokens + max_output_tokens
+
self._completion = partial(
litellm_completion,
model=self.config.model,
@@ -108,7 +124,7 @@ def __init__(
top_p=self.config.top_p,
)
- completion_unwrapped = self._completion
+ self.completion_unwrapped = self._completion
def attempt_on_error(retry_state):
logger.error(
@@ -144,14 +160,56 @@ def wrapper(*args, **kwargs):
else:
messages = args[1]
+ try:
+ if self.is_over_token_limit(messages):
+ raise TokenLimitExceededError()
+ except TokenLimitExceededError:
+ # If we are over the token limit, run the condenser to produce a summary action
+ if kwargs['condense'] and self.is_over_token_limit(messages):
+ # A separate call to run a summarizer
+ summary_action = self.condense(messages=messages)
+ return summary_action
+ else:
+ print('step() failed with an unrecognized exception:')
+ raise ContextWindowLimitExceededError()
+
+ kwargs.pop('condense', None)
+ if isinstance(messages[0], Message):
+ messages = [message.model_dump() for message in messages]
+ kwargs['messages'] = messages
+
# log the prompt
debug_message = ''
for message in messages:
- debug_message += message_separator + message['content']
+ content = message['content']
+
+ if isinstance(content, list):
+ for element in content:
+ if isinstance(element, dict):
+ if 'text' in element:
+ content_str = element['text'].strip()
+ elif (
+ 'image_url' in element and 'url' in element['image_url']
+ ):
+ content_str = element['image_url']['url']
+ else:
+ content_str = str(element)
+ else:
+ content_str = str(element)
+
+ debug_message += message_separator + content_str
+ else:
+ content_str = str(content)
+
+ debug_message += message_separator + content_str
+
llm_prompt_logger.debug(debug_message)
- # call the completion function
- resp = completion_unwrapped(*args, **kwargs)
+ # skip if messages is empty (thus debug_message is empty)
+ if debug_message:
+ resp = self.completion_unwrapped(*args, **kwargs)
+ else:
+ resp = {'choices': [{'message': {'content': ''}}]}
# log the response
message_back = resp['choices'][0]['message']['content']
@@ -163,6 +221,207 @@ def wrapper(*args, **kwargs):
self._completion = wrapper # type: ignore
+ # Async version
+ self._async_completion = partial(
+ self._call_acompletion,
+ model=self.config.model,
+ api_key=self.config.api_key,
+ base_url=self.config.base_url,
+ api_version=self.config.api_version,
+ custom_llm_provider=self.config.custom_llm_provider,
+ max_tokens=self.config.max_output_tokens,
+ timeout=self.config.timeout,
+ temperature=self.config.temperature,
+ top_p=self.config.top_p,
+ drop_params=True,
+ )
+
+ async_completion_unwrapped = self._async_completion
+
+ @retry(
+ reraise=True,
+ stop=stop_after_attempt(self.config.num_retries),
+ wait=wait_random_exponential(
+ multiplier=self.config.retry_multiplier,
+ min=self.config.retry_min_wait,
+ max=self.config.retry_max_wait,
+ ),
+ retry=retry_if_exception_type(
+ (
+ RateLimitError,
+ APIConnectionError,
+ ServiceUnavailableError,
+ InternalServerError,
+ ContentPolicyViolationError,
+ )
+ ),
+ after=attempt_on_error,
+ )
+ async def async_completion_wrapper(*args, **kwargs):
+ """Async wrapper for the litellm acompletion function."""
+ # some callers might just send the messages directly
+ if 'messages' in kwargs:
+ messages = kwargs['messages']
+ else:
+ messages = args[1]
+
+ # log the prompt
+ debug_message = ''
+ for message in messages:
+ content = message['content']
+
+ if isinstance(content, list):
+ for element in content:
+ if isinstance(element, dict):
+ if 'text' in element:
+ content_str = element['text']
+ elif (
+ 'image_url' in element and 'url' in element['image_url']
+ ):
+ content_str = element['image_url']['url']
+ else:
+ content_str = str(element)
+ else:
+ content_str = str(element)
+
+ debug_message += message_separator + content_str
+ else:
+ content_str = str(content)
+
+ debug_message += message_separator + content_str
+
+ llm_prompt_logger.debug(debug_message)
+
+ async def check_stopped():
+ while True:
+ if (
+ hasattr(self.config, 'on_cancel_requested_fn')
+ and self.config.on_cancel_requested_fn is not None
+ and await self.config.on_cancel_requested_fn()
+ ):
+ raise UserCancelledError('LLM request cancelled by user')
+ await asyncio.sleep(0.1)
+
+ stop_check_task = asyncio.create_task(check_stopped())
+
+ try:
+ # Directly call and await litellm_acompletion
+ resp = await async_completion_unwrapped(*args, **kwargs)
+
+ # skip if messages is empty (thus debug_message is empty)
+ if debug_message:
+ message_back = resp['choices'][0]['message']['content']
+ llm_response_logger.debug(message_back)
+ else:
+ resp = {'choices': [{'message': {'content': ''}}]}
+ self._post_completion(resp)
+
+ # We do not support streaming in this method, thus return resp
+ return resp
+
+ except UserCancelledError:
+ logger.info('LLM request cancelled by user.')
+ raise
+ except OpenAIError as e:
+ logger.error(f'OpenAIError occurred:\n{e}')
+ raise
+ except (
+ RateLimitError,
+ APIConnectionError,
+ ServiceUnavailableError,
+ InternalServerError,
+ ) as e:
+ logger.error(f'Completion Error occurred:\n{e}')
+ raise
+
+ finally:
+ await asyncio.sleep(0.1)
+ stop_check_task.cancel()
+ try:
+ await stop_check_task
+ except asyncio.CancelledError:
+ pass
+
+ @retry(
+ reraise=True,
+ stop=stop_after_attempt(self.config.num_retries),
+ wait=wait_random_exponential(
+ multiplier=self.config.retry_multiplier,
+ min=self.config.retry_min_wait,
+ max=self.config.retry_max_wait,
+ ),
+ retry=retry_if_exception_type(
+ (
+ RateLimitError,
+ APIConnectionError,
+ ServiceUnavailableError,
+ InternalServerError,
+ ContentPolicyViolationError,
+ )
+ ),
+ after=attempt_on_error,
+ )
+ async def async_acompletion_stream_wrapper(*args, **kwargs):
+ """Async wrapper for the litellm acompletion with streaming function."""
+ # some callers might just send the messages directly
+ if 'messages' in kwargs:
+ messages = kwargs['messages']
+ else:
+ messages = args[1]
+
+ # log the prompt
+ debug_message = ''
+ for message in messages:
+ debug_message += message_separator + message['content']
+ llm_prompt_logger.debug(debug_message)
+
+ try:
+ # Directly call and await litellm_acompletion
+ resp = await async_completion_unwrapped(*args, **kwargs)
+
+ # For streaming we iterate over the chunks
+ async for chunk in resp:
+ # Check for cancellation before yielding the chunk
+ if (
+ hasattr(self.config, 'on_cancel_requested_fn')
+ and self.config.on_cancel_requested_fn is not None
+ and await self.config.on_cancel_requested_fn()
+ ):
+ raise UserCancelledError(
+ 'LLM request cancelled due to CANCELLED state'
+ )
+ # with streaming, it is "delta", not "message"!
+ message_back = chunk['choices'][0]['delta']['content']
+ llm_response_logger.debug(message_back)
+ self._post_completion(chunk)
+
+ yield chunk
+
+ except UserCancelledError:
+ logger.info('LLM request cancelled by user.')
+ raise
+ except OpenAIError as e:
+ logger.error(f'OpenAIError occurred:\n{e}')
+ raise
+ except (
+ RateLimitError,
+ APIConnectionError,
+ ServiceUnavailableError,
+ InternalServerError,
+ ) as e:
+ logger.error(f'Completion Error occurred:\n{e}')
+ raise
+
+ finally:
+ if kwargs.get('stream', False):
+ await asyncio.sleep(0.1)
+
+ self._async_completion = async_completion_wrapper # type: ignore
+ self._async_streaming_completion = async_acompletion_stream_wrapper # type: ignore
+
+ async def _call_acompletion(self, *args, **kwargs):
+ return await litellm.acompletion(*args, **kwargs)
+
@property
def completion(self):
"""Decorator for the litellm completion function.
@@ -171,6 +430,25 @@ def completion(self):
"""
return self._completion
+ @property
+ def async_completion(self):
+ """Decorator for the async litellm acompletion function.
+
+ Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
+ """
+ return self._async_completion
+
+ @property
+ def async_streaming_completion(self):
+ """Decorator for the async litellm acompletion function with streaming.
+
+ Check the complete documentation at https://litellm.vercel.app/docs/providers/ollama#example-usage---streaming--acompletion
+ """
+ return self._async_streaming_completion
+
+ def supports_vision(self):
+ return litellm.supports_vision(self.config.model)
+
def _post_completion(self, response: str) -> None:
"""Post-process the completion response."""
try:
@@ -184,7 +462,9 @@ def _post_completion(self, response: str) -> None:
self.metrics.accumulated_cost,
)
- def get_token_count(self, messages):
+ def get_token_count(
+ self, messages: Optional[list[Message]] = None, text: Optional[str] = None
+ ) -> int:
"""Get the number of tokens in a list of messages.
Args:
@@ -193,7 +473,11 @@ def get_token_count(self, messages):
Returns:
int: The number of tokens.
"""
- return litellm.token_counter(model=self.config.model, messages=messages)
+ if messages and isinstance(messages[0], Message):
+ messages = [m.model_dump() for m in messages]
+ return litellm.token_counter(
+ model=self.config.model, messages=messages, text=text
+ )
def is_local(self):
"""Determines if the system is using a locally running LLM.
@@ -259,3 +543,28 @@ def __repr__(self):
def reset(self):
self.metrics = Metrics()
+
+ def is_over_token_limit(self, messages: list[Message]) -> bool:
+ """
+ Estimates the token count of the given messages using the litellm tokenizer and returns True if it reaches the max_input_tokens value.
+
+ Parameters:
+ - messages: List of messages to estimate the token count for.
+
+ Returns:
+ - True if the padded token estimate reaches max_input_tokens, False otherwise.
+ """
+ # max_input_tokens will always be set in init to some sensible default
+ # 0 in config.llm disables the check
+ MAX_TOKEN_COUNT_PADDING = 512
+ if not self.config.max_input_tokens:
+ return False
+ token_count = self.get_token_count(messages=messages) + MAX_TOKEN_COUNT_PADDING
+ logger.debug(f'Token count: {token_count}')
+ return token_count >= self.config.max_input_tokens
+
+ def get_text_messages(self, messages: list[Message]) -> list[dict]:
+ text_messages = []
+ for message in messages:
+ text_messages.append(message.model_dump())
+ return text_messages
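As a rough illustration of the condensation trigger: `is_over_token_limit` pads the counted tokens and compares against `max_input_tokens`, with a falsy limit disabling the check. The sketch below is self-contained and uses a naive characters-per-token estimate in place of `litellm.token_counter`, which is only an assumption for the demo:

```python
MAX_TOKEN_COUNT_PADDING = 512  # same padding constant as above


def approx_token_count(messages: list[dict]) -> int:
    # crude ~4 characters per token heuristic, purely for the demo
    return sum(len(str(m.get('content', ''))) // 4 for m in messages)


def is_over_token_limit(messages: list[dict], max_input_tokens: int | None) -> bool:
    if not max_input_tokens:  # None or 0 disables the check
        return False
    token_count = approx_token_count(messages) + MAX_TOKEN_COUNT_PADDING
    return token_count >= max_input_tokens


messages = [{'role': 'user', 'content': 'summarize this repository ' * 1000}]
print(is_over_token_limit(messages, 4096))   # True: padded estimate exceeds the limit
print(is_over_token_limit(messages, 0))      # False: a limit of 0 disables the check
```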
diff --git a/opendevin/memory/__init__.py b/opendevin/memory/__init__.py
index 9a705a24ac77..4c1a84f59f0c 100644
--- a/opendevin/memory/__init__.py
+++ b/opendevin/memory/__init__.py
@@ -1,5 +1,4 @@
-from .condenser import MemoryCondenser
from .history import ShortTermHistory
from .memory import LongTermMemory
-__all__ = ['LongTermMemory', 'ShortTermHistory', 'MemoryCondenser']
+__all__ = ['LongTermMemory', 'ShortTermHistory']
diff --git a/opendevin/memory/condenser.py b/opendevin/memory/condenser.py
deleted file mode 100644
index e0cef4ac3815..000000000000
--- a/opendevin/memory/condenser.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.llm.llm import LLM
-
-
-class MemoryCondenser:
- def condense(self, summarize_prompt: str, llm: LLM):
- """Attempts to condense the memory by using the llm
-
- Parameters:
- - llm (LLM): llm to be used for summarization
-
- Raises:
- - Exception: the same exception as it got from the llm or processing the response
- """
- try:
- messages = [{'content': summarize_prompt, 'role': 'user'}]
- resp = llm.completion(messages=messages)
- summary_response = resp['choices'][0]['message']['content']
- return summary_response
- except Exception as e:
- logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
-
- # TODO If the llm fails with ContextWindowExceededError, we can try to condense the memory chunk by chunk
- raise
diff --git a/opendevin/memory/history.py b/opendevin/memory/history.py
index cbe186f387b6..beb0fb38bcf5 100644
--- a/opendevin/memory/history.py
+++ b/opendevin/memory/history.py
@@ -4,6 +4,7 @@
from opendevin.events.action.action import Action
from opendevin.events.action.agent import (
AgentDelegateAction,
+ AgentSummarizeAction,
ChangeAgentStateAction,
)
from opendevin.events.action.empty import NullAction
@@ -40,6 +41,12 @@ def __init__(self):
self.start_id = -1
self.end_id = -1
self.delegates = {}
+ self.summary = None
+ self.last_summarized_event_id = -1
+
+ def add_summary(self, summary_action: AgentSummarizeAction):
+ self.summary = summary_action
+ self.last_summarized_event_id = summary_action.last_summarized_event_id
def set_event_stream(self, event_stream: EventStream):
self._event_stream = event_stream
@@ -60,24 +67,29 @@ def get_events(self, reverse: bool = False) -> Iterable[Event]:
if self.end_id != -1
else self._event_stream.get_latest_event_id()
)
-
+ summary_yielded = False
for event in self._event_stream.get_events(
start_id=start_id,
end_id=end_id,
reverse=reverse,
filter_out_type=self.filter_out,
):
- # TODO add summaries
- # and filter out events that were included in a summary
-
# filter out the events from a delegate of the current agent
- if not any(
+ if (
+ self.summary is not None
+ and event.id <= self.last_summarized_event_id
+ and not summary_yielded
+ ):
+ summary_action = self.summary
+ summary_yielded = True
+ yield summary_action
+ elif not any(
# except for the delegate action and observation themselves, currently
# AgentDelegateAction has id = delegate_start
# AgentDelegateObservation has id = delegate_end
delegate_start < event.id < delegate_end
for delegate_start, delegate_end in self.delegates.keys()
- ):
+ ) and (event.id > self.last_summarized_event_id):
yield event
def get_last_action(self, end_id: int = -1) -> Action | None:
diff --git a/opendevin/memory/memory.py b/opendevin/memory/memory.py
index a175c36d3f95..f000060cc088 100644
--- a/opendevin/memory/memory.py
+++ b/opendevin/memory/memory.py
@@ -1,10 +1,5 @@
import threading
-import chromadb
-import llama_index.embeddings.openai.base as llama_openai
-from llama_index.core import Document, VectorStoreIndex
-from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.vector_stores.chroma import ChromaVectorStore
from openai._exceptions import APIConnectionError, InternalServerError, RateLimitError
from tenacity import (
retry,
@@ -17,94 +12,102 @@
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.utils import json
-# TODO: this could be made configurable
-num_retries: int = 10
-retry_min_wait: int = 3
-retry_max_wait: int = 300
-
-# llama-index includes a retry decorator around openai.get_embeddings() function
-# it is initialized with hard-coded values and errors
-# this non-customizable behavior is creating issues when it's retrying faster than providers' rate limits
-# this block attempts to banish it and replace it with our decorator, to allow users to set their own limits
-
-if hasattr(llama_openai.get_embeddings, '__wrapped__'):
- original_get_embeddings = llama_openai.get_embeddings.__wrapped__
-else:
- logger.warning('Cannot set custom retry limits.')
- num_retries = 1
- original_get_embeddings = llama_openai.get_embeddings
-
-
-def attempt_on_error(retry_state):
- logger.error(
- f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.',
- exc_info=False,
+try:
+ import chromadb
+ import llama_index.embeddings.openai.base as llama_openai
+ from llama_index.core import Document, VectorStoreIndex
+ from llama_index.core.retrievers import VectorIndexRetriever
+ from llama_index.vector_stores.chroma import ChromaVectorStore
+
+ LLAMA_INDEX_AVAILABLE = True
+except ImportError:
+ LLAMA_INDEX_AVAILABLE = False
+
+if LLAMA_INDEX_AVAILABLE:
+ # TODO: this could be made configurable
+ num_retries: int = 10
+ retry_min_wait: int = 3
+ retry_max_wait: int = 300
+
+ # llama-index includes a retry decorator around openai.get_embeddings() function
+ # it is initialized with hard-coded values and errors
+ # this non-customizable behavior is creating issues when it's retrying faster than providers' rate limits
+ # this block attempts to banish it and replace it with our decorator, to allow users to set their own limits
+
+ if hasattr(llama_openai.get_embeddings, '__wrapped__'):
+ original_get_embeddings = llama_openai.get_embeddings.__wrapped__
+ else:
+ logger.warning('Cannot set custom retry limits.')
+ num_retries = 1
+ original_get_embeddings = llama_openai.get_embeddings
+
+ def attempt_on_error(retry_state):
+ logger.error(
+ f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.',
+ exc_info=False,
+ )
+ return None
+
+ @retry(
+ reraise=True,
+ stop=stop_after_attempt(num_retries),
+ wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
+ retry=retry_if_exception_type(
+ (RateLimitError, APIConnectionError, InternalServerError)
+ ),
+ after=attempt_on_error,
)
- return None
-
-
-@retry(
- reraise=True,
- stop=stop_after_attempt(num_retries),
- wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
- retry=retry_if_exception_type(
- (RateLimitError, APIConnectionError, InternalServerError)
- ),
- after=attempt_on_error,
-)
-def wrapper_get_embeddings(*args, **kwargs):
- return original_get_embeddings(*args, **kwargs)
-
-
-llama_openai.get_embeddings = wrapper_get_embeddings
-
-
-class EmbeddingsLoader:
- """Loader for embedding model initialization."""
-
- @staticmethod
- def get_embedding_model(strategy: str, llm_config: LLMConfig):
- supported_ollama_embed_models = [
- 'llama2',
- 'mxbai-embed-large',
- 'nomic-embed-text',
- 'all-minilm',
- 'stable-code',
- ]
- if strategy in supported_ollama_embed_models:
- from llama_index.embeddings.ollama import OllamaEmbedding
-
- return OllamaEmbedding(
- model_name=strategy,
- base_url=llm_config.embedding_base_url,
- ollama_additional_kwargs={'mirostat': 0},
- )
- elif strategy == 'openai':
- from llama_index.embeddings.openai import OpenAIEmbedding
-
- return OpenAIEmbedding(
- model='text-embedding-ada-002',
- api_key=llm_config.api_key,
- )
- elif strategy == 'azureopenai':
- from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
-
- return AzureOpenAIEmbedding(
- model='text-embedding-ada-002',
- deployment_name=llm_config.embedding_deployment_name,
- api_key=llm_config.api_key,
- azure_endpoint=llm_config.base_url,
- api_version=llm_config.api_version,
- )
- elif (strategy is not None) and (strategy.lower() == 'none'):
- # TODO: this works but is not elegant enough. The incentive is when
- # an agent using embeddings is not used, there is no reason we need to
- # initialize an embedding model
- return None
- else:
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-
- return HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5')
+ def wrapper_get_embeddings(*args, **kwargs):
+ return original_get_embeddings(*args, **kwargs)
+
+ llama_openai.get_embeddings = wrapper_get_embeddings
+
+ class EmbeddingsLoader:
+ """Loader for embedding model initialization."""
+
+ @staticmethod
+ def get_embedding_model(strategy: str, llm_config: LLMConfig):
+ supported_ollama_embed_models = [
+ 'llama2',
+ 'mxbai-embed-large',
+ 'nomic-embed-text',
+ 'all-minilm',
+ 'stable-code',
+ ]
+ if strategy in supported_ollama_embed_models:
+ from llama_index.embeddings.ollama import OllamaEmbedding
+
+ return OllamaEmbedding(
+ model_name=strategy,
+ base_url=llm_config.embedding_base_url,
+ ollama_additional_kwargs={'mirostat': 0},
+ )
+ elif strategy == 'openai':
+ from llama_index.embeddings.openai import OpenAIEmbedding
+
+ return OpenAIEmbedding(
+ model='text-embedding-ada-002',
+ api_key=llm_config.api_key,
+ )
+ elif strategy == 'azureopenai':
+ from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
+
+ return AzureOpenAIEmbedding(
+ model='text-embedding-ada-002',
+ deployment_name=llm_config.embedding_deployment_name,
+ api_key=llm_config.api_key,
+ azure_endpoint=llm_config.base_url,
+ api_version=llm_config.api_version,
+ )
+ elif (strategy is not None) and (strategy.lower() == 'none'):
+ # TODO: this works but is not elegant enough. The incentive is when
+ # an agent using embeddings is not used, there is no reason we need to
+ # initialize an embedding model
+ return None
+ else:
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+ return HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5')
class LongTermMemory:
@@ -112,6 +115,12 @@ class LongTermMemory:
def __init__(self, llm_config: LLMConfig, memory_max_threads: int = 1):
"""Initialize the chromadb and set up ChromaVectorStore for later use."""
+ if not LLAMA_INDEX_AVAILABLE:
+ raise ImportError(
+ 'llama_index and its dependencies are not installed. '
+ 'To use LongTermMemory, please run: poetry install --with llama-index'
+ )
+
db = chromadb.Client(chromadb.Settings(anonymized_telemetry=False))
self.collection = db.get_or_create_collection(name='memories')
vector_store = ChromaVectorStore(chroma_collection=self.collection)
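The memory module now follows a common optional-dependency pattern: attempt the heavy imports once, record whether they succeeded, and raise a clear, actionable error only when the feature is actually constructed. A generic sketch of that pattern (the stand-in import is purely illustrative, not llama-index):

```python
try:
    import sqlite3  # stand-in for an optional heavy dependency such as llama-index
    OPTIONAL_DEP_AVAILABLE = True
except ImportError:
    OPTIONAL_DEP_AVAILABLE = False


class DemoLongTermMemory:
    """Hypothetical feature that only works when the optional dependency is present."""

    def __init__(self) -> None:
        if not OPTIONAL_DEP_AVAILABLE:
            raise ImportError(
                'The optional dependency is not installed. '
                'Install the relevant extras group to use this feature.'
            )
        self.conn = sqlite3.connect(':memory:')


if OPTIONAL_DEP_AVAILABLE:
    print('feature available:', DemoLongTermMemory().conn is not None)
else:
    print('optional feature disabled; the rest of the module still imports fine')
```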
diff --git a/opendevin/runtime/__init__.py b/opendevin/runtime/__init__.py
index f11157e8f0b3..26d690826f2b 100644
--- a/opendevin/runtime/__init__.py
+++ b/opendevin/runtime/__init__.py
@@ -1,16 +1,10 @@
-from .docker.local_box import LocalBox
-from .docker.ssh_box import DockerSSHBox
from .e2b.sandbox import E2BBox
from .sandbox import Sandbox
def get_runtime_cls(name: str):
# Local imports to avoid circular imports
- if name == 'server':
- from .server.runtime import ServerRuntime
-
- return ServerRuntime
- elif name == 'client':
+ if name == 'eventstream':
from .client.runtime import EventStreamRuntime
return EventStreamRuntime
diff --git a/opendevin/runtime/browser/browser_env.py b/opendevin/runtime/browser/browser_env.py
index ed0611f6b21a..7d060580bd9e 100644
--- a/opendevin/runtime/browser/browser_env.py
+++ b/opendevin/runtime/browser/browser_env.py
@@ -3,8 +3,6 @@
import io
import json
import multiprocessing
-import os
-import threading
import time
import uuid
@@ -12,55 +10,32 @@
import gymnasium as gym
import html2text
import numpy as np
+import tenacity
from browsergym.utils.obs import flatten_dom_to_str
from PIL import Image
from opendevin.core.exceptions import BrowserInitException
from opendevin.core.logger import opendevin_logger as logger
+BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
+BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
+
class BrowserEnv:
- def __init__(
- self,
- is_async: bool = True,
- browsergym_eval: str = '',
- browsergym_eval_save_dir: str = '',
- ):
+ def __init__(self, browsergym_eval_env: str | None = None):
self.html_text_converter = self.get_html_text_converter()
self.eval_mode = False
self.eval_dir = ''
- # EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
- self.browsergym_eval = browsergym_eval
- self.browsergym_eval_save_dir = browsergym_eval_save_dir
- if self.browsergym_eval:
- assert (
- self.browsergym_eval_save_dir
- ), 'browsergym_eval_save_dir must be provided for evaluation.'
- self.eval_mode = True
- self.eval_dir = os.path.join(
- self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
- )
- os.makedirs(self.eval_dir, exist_ok=True)
+
+ # EVAL only: browsergym_eval_env must be provided for evaluation
+ self.browsergym_eval_env = browsergym_eval_env
+ self.eval_mode = bool(browsergym_eval_env)
+
# Initialize browser environment process
multiprocessing.set_start_method('spawn', force=True)
self.browser_side, self.agent_side = multiprocessing.Pipe()
- self.process = multiprocessing.Process(
- target=self.browser_process,
- )
- try:
- self.original_cwd = os.getcwd()
- except FileNotFoundError:
- logger.warning(
- 'Current working directory does not exist. Using /tmp as fallback.'
- )
- self.original_cwd = '/tmp'
- os.chdir('/tmp')
-
- if is_async:
- threading.Thread(target=self.init_browser).start()
- else:
- self.init_browser()
+ self.init_browser()
atexit.register(self.close)
def get_html_text_converter(self):
@@ -74,20 +49,15 @@ def get_html_text_converter(self):
html_text_converter.body_width = 0
return html_text_converter
+ @tenacity.retry(
+ wait=tenacity.wait_fixed(1),
+ stop=tenacity.stop_after_attempt(5),
+ retry=tenacity.retry_if_exception_type(BrowserInitException),
+ )
def init_browser(self):
logger.info('Starting browser env...')
-
- # Ensure we're in a valid directory before starting the process
- try:
- os.chdir(self.original_cwd)
- logger.debug(f'Changed back to original directory: {self.original_cwd}')
- except Exception as e:
- logger.error(f'Failed to change to original directory: {e}')
- # If we can't change to the original directory, try to use a known valid directory
- os.chdir('/tmp')
- logger.debug('Changed to /tmp directory as fallback')
-
try:
+ self.process = multiprocessing.Process(target=self.browser_process)
self.process.start()
except Exception as e:
logger.error(f'Failed to start browser process: {e}')
@@ -99,8 +69,17 @@ def init_browser(self):
def browser_process(self):
if self.eval_mode:
- logger.info('Creating browser env for evaluation purpose.')
- env = gym.make(self.browsergym_eval)
+ assert self.browsergym_eval_env is not None
+ logger.info('Initializing browser env for web browsing evaluation.')
+ if 'webarena' in self.browsergym_eval_env:
+ import browsergym.webarena # noqa F401 register webarena tasks as gym environments
+ elif 'miniwob' in self.browsergym_eval_env:
+ import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
+ else:
+ raise ValueError(
+ f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
+ )
+ env = gym.make(self.browsergym_eval_env)
else:
env = gym.make(
'browsergym/openended',
@@ -109,20 +88,22 @@ def browser_process(self):
headless=True,
disable_env_checker=True,
)
+
obs, info = env.reset()
- # EVAL only: save the goal into file for evaluation
+
+ # EVAL ONLY: keep the goal in memory so it can be returned on request
+ self.eval_goal = None
+ self.eval_rewards: list[float] = []
if self.eval_mode:
- rewards = [] # store rewards if in eval mode
- logger.info(obs['goal'])
- with open(
- os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
- ) as f:
- f.write(obs['goal'])
+ logger.info(f"Browsing goal: {obs['goal']}")
+ self.eval_goal = obs['goal']
+
logger.info('Browser env started.')
while True:
try:
if self.browser_side.poll(timeout=0.01):
unique_request_id, action_data = self.browser_side.recv()
+
# shutdown the browser environment
if unique_request_id == 'SHUTDOWN':
logger.info('SHUTDOWN recv, shutting down browser env...')
@@ -131,17 +112,29 @@ def browser_process(self):
elif unique_request_id == 'IS_ALIVE':
self.browser_side.send(('ALIVE', None))
continue
+
+ # EVAL ONLY: Get evaluation info
+ if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION:
+ self.browser_side.send(
+ (unique_request_id, {'text_content': self.eval_goal})
+ )
+ continue
+ elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION:
+ self.browser_side.send(
+ (
+ unique_request_id,
+ {'text_content': json.dumps(self.eval_rewards)},
+ )
+ )
+ continue
+
action = action_data['action']
obs, reward, terminated, truncated, info = env.step(action)
- # EVAL only: save the rewards into file for evaluation
+
+ # EVAL ONLY: Accumulate the rewards in memory for evaluation
if self.eval_mode:
- rewards.append(reward)
- with open(
- os.path.join(self.eval_dir, 'rewards.json'),
- 'w',
- encoding='utf-8',
- ) as f:
- f.write(json.dumps(rewards))
+ self.eval_rewards.append(reward)
+
# add text content of the page
html_str = flatten_dom_to_str(obs['dom_object'])
obs['text_content'] = self.html_text_converter.handle(html_str)
@@ -159,6 +152,7 @@ def browser_process(self):
return
def step(self, action_str: str, timeout: float = 30) -> dict:
+ """Execute an action in the browser environment and return the observation."""
unique_request_id = str(uuid.uuid4())
self.agent_side.send((unique_request_id, {'action': action_str}))
start_time = time.time()
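The browser environment talks to its subprocess over a `multiprocessing.Pipe`, tagging each request with a UUID so stale replies can be ignored and reserving sentinel ids such as `SHUTDOWN`. A small, self-contained sketch of that request/response pattern (not the real `BrowserEnv`):

```python
import multiprocessing
import time
import uuid


def worker(side):
    # subprocess loop: answer each tagged request, stop on the SHUTDOWN sentinel
    while True:
        if side.poll(timeout=0.01):
            request_id, data = side.recv()
            if request_id == 'SHUTDOWN':
                return
            side.send((request_id, {'text_content': f"echo of {data['action']}"}))


if __name__ == '__main__':
    multiprocessing.set_start_method('spawn', force=True)
    browser_side, agent_side = multiprocessing.Pipe()
    process = multiprocessing.Process(target=worker, args=(browser_side,))
    process.start()

    request_id = str(uuid.uuid4())
    agent_side.send((request_id, {'action': 'noop()'}))
    start_time = time.time()
    while time.time() - start_time < 5:        # crude timeout loop, as in step()
        if agent_side.poll(timeout=0.01):
            response_id, obs = agent_side.recv()
            if response_id == request_id:      # ignore anything with a stale id
                print(obs)
                break

    agent_side.send(('SHUTDOWN', None))
    process.join()
```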
diff --git a/opendevin/runtime/browser/utils.py b/opendevin/runtime/browser/utils.py
index 8ca73eab0cfe..f7a629520d26 100644
--- a/opendevin/runtime/browser/utils.py
+++ b/opendevin/runtime/browser/utils.py
@@ -1,6 +1,9 @@
import os
+from browsergym.utils.obs import flatten_axtree_to_str
+
from opendevin.core.exceptions import BrowserUnavailableException
+from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.schema import ActionType
from opendevin.events.action import BrowseInteractiveAction, BrowseURLAction
from opendevin.events.observation import BrowserOutputObservation
@@ -28,25 +31,41 @@ async def browse(
raise ValueError(f'Invalid action type: {action.action}')
try:
- # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
+ # obs provided by BrowserGym:
+ # https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/env.py#L521
+ # https://github.com/ServiceNow/BrowserGym/blob/418421abdc5da4d77dc71d3b82a9e5e931be0c4f/browsergym/core/src/browsergym/core/env.py#L521
obs = browser.step(action_str)
+ try:
+ axtree_txt = flatten_axtree_to_str(
+ obs['axtree_object'], # accessibility tree object
+ extra_properties=obs[
+ 'extra_element_properties'
+ ], # extra element properties
+ with_clickable=True,
+ filter_visible_only=True,
+ )
+ except Exception as e:
+ logger.error(
+ f'Error when trying to process the accessibility tree: {e}, obs: {obs}'
+ )
+ axtree_txt = f'AX Error: {e}'
return BrowserOutputObservation(
content=obs['text_content'], # text content of the page
- open_pages_urls=obs['open_pages_urls'], # list of open pages
- active_page_index=obs['active_page_index'], # index of the active page
- dom_object=obs['dom_object'], # DOM object
- axtree_object=obs['axtree_object'], # accessibility tree object
- extra_element_properties=obs[
- 'extra_element_properties'
- ], # extra element properties
- last_browser_action=obs['last_action'], # last browser env action performed
- focused_element_bid=obs['focused_element_bid'], # focused element bid
- screenshot=obs['screenshot'], # base64-encoded screenshot, png
- url=obs['url'], # URL of the page
- error=True if obs['last_action_error'] else False, # error flag
- last_browser_action_error=obs[
- 'last_action_error'
- ], # last browser env action error
+ url=obs.get('url', ''), # URL of the page
+ screenshot=obs.get('screenshot', ''), # base64-encoded screenshot, png
+ open_pages_urls=obs.get('open_pages_urls', []), # list of open pages
+ active_page_index=obs.get(
+ 'active_page_index', -1
+ ), # index of the active page
+ axtree_txt=axtree_txt, # accessibility tree text
+ focused_element_bid=obs.get(
+ 'focused_element_bid', ''
+ ), # focused element bid
+ last_browser_action=obs.get(
+ 'last_action', ''
+ ), # last browser env action performed
+ last_browser_action_error=obs.get('last_action_error', ''),
+ error=True if obs.get('last_action_error') else False, # error flag
)
except Exception as e:
return BrowserOutputObservation(
diff --git a/opendevin/runtime/client/README.md b/opendevin/runtime/client/README.md
deleted file mode 100644
index 2040ab66599a..000000000000
--- a/opendevin/runtime/client/README.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# OpenDevin Runtime
-
-This README provides an overview of the OpenDevin Runtime, a crucial component of the OpenDevin system. It covers two main aspects:
-
-1. How the Runtime Image is Built: Explains the layered approach to creating Docker images for both production and development environments.
-2. How the Runtime Client Works: Details the functionality and architecture of the Runtime Client, which executes actions within the Docker sandbox.
-
-The following sections dive deeper into these topics, providing a comprehensive understanding of the OpenDevin Runtime system.
-
-## How the Runtime Image is Built
-
-The OpenDevin runtime uses a layered approach for building Docker images:
-
-1. **Original Image**: `ubuntu:22.04`
- - This is the base image used for all subsequent layers.
-
-2. **Runtime Image**: `od_runtime:ubuntu__22.04`
- - Built from the stable release of OpenDevin.
- - This is the primary runtime image that users will interact with.
- - Created by copying all OpenDevin code into the original image and installing dependencies using Poetry.
-
-3. **Dev Runtime Image**: `od_runtime_dev:ubuntu__22.04`
- - Built from local source code for development purposes.
-
-### Build Process
-
-#### Production Build (DEBUG=false)
-By default, when DEBUG is set to false, the build process only needs to run once:
-- The Runtime Image (`od_runtime:ubuntu__22.04`) is created by copying OpenDevin code into the original Ubuntu image and installing all dependencies.
-- This pre-built image is then used for running the OpenDevin environment.
-
-#### Development Build (DEBUG=true)
-When developing or modifying code that runs inside the container, you can set DEBUG=true to enable a more dynamic build process:
-- Every time you run the code, the existing image will be updated with the latest changes.
-- The Dev Runtime Image (`od_runtime_dev:ubuntu__22.04`) is rebuilt from the Runtime Image (`od_runtime:ubuntu__22.04`).
-- Most dependencies are already installed in the Runtime Image, so this process mainly updates the code and any new dependencies.
-- The rebuild process typically takes around 10 seconds, allowing for quick iterations during development.
-
-This approach allows developers to easily test changes to the OpenDevin codebase, including modifications to files like client.py, without needing to rebuild the entire image from scratch each time.
-
-## How the Runtime Client Works
-
-The Runtime Client is a crucial component of the OpenDevin system, responsible for executing actions within the Docker sandbox environment and producing observations. Here's an overview of its functionality:
-
-1. **Initialization**:
- - The `EventStreamRuntime` class in `runtime.py` initializes the Docker container and sets up the runtime environment.
-
-2. **Communication**:
- - The Runtime Client uses FastAPI to create a web server inside the Docker container.
- - It listens for incoming action requests from the OpenDevin backend.
-
-3. **Action Execution**:
- - When an action is received, the Runtime Client processes it based on its type:
- - `CmdRunAction`: Executes shell commands using a pexpect-spawned bash shell.
- - `FileReadAction` and `FileWriteAction`: Perform file operations within the sandbox.
- - `IPythonRunCellAction`: Executes Python code in an IPython environment.
- - `BrowseURLAction` and `BrowseInteractiveAction`: Handle web browsing tasks using a browser environment.
-
-4. **Plugin System**:
- - The Runtime Client supports a plugin system for extending functionality.
- - Plugins like JupyterPlugin can be loaded to provide additional features.
-
-5. **Observation Generation**:
- - After executing an action, the Runtime Client generates an appropriate observation.
- - Observations include command outputs, file contents, error messages, etc.
-
-6. **Asynchronous Operation**:
- - The Runtime Client uses asyncio for avoid concurrent requests.
- - It ensures that only one action is executed at a time using a semaphore.
-
-7. **Security**:
- - All actions are executed within the confined Docker environment, providing a sandbox for safe execution.
-
-8. **Flexibility**:
- - The system supports both production (DEBUG=false) and development (DEBUG=true) modes.
- - In development mode, the runtime image can be updated with the latest code changes for testing and debugging.
-
-
-
-## Architecture Diagram
-
-```
-+-------------------+ +-------------------+
-| OpenDevin | | Docker Host |
-| Backend | | |
-| | | +-------------+ |
-| +-------------+ | | | Runtime | |
-| | EventStream | | | | Container | |
-| | Runtime |<-|-----|->| | |
-| +-------------+ | | | +-------+ | |
-| | | | |Runtime| | |
-| | | | |Client | | |
-| | | | +-------+ | |
-| | | | | | |
-| | | | +-------+ | |
-| | | | |Plugins| | |
-| | | | +-------+ | |
-| | | +-------------+ |
-+-------------------+ +-------------------+
-```
-
-This diagram illustrates the high-level architecture of the OpenDevin Runtime system:
-
-1. The OpenDevin Backend communicates with the Docker Host through the EventStreamRuntime.
-2. The Docker Host runs a Runtime Container, which includes:
- - The Runtime Client: Handles incoming actions and generates observations.
- - Plugins: Extend the functionality of the Runtime Client.
-3. The Runtime Client executes actions within the sandboxed environment of the Docker container.
-
-This architecture ensures a secure and flexible environment for executing AI-driven development tasks, allowing OpenDevin to execute a wide range of actions safely and efficiently.
diff --git a/opendevin/runtime/client/client.py b/opendevin/runtime/client/client.py
index c875ad494eab..4919bcc8a4db 100644
--- a/opendevin/runtime/client/client.py
+++ b/opendevin/runtime/client/client.py
@@ -3,20 +3,23 @@
It is responsible for executing actions received from OpenDevin backend and producing observations.
NOTE: this will be executed inside the docker sandbox.
-
-If you already have pre-build docker image yet you changed the code in this file OR dependencies, you need to rebuild the docker image to update the source code.
-
-You should add SANDBOX_UPDATE_SOURCE_CODE=True to any `python XXX.py` command you run to update the source code.
"""
import argparse
import asyncio
import os
import re
+import shutil
+import subprocess
+import sys
+from contextlib import asynccontextmanager
from pathlib import Path
import pexpect
-from fastapi import FastAPI, HTTPException, Request
+from fastapi import FastAPI, HTTPException, Request, UploadFile
+from fastapi.responses import JSONResponse
+from pathspec import PathSpec
+from pathspec.patterns import GitWildMatchPattern
from pexpect import EOF, TIMEOUT, ExceptionPexpect
from pydantic import BaseModel
from uvicorn import run
@@ -36,6 +39,7 @@
ErrorObservation,
FileReadObservation,
FileWriteObservation,
+ IPythonRunCellObservation,
Observation,
)
from opendevin.events.serialization import event_from_dict, event_to_dict
@@ -46,33 +50,128 @@
JupyterPlugin,
Plugin,
)
-from opendevin.runtime.server.files import insert_lines, read_lines
-
-app = FastAPI()
+from opendevin.runtime.utils import split_bash_commands
+from opendevin.runtime.utils.files import insert_lines, read_lines
class ActionRequest(BaseModel):
action: dict
+ROOT_GID = 0
+INIT_COMMANDS = [
+ 'git config --global user.name "opendevin"',
+ 'git config --global user.email "opendevin@all-hands.dev"',
+ "alias git='git --no-pager'",
+ 'export TERM=xterm-256color',
+ "export PATH=/opendevin/poetry/$(ls /opendevin/poetry | sed -n '2p')/bin:$PATH",
+]
+
+
class RuntimeClient:
"""RuntimeClient is running inside docker sandbox.
It is responsible for executing actions received from OpenDevin backend and producing observations.
"""
- def __init__(self, plugins_to_load: list[Plugin], work_dir: str) -> None:
- self._init_bash_shell(work_dir)
+ def __init__(
+ self,
+ plugins_to_load: list[Plugin],
+ work_dir: str,
+ username: str,
+ user_id: int,
+ browsergym_eval_env: str | None,
+ ) -> None:
+ self.plugins_to_load = plugins_to_load
+ self.username = username
+ self.user_id = user_id
+ self.pwd = work_dir # current PWD
+ self._initial_pwd = work_dir
+ self._init_user(self.username, self.user_id)
+ self._init_bash_shell(self.pwd, self.username)
self.lock = asyncio.Lock()
self.plugins: dict[str, Plugin] = {}
- self.browser = BrowserEnv()
+ self.browser = BrowserEnv(browsergym_eval_env)
+ self._initial_pwd = work_dir
- for plugin in plugins_to_load:
- plugin.initialize()
+ @property
+ def initial_pwd(self):
+ return self._initial_pwd
+
+ async def ainit(self):
+ for plugin in self.plugins_to_load:
+ await plugin.initialize(self.username)
self.plugins[plugin.name] = plugin
logger.info(f'Initializing plugin: {plugin.name}')
- def _init_bash_shell(self, work_dir: str) -> None:
- self.shell = pexpect.spawn('/bin/bash', encoding='utf-8', echo=False)
+ if isinstance(plugin, JupyterPlugin):
+ await self.run_ipython(
+ IPythonRunCellAction(code=f'import os; os.chdir("{self.pwd}")')
+ )
+
+ # This is a temporary workaround
+ # TODO: refactor AgentSkills to be part of JupyterPlugin
+ # AFTER ServerRuntime is deprecated
+ if 'agent_skills' in self.plugins and 'jupyter' in self.plugins:
+ obs = await self.run_ipython(
+ IPythonRunCellAction(code='from agentskills import *')
+ )
+ logger.info(f'AgentSkills initialized: {obs}')
+
+ await self._init_bash_commands()
+
+ def _init_user(self, username: str, user_id: int) -> None:
+ """Create user if not exists."""
+ # Skip root since it is already created
+ if username == 'root':
+ return
+
+ try:
+ subprocess.run(['id', username], stdout=subprocess.DEVNULL, check=True)
+ return
+ except subprocess.CalledProcessError:
+ pass
+ # Add sudoer
+ sudoer_line = r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"
+ output = subprocess.run(sudoer_line, shell=True, capture_output=True)
+ if output.returncode != 0:
+ raise RuntimeError(f'Failed to add sudoer: {output.stderr.decode()}')
+ logger.debug(f'Added sudoer successfully. Output: [{output.stdout.decode()}]')
+
+ # Add the user; create the initial working directory if missing and give the user ownership of it
+ command = (
+ f'useradd -rm -d /home/{username} -s /bin/bash '
+ f'-g root -G sudo -o -u {user_id} {username}'
+ )
+
+ if not os.path.exists(self.initial_pwd):
+ command += f' && mkdir -p {self.initial_pwd}'
+ command += f' && chown -R {username}:root {self.initial_pwd}'
+ command += f' && chmod g+s {self.initial_pwd}'
+
+ output = subprocess.run(
+ command,
+ shell=True,
+ capture_output=True,
+ )
+ if output.returncode != 0:
+ raise RuntimeError(
+ f'Failed to create user {username}: {output.stderr.decode()}'
+ )
+
+ logger.debug(
+ f'Added user {username} successfully. Output: [{output.stdout.decode()}]'
+ )
+
+ command = 'deluser pn'
+ output = subprocess.run(command, shell=True, capture_output=True)
+ logger.debug(f'Output: {output.stdout.decode()}')
+
+ def _init_bash_shell(self, work_dir: str, username: str) -> None:
+ self.shell = pexpect.spawn(
+ f'su - {username}',
+ encoding='utf-8',
+ echo=False,
+ )
self.__bash_PS1 = r'[PEXPECT_BEGIN] \u@\h:\w [PEXPECT_END]'
# This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
@@ -80,20 +179,47 @@ def _init_bash_shell(self, work_dir: str) -> None:
r'\[PEXPECT_BEGIN\] ([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+) \[PEXPECT_END\]'
)
- self.shell.sendline(f'export PS1="{self.__bash_PS1}"')
+ self.shell.sendline(f'export PS1="{self.__bash_PS1}"; export PS2=""')
self.shell.expect(self.__bash_expect_regex)
self.shell.sendline(f'cd {work_dir}')
self.shell.expect(self.__bash_expect_regex)
+ logger.debug(
+ f'Bash initialized. Working directory: {work_dir}. Output: {self.shell.before}'
+ )
+
+ async def _init_bash_commands(self):
+ logger.info(f'Initializing by running {len(INIT_COMMANDS)} bash commands...')
+ for command in INIT_COMMANDS:
+ action = CmdRunAction(command=command)
+ action.timeout = 300
+ logger.debug(f'Executing init command: {command}')
+ obs: CmdOutputObservation = await self.run(action)
+ logger.debug(
+ f'Init command outputs (exit code: {obs.exit_code}): {obs.content}'
+ )
+ assert obs.exit_code == 0
- def _get_bash_prompt(self):
+ logger.info('Bash init commands completed')
+
+ def _get_bash_prompt_and_update_pwd(self):
ps1 = self.shell.after
+
+ # begin at the last occurrence of '[PEXPECT_BEGIN]'.
+ # In multi-line bash commands, the prompt will be repeated
+ # and the matched regex captures all of them
+ # - we only want the last one (newest prompt)
+ _begin_pos = ps1.rfind('[PEXPECT_BEGIN]')
+ if _begin_pos != -1:
+ ps1 = ps1[_begin_pos:]
+
# parse the ps1 to get username, hostname, and working directory
matched = re.match(self.__bash_expect_regex, ps1)
assert (
matched is not None
), f'Failed to parse bash prompt: {ps1}. This should not happen.'
username, hostname, working_dir = matched.groups()
+ self.pwd = os.path.expanduser(working_dir)
# re-assemble the prompt
prompt = f'{username}@{hostname}:{working_dir} '
@@ -106,24 +232,23 @@ def _get_bash_prompt(self):
def _send_interrupt(
self,
command: str,
- prev_output: str = '',
- ignore_last_output: bool = False,
+ timeout: int | None,
) -> tuple[str, int]:
logger.exception(
f'Command "{command}" timed out, killing process...', exc_info=False
)
# send a SIGINT to the process
self.shell.sendintr()
- self.shell.prompt()
- command_output = prev_output
- if not ignore_last_output:
- command_output += '\n' + self.shell.before
+ self.shell.expect(self.__bash_expect_regex, timeout=timeout)
+ command_output = self.shell.before
return (
f'Command: "{command}" timed out. Sent SIGINT to the process: {command_output}',
- -1,
+ 130,
)
- def _execute_bash(self, command, keep_prompt: bool = True) -> tuple[str, int]:
+ def _execute_bash(
+ self, command: str, timeout: int | None, keep_prompt: bool = True
+ ) -> tuple[str, int]:
logger.debug(f'Executing command: {command}')
self.shell.sendline(command)
@@ -156,54 +281,147 @@ def _execute_bash(self, command, keep_prompt: bool = True) -> tuple[str, int]:
logger.exception(
'Command timed out, killing process...', exc_info=False
)
- return self._send_interrupt(command)
+ return self._send_interrupt(command, timeout=timeout)
except ExceptionPexpect as e:
logger.exception(f'Unexpected exception: {e}')
break
output = command_output.strip()
if keep_prompt:
- output += '\r\n' + self._get_bash_prompt()
- logger.debug(f'Command output: {output}')
+ output += '\r\n' + self._get_bash_prompt_and_update_pwd()
# Get exit code
self.shell.sendline('echo $?')
logger.debug(f'Executing command for exit code: {command}')
- self.shell.expect(self.__bash_expect_regex)
+ self.shell.expect(self.__bash_expect_regex, timeout=timeout)
_exit_code_output = self.shell.before
logger.debug(f'Exit code Output: {_exit_code_output}')
exit_code = int(_exit_code_output.strip())
+ logger.debug(f'Command output: {output}')
+
return output, exit_code
async def run_action(self, action) -> Observation:
action_type = action.action
observation = await getattr(self, action_type)(action)
- observation._parent = action.id
return observation
async def run(self, action: CmdRunAction) -> CmdOutputObservation:
try:
- output, exit_code = self._execute_bash(action.command)
+ assert (
+ action.timeout is not None
+ ), f'Timeout argument is required for CmdRunAction: {action}'
+ commands = split_bash_commands(action.command)
+ all_output = ''
+ for command in commands:
+ output, exit_code = self._execute_bash(
+ command,
+ timeout=action.timeout,
+ keep_prompt=action.keep_prompt,
+ )
+ if command.startswith('pip install'):
+ output = await self.parse_pip_output(command, output)
+ if all_output:
+ # previous output already exists with the prompt "user@hostname:working_dir #"
+ # we need to add the command to the previous output,
+ # so the model knows the following is the output of another action
+ all_output = all_output.rstrip() + ' ' + command + '\r\n'
+
+ all_output += str(output) + '\r\n'
+ if exit_code != 0:
+ break
+
return CmdOutputObservation(
command_id=-1,
- content=str(output),
+ content=all_output.rstrip('\r\n'),
command=action.command,
exit_code=exit_code,
)
except UnicodeDecodeError:
raise RuntimeError('Command output could not be decoded as utf-8')
+ async def restart_kernel(self) -> str:
+ if 'agent_skills' in self.plugins:
+ kernel_init_code = 'from agentskills import *'
+ else:
+ return ''
+
+ jupyter_plugin: JupyterPlugin = self.plugins['jupyter'] # type: ignore
+ restart_kernel_code = (
+ 'import IPython\nIPython.Application.instance().kernel.do_shutdown(True)'
+ )
+ act = IPythonRunCellAction(code=restart_kernel_code)
+ obs = await jupyter_plugin.run(act)
+ output = obs.content
+ if "{'status': 'ok', 'restart': True}" != output.strip():
+ print(output)
+ output = '\n[Failed to restart the kernel]'
+ else:
+ output = '\n[Kernel restarted successfully]'
+
+ # re-init the kernel after restart
+ act = IPythonRunCellAction(code=kernel_init_code)
+ await jupyter_plugin.run(act)
+ return output
+
+ async def parse_pip_output(self, code, output) -> str:
+ print(output)
+ package_names = code.split(' ', 2)[-1]
+ is_single_package = ' ' not in package_names
+ parsed_output = output
+ if 'Successfully installed' in output:
+ parsed_output = '[Package installed successfully]'
+ if (
+ 'Note: you may need to restart the kernel to use updated packages.'
+ in output
+ ):
+ parsed_output += await self.restart_kernel()
+ else:
+ # restart kernel if installed via bash too
+ await self.restart_kernel()
+ elif (
+ is_single_package
+ and f'Requirement already satisfied: {package_names}' in output
+ ):
+ parsed_output = '[Package already installed]'
+
+ prompt_output = self._get_bash_prompt_and_update_pwd()
+ return parsed_output + '\r\n' + prompt_output
+
async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
if 'jupyter' in self.plugins:
_jupyter_plugin: JupyterPlugin = self.plugins['jupyter'] # type: ignore
- return await _jupyter_plugin.run(action)
+ # This is used to make AgentSkills in Jupyter aware of the
+ # current working directory in Bash
+ if self.pwd != getattr(self, '_jupyter_pwd', None):
+ logger.debug(
+ f"{self.pwd} != {getattr(self, '_jupyter_pwd', None)} -> reset Jupyter PWD"
+ )
+ reset_jupyter_pwd_code = f'import os; os.environ["JUPYTER_PWD"] = os.path.abspath("{self.pwd}")'
+ _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
+ _reset_obs = await _jupyter_plugin.run(_aux_action)
+ logger.debug(
+ f'Changed working directory in IPython to: {self.pwd}. Output: {_reset_obs}'
+ )
+ self._jupyter_pwd = self.pwd
+
+ if 'app.run' in action.code.strip().split('\n')[-1]:
+ return ErrorObservation(
+ "Don't run Flask app in Jupyter notebook. Save the code to a file and run it in the terminal."
+ )
+ action.code = action.code.replace('!pip', '%pip')
+ obs: IPythonRunCellObservation = await _jupyter_plugin.run(action)
+ if 'pip install' in action.code:
+ obs.content = await self.parse_pip_output(action.code, obs.content)
+ return obs
else:
raise RuntimeError(
'JupyterRequirement not found. Unable to run IPython action.'
)
- def get_working_directory(self):
- result, exit_code = self._execute_bash('pwd', keep_prompt=False)
+ def _get_working_directory(self):
+ # NOTE: this is part of initialization, so we hard code the timeout
+ result, exit_code = self._execute_bash('pwd', timeout=60, keep_prompt=False)
if exit_code != 0:
raise RuntimeError('Failed to get working directory')
return result.strip()
@@ -217,7 +435,7 @@ def _resolve_path(self, path: str, working_dir: str) -> str:
async def read(self, action: FileReadAction) -> Observation:
# NOTE: the client code is running inside the sandbox,
# so there's no need to check permission
- working_dir = self.get_working_directory()
+ working_dir = self._get_working_directory()
filepath = self._resolve_path(action.path, working_dir)
try:
with open(filepath, 'r', encoding='utf-8') as file:
@@ -237,14 +455,21 @@ async def read(self, action: FileReadAction) -> Observation:
return FileReadObservation(path=filepath, content=code_view)
async def write(self, action: FileWriteAction) -> Observation:
- working_dir = self.get_working_directory()
+ working_dir = self._get_working_directory()
filepath = self._resolve_path(action.path, working_dir)
insert = action.content.split('\n')
try:
if not os.path.exists(os.path.dirname(filepath)):
os.makedirs(os.path.dirname(filepath))
- mode = 'w' if not os.path.exists(filepath) else 'r+'
+
+ file_exists = os.path.exists(filepath)
+ if file_exists:
+ file_stat = os.stat(filepath)
+ else:
+ file_stat = None
+
+ mode = 'w' if not file_exists else 'r+'
try:
with open(filepath, mode, encoding='utf-8') as file:
if mode != 'w':
@@ -258,6 +483,19 @@ async def write(self, action: FileWriteAction) -> Observation:
file.seek(0)
file.writelines(new_file)
file.truncate()
+
+ # Handle file permissions
+ if sys.platform != 'win32':
+ if file_exists:
+ assert file_stat is not None
+ # restore the original file permissions if the file already exists
+ os.chmod(filepath, file_stat.st_mode)
+ os.chown(filepath, file_stat.st_uid, file_stat.st_gid)
+ else:
+ # set the new file permissions if the file is new
+ os.chmod(filepath, 0o644)
+ os.chown(filepath, self.user_id, self.user_id)
+
except FileNotFoundError:
return ErrorObservation(f'File not found: {filepath}')
except IsADirectoryError:
@@ -283,27 +521,21 @@ def close(self):
self.browser.close()
-# def test_run_commond():
-# client = RuntimeClient()
-# command = CmdRunAction(command='ls -l')
-# obs = client.run_action(command)
-# print(obs)
-
-# def test_shell(message):
-# shell = pexpect.spawn('/bin/bash', encoding='utf-8')
-# shell.expect(r'[$#] ')
-# print(f'Received command: {message}')
-# shell.sendline(message)
-# shell.expect(r'[$#] ')
-# output = shell.before.strip().split('\r\n', 1)[1].strip()
-# print(f'Output: {output}')
-# shell.close()
-
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('port', type=int, help='Port to listen on')
parser.add_argument('--working-dir', type=str, help='Working directory')
parser.add_argument('--plugins', type=str, help='Plugins to initialize', nargs='+')
+ parser.add_argument(
+ '--username', type=str, help='User to run as', default='opendevin'
+ )
+ parser.add_argument('--user-id', type=int, help='User ID to run as', default=1000)
+ parser.add_argument(
+ '--browsergym-eval-env',
+ type=str,
+ help='BrowserGym environment used for browser evaluation',
+ default=None,
+ )
# example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
args = parser.parse_args()
@@ -314,16 +546,35 @@ def close(self):
raise ValueError(f'Plugin {plugin} not found')
plugins_to_load.append(ALL_PLUGINS[plugin]()) # type: ignore
- client = RuntimeClient(plugins_to_load, work_dir=args.working_dir)
+ client: RuntimeClient | None = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
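+        # Create and initialize the RuntimeClient when the server starts, and
+        # close it to release its resources on shutdown.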
+ global client
+ client = RuntimeClient(
+ plugins_to_load,
+ work_dir=args.working_dir,
+ username=args.username,
+ user_id=args.user_id,
+ browsergym_eval_env=args.browsergym_eval_env,
+ )
+ await client.ainit()
+ yield
+ # Clean up & release the resources
+ client.close()
+
+ app = FastAPI(lifespan=lifespan)
@app.middleware('http')
async def one_request_at_a_time(request: Request, call_next):
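+        # Serialize request handling: the client's lock ensures only one action
+        # runs inside the sandbox at a time.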
+ assert client is not None
async with client.lock:
response = await call_next(request)
return response
@app.post('/execute_action')
async def execute_action(action_request: ActionRequest):
+ assert client is not None
try:
action = event_from_dict(action_request.action)
if not isinstance(action, Action):
@@ -334,10 +585,188 @@ async def execute_action(action_request: ActionRequest):
logger.error(f'Error processing command: {str(e)}')
raise HTTPException(status_code=500, detail=str(e))
+ @app.post('/upload_file')
+ async def upload_file(
+ file: UploadFile, destination: str = '/', recursive: bool = False
+ ):
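+        """Save an uploaded file under `destination`; when `recursive` is true, the
+        upload must be a zip archive, which is extracted in place."""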
+ assert client is not None
+
+ try:
+ # Ensure the destination directory exists
+ if not os.path.isabs(destination):
+ raise HTTPException(
+ status_code=400, detail='Destination must be an absolute path'
+ )
+
+ full_dest_path = destination
+ if not os.path.exists(full_dest_path):
+ os.makedirs(full_dest_path, exist_ok=True)
+
+ if recursive:
+ # For recursive uploads, we expect a zip file
+ if not file.filename.endswith('.zip'):
+ raise HTTPException(
+ status_code=400, detail='Recursive uploads must be zip files'
+ )
+
+ zip_path = os.path.join(full_dest_path, file.filename)
+ with open(zip_path, 'wb') as buffer:
+ shutil.copyfileobj(file.file, buffer)
+
+ # Extract the zip file
+ shutil.unpack_archive(zip_path, full_dest_path)
+ os.remove(zip_path) # Remove the zip file after extraction
+
+ logger.info(
+ f'Uploaded file {file.filename} and extracted to {destination}'
+ )
+ else:
+ # For single file uploads
+ file_path = os.path.join(full_dest_path, file.filename)
+ with open(file_path, 'wb') as buffer:
+ shutil.copyfileobj(file.file, buffer)
+ logger.info(f'Uploaded file {file.filename} to {destination}')
+
+ return JSONResponse(
+ content={
+ 'filename': file.filename,
+ 'destination': destination,
+ 'recursive': recursive,
+ },
+ status_code=200,
+ )
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
@app.get('/alive')
async def alive():
return {'status': 'ok'}
+ # ================================
+ # File-specific operations for UI
+ # ================================
+
+ @app.post('/list_files')
+ async def list_files(request: Request):
+ """List files in the specified path.
+
+ This function retrieves a list of files from the agent's runtime file store,
+ excluding certain system and hidden files/directories.
+
+ To list files:
+ ```sh
+ curl http://localhost:3000/api/list-files
+ ```
+
+ Args:
+ request (Request): The incoming request object.
+        path (str, optional): The path to list files from, read from the request
+            body. Defaults to the client's initial working directory.
+
+ Returns:
+ list: A list of file names in the specified path.
+
+ Raises:
+ HTTPException: If there's an error listing the files.
+ """
+ assert client is not None
+
+ # get request as dict
+ request_dict = await request.json()
+ path = request_dict.get('path', None)
+
+ # Get the full path of the requested directory
+ if path is None:
+ full_path = client.initial_pwd
+ elif os.path.isabs(path):
+ full_path = path
+ else:
+ full_path = os.path.join(client.initial_pwd, path)
+
+ if not os.path.exists(full_path):
+ return JSONResponse(
+ content={'error': f'Directory {full_path} does not exist'},
+ status_code=400,
+ )
+
+ try:
+ # Check if the directory exists
+ if not os.path.exists(full_path) or not os.path.isdir(full_path):
+ return []
+
+ # Check if .gitignore exists
+ gitignore_path = os.path.join(full_path, '.gitignore')
+ if os.path.exists(gitignore_path):
+ # Use PathSpec to parse .gitignore
+ with open(gitignore_path, 'r') as f:
+ spec = PathSpec.from_lines(GitWildMatchPattern, f.readlines())
+ else:
+ # Fallback to default exclude list if .gitignore doesn't exist
+ default_exclude = [
+ '.git',
+ '.DS_Store',
+ '.svn',
+ '.hg',
+ '.idea',
+ '.vscode',
+ '.settings',
+ '.pytest_cache',
+ '__pycache__',
+ 'node_modules',
+ 'vendor',
+ 'build',
+ 'dist',
+ 'bin',
+ 'logs',
+ 'log',
+ 'tmp',
+ 'temp',
+ 'coverage',
+ 'venv',
+ 'env',
+ ]
+ spec = PathSpec.from_lines(GitWildMatchPattern, default_exclude)
+
+ entries = os.listdir(full_path)
+
+ # Filter entries using PathSpec
+ filtered_entries = [
+ os.path.join(full_path, entry)
+ for entry in entries
+ if not spec.match_file(os.path.relpath(entry, str(full_path)))
+ ]
+
+ # Separate directories and files
+ directories = []
+ files = []
+ for entry in filtered_entries:
+ # Remove leading slash and any parent directory components
+ entry_relative = entry.lstrip('/').split('/')[-1]
+
+ # Construct the full path by joining the base path with the relative entry path
+ full_entry_path = os.path.join(full_path, entry_relative)
+ if os.path.exists(full_entry_path):
+ is_dir = os.path.isdir(full_entry_path)
+ if is_dir:
+                    # add a trailing slash to directories, which the frontend
+                    # requires to differentiate directories from files
+ entry = entry.rstrip('/') + '/'
+ directories.append(entry)
+ else:
+ files.append(entry)
+
+ # Sort directories and files separately
+ directories.sort(key=lambda s: s.lower())
+ files.sort(key=lambda s: s.lower())
+
+ # Combine sorted directories and files
+ sorted_entries = directories + files
+ return sorted_entries
+
+ except Exception as e:
+ logger.error(f'Error listing files: {e}', exc_info=True)
+ return []
+
logger.info(f'Starting action execution API on port {args.port}')
print(f'Starting action execution API on port {args.port}')
run(app, host='0.0.0.0', port=args.port)
diff --git a/opendevin/runtime/client/runtime.py b/opendevin/runtime/client/runtime.py
index 100d18df9036..c90dfcc4b92b 100644
--- a/opendevin/runtime/client/runtime.py
+++ b/opendevin/runtime/client/runtime.py
@@ -1,6 +1,9 @@
import asyncio
+import os
+import tempfile
import uuid
from typing import Optional
+from zipfile import ZipFile
import aiohttp
import docker
@@ -8,7 +11,7 @@
from opendevin.core.config import AppConfig
from opendevin.core.logger import opendevin_logger as logger
-from opendevin.events import EventSource, EventStream
+from opendevin.events import EventStream
from opendevin.events.action import (
BrowseInteractiveAction,
BrowseURLAction,
@@ -18,7 +21,6 @@
IPythonRunCellAction,
)
from opendevin.events.action.action import Action
-from opendevin.events.event import Event
from opendevin.events.observation import (
ErrorObservation,
NullObservation,
@@ -44,47 +46,76 @@ def __init__(
config: AppConfig,
event_stream: EventStream,
sid: str = 'default',
- container_image: str | None = None,
plugins: list[PluginRequirement] | None = None,
+ container_image: str | None = None,
):
- super().__init__(config, event_stream, sid) # will initialize the event stream
- self._port = find_available_tcp_port()
+ super().__init__(
+ config, event_stream, sid, plugins
+ ) # will initialize the event stream
+ self.persist_sandbox = self.config.sandbox.persist_sandbox
+ self.fast_boot = self.config.sandbox.fast_boot
+ if self.persist_sandbox:
+ user = 'od' if self.config.run_as_devin else 'root'
+ path = config.workspace_mount_path
+ path = ''.join(c if c.isalnum() else '_' for c in path) # type: ignore
+ self.instance_id = f'persisted-{user}-{path}'
+ self._port = self.config.sandbox.port
+ else:
+ self.instance_id = (sid or '') + str(uuid.uuid4())
+ self._port = find_available_tcp_port()
self.api_url = f'http://localhost:{self._port}'
+ self.api_url = f'http://{self.config.sandbox.api_hostname}:{self._port}'
self.session: Optional[aiohttp.ClientSession] = None
- self.instance_id = (
- sid + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
- )
# TODO: We can switch to aiodocker when `get_od_sandbox_image` is updated to use aiodocker
self.docker_client: docker.DockerClient = self._init_docker_client()
self.container_image = (
- config.sandbox.container_image
+ self.config.sandbox.container_image
if container_image is None
else container_image
)
self.container_name = self.container_name_prefix + self.instance_id
- self.plugins = plugins if plugins is not None else []
self.container = None
self.action_semaphore = asyncio.Semaphore(1) # Ensure one action at a time
+ logger.debug(f'EventStreamRuntime `{sid}` config:\n{self.config}')
async def ainit(self, env_vars: dict[str, str] | None = None):
- self.container_image = build_runtime_image(
- self.container_image,
- self.docker_client,
- # NOTE: You can need set DEBUG=true to update the source code
- # inside the container. This is useful when you want to test/debug the
- # latest code in the runtime docker container.
- update_source_code=self.config.sandbox.update_source_code,
- )
- self.container = await self._init_container(
- self.sandbox_workspace_dir,
- mount_dir=self.config.workspace_mount_path,
- plugins=self.plugins,
- )
- # MUST call super().ainit() to initialize both default env vars
- # AND the ones in env vars!
- await super().ainit(env_vars)
+ if self.config.sandbox.od_runtime_extra_deps:
+ logger.info(
+ f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.od_runtime_extra_deps}'
+ )
+ try:
+ docker.DockerClient().containers.get(self.container_name)
+ self.is_initial_session = False
+ except docker.errors.NotFound:
+ self.is_initial_session = True
+
+ if self.is_initial_session:
+ logger.info('Creating new Docker container')
+ self.container_image = build_runtime_image(
+ self.container_image,
+ self.docker_client,
+ extra_deps=self.config.sandbox.od_runtime_extra_deps,
+ )
+ self.container = await self._init_container(
+ self.sandbox_workspace_dir,
+ mount_dir=self.config.workspace_mount_path,
+ plugins=self.plugins,
+ )
+ # MUST call super().ainit() to initialize both default env vars
+ # AND the ones in env vars!
+ await super().ainit(env_vars)
+
+ logger.info(
+ f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}'
+ )
+ logger.info(f'Container initialized with env vars: {env_vars}')
+
+ else:
+ logger.info('Using existing Docker container')
+ self.container = self.docker_client.containers.get(self.container_name)
+ await self.start_docker_container()
@staticmethod
def _init_docker_client() -> docker.DockerClient:
@@ -103,16 +134,18 @@ def _init_docker_client() -> docker.DockerClient:
async def _init_container(
self,
sandbox_workspace_dir: str,
- mount_dir: str,
+ mount_dir: str | None = None,
plugins: list[PluginRequirement] | None = None,
):
try:
logger.info(
- f'Starting container with image: {self.container_image} and name: {self.container_name}'
+ f'Starting container with image: {self.container_image} and name: {self.container_name} with port: {self._port}'
)
- if plugins is None:
- plugins = []
- plugin_names = ' '.join([plugin.name for plugin in plugins])
+ plugin_arg = ''
+ if plugins is not None and len(plugins) > 0:
+ plugin_arg = (
+ f'--plugins {" ".join([plugin.name for plugin in plugins])} '
+ )
network_mode: str | None = None
port_mapping: dict[str, int] | None = None
@@ -124,6 +157,21 @@ async def _init_container(
else:
port_mapping = {f'{self._port}/tcp': self._port}
+ if mount_dir is not None:
+ volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
+ logger.info(f'Mount dir: {sandbox_workspace_dir}')
+ else:
+            logger.warning(
+ 'Mount dir is not set, will not mount the workspace directory to the container.'
+ )
+ volumes = None
+
+ if self.config.sandbox.browsergym_eval_env is not None:
+ browsergym_arg = (
+ f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}'
+ )
+ else:
+ browsergym_arg = ''
container = self.docker_client.containers.run(
self.container_image,
command=(
@@ -131,7 +179,10 @@ async def _init_container(
'PYTHONUNBUFFERED=1 poetry run '
f'python -u -m opendevin.runtime.client.client {self._port} '
f'--working-dir {sandbox_workspace_dir} '
- f'--plugins {plugin_names}'
+ f'{plugin_arg}'
+ f'--username {"opendevin" if self.config.run_as_devin else "root"} '
+ f'--user-id {self.config.sandbox.user_id} '
+ f'{browsergym_arg}'
),
network_mode=network_mode,
ports=port_mapping,
@@ -139,7 +190,7 @@ async def _init_container(
name=self.container_name,
detach=True,
environment={'DEBUG': 'true'} if self.config.debug else None,
- volumes={mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}},
+ volumes=volumes,
)
logger.info(f'Container started. Server url: {self.api_url}')
return container
@@ -150,31 +201,61 @@ async def _init_container(
raise e
async def _ensure_session(self):
+ await asyncio.sleep(1)
if self.session is None or self.session.closed:
self.session = aiohttp.ClientSession()
return self.session
@tenacity.retry(
stop=tenacity.stop_after_attempt(10),
- wait=tenacity.wait_exponential(multiplier=2, min=4, max=600),
+ wait=tenacity.wait_exponential(multiplier=2, min=4, max=60),
)
async def _wait_until_alive(self):
+ logger.info('Reconnecting session')
+ container = self.docker_client.containers.get(self.container_name)
+ # print logs
+ _logs = container.logs(tail=10).decode('utf-8').split('\n')
+ # add indent
+ _logs = '\n'.join([f' |{log}' for log in _logs])
+ logger.info(
+ '\n'
+ + '-' * 30
+ + 'Container logs (last 10 lines):'
+ + '-' * 30
+ + f'\n{_logs}'
+ + '\n'
+ + '-' * 90
+ )
async with aiohttp.ClientSession() as session:
async with session.get(f'{self.api_url}/alive') as response:
if response.status == 200:
return
else:
- logger.error(
- f'Action execution API is not alive. Response: {response}'
- )
- raise RuntimeError(
- f'Action execution API is not alive. Response: {response}'
- )
+ msg = f'Action execution API is not alive. Response: {response}'
+ logger.error(msg)
+ raise RuntimeError(msg)
@property
def sandbox_workspace_dir(self):
return self.config.workspace_mount_path_in_sandbox
+ async def start_docker_container(self):
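+        """Start the container if it is not already running, polling its status for
+        up to the configured sandbox timeout."""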
+ try:
+ container = self.docker_client.containers.get(self.container_name)
+ logger.info('Container status: %s', container.status)
+ if container.status != 'running':
+ container.start()
+ logger.info('Container started')
+ elapsed = 0
+ while container.status != 'running':
+ await asyncio.sleep(1)
+ elapsed += 1
+ if elapsed > self.config.sandbox.timeout:
+ break
+ container = self.docker_client.containers.get(self.container_name)
+ except Exception:
+ logger.exception('Failed to start container')
+
async def close(self, close_client: bool = True):
if self.session is not None and not self.session.closed:
await self.session.close()
@@ -182,28 +263,29 @@ async def close(self, close_client: bool = True):
containers = self.docker_client.containers.list(all=True)
for container in containers:
try:
- if container.name.startswith(self.container_name_prefix):
+                # only remove the container this runtime created;
+                # otherwise all other containers with the same prefix would be removed,
+                # which would interfere with parallel evaluation
+ if container.name.startswith(self.container_name):
logs = container.logs(tail=1000).decode('utf-8')
logger.debug(
f'==== Container logs ====\n{logs}\n==== End of container logs ===='
)
- container.remove(force=True)
+ if self.persist_sandbox:
+ if not self.fast_boot:
+ container.stop()
+ else:
+ container.remove(force=True)
except docker.errors.NotFound:
pass
if close_client:
self.docker_client.close()
- async def on_event(self, event: Event) -> None:
- logger.info(f'EventStreamRuntime: on_event triggered: {event}')
- if isinstance(event, Action):
- logger.info(event, extra={'msg_type': 'ACTION'})
- observation = await self.run_action(event)
- # observation._cause = event.id # type: ignore[attr-defined]
- logger.info(observation, extra={'msg_type': 'OBSERVATION'})
- source = event.source if event.source else EventSource.AGENT
- await self.event_stream.add_event(observation, source)
-
- async def run_action(self, action: Action, timeout: int = 600) -> Observation:
+ async def run_action(self, action: Action) -> Observation:
+ # set timeout to default if not set
+ if action.timeout is None:
+ action.timeout = self.config.sandbox.timeout
+
async with self.action_semaphore:
if not action.runnable:
return NullObservation('')
@@ -215,13 +297,18 @@ async def run_action(self, action: Action, timeout: int = 600) -> Observation:
f'Action {action_type} is not supported in the current runtime.'
)
+ logger.info('Awaiting session')
session = await self._ensure_session()
await self._wait_until_alive()
+
+ assert action.timeout is not None
+
try:
+ logger.info(f'Executing action {action}')
async with session.post(
f'{self.api_url}/execute_action',
json={'action': event_to_dict(action)},
- timeout=timeout,
+ timeout=action.timeout,
) as response:
if response.status == 200:
output = await response.json()
@@ -240,7 +327,6 @@ async def run_action(self, action: Action, timeout: int = 600) -> Observation:
except Exception as e:
logger.error(f'Error during command execution: {e}')
obs = ErrorObservation(f'Command execution failed: {str(e)}')
- obs._parent = action.id # type: ignore[attr-defined]
return obs
async def run(self, action: CmdRunAction) -> Observation:
@@ -261,22 +347,83 @@ async def browse(self, action: BrowseURLAction) -> Observation:
async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
return await self.run_action(action)
- ############################################################################
- # Keep the same with other runtimes
- ############################################################################
+ # ====================================================================
+ # Implement these methods (for file operations) in the subclass
+ # ====================================================================
- def get_working_directory(self):
- raise NotImplementedError(
- 'This method is not implemented in the runtime client.'
- )
+ async def copy_to(
+ self, host_src: str, sandbox_dest: str, recursive: bool = False
+ ) -> None:
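+        """Copy `host_src` from the host into the sandbox at `sandbox_dest` via the
+        runtime client's /upload_file endpoint; directories are zipped first when
+        `recursive` is true."""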
+ if not os.path.exists(host_src):
+ raise FileNotFoundError(f'Source file {host_src} does not exist')
+
+ session = await self._ensure_session()
+ await self._wait_until_alive()
+ try:
+ if recursive:
+ # For recursive copy, create a zip file
+ with tempfile.NamedTemporaryFile(
+ suffix='.zip', delete=False
+ ) as temp_zip:
+ temp_zip_path = temp_zip.name
+
+ with ZipFile(temp_zip_path, 'w') as zipf:
+ for root, _, files in os.walk(host_src):
+ for file in files:
+ file_path = os.path.join(root, file)
+ arcname = os.path.relpath(
+ file_path, os.path.dirname(host_src)
+ )
+ zipf.write(file_path, arcname)
+
+ upload_data = {'file': open(temp_zip_path, 'rb')}
+ else:
+ # For single file copy
+ upload_data = {'file': open(host_src, 'rb')}
- ############################################################################
- # Initialization work inside sandbox image
- ############################################################################
+ params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
- # init_runtime_tools direcctly do as what Runtime do
+ async with session.post(
+ f'{self.api_url}/upload_file', data=upload_data, params=params
+ ) as response:
+ if response.status == 200:
+ return
+ else:
+ error_message = await response.text()
+ raise Exception(f'Copy operation failed: {error_message}')
- # Do in the od_runtime_client
- # Overwrite the init_sandbox_plugins
- def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
- pass
+ except asyncio.TimeoutError:
+ raise TimeoutError('Copy operation timed out')
+ except Exception as e:
+ raise RuntimeError(f'Copy operation failed: {str(e)}')
+ finally:
+ if recursive:
+ os.unlink(temp_zip_path)
+ logger.info(f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}')
+
+ async def list_files(self, path: str | None = None) -> list[str]:
+ """List files in the sandbox.
+
+ If path is None, list files in the sandbox's initial working directory (e.g., /workspace).
+ """
+ session = await self._ensure_session()
+ await self._wait_until_alive()
+ try:
+ data = {}
+ if path is not None:
+ data['path'] = path
+
+ async with session.post(
+ f'{self.api_url}/list_files', json=data
+ ) as response:
+ if response.status == 200:
+ response_json = await response.json()
+ assert isinstance(response_json, list)
+ return response_json
+ else:
+ error_message = await response.text()
+ raise Exception(f'List files operation failed: {error_message}')
+ except asyncio.TimeoutError:
+ raise TimeoutError('List files operation timed out')
+ except Exception as e:
+ raise RuntimeError(f'List files operation failed: {str(e)}')
diff --git a/opendevin/runtime/docker/__init__.py b/opendevin/runtime/docker/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/opendevin/runtime/docker/local_box.py b/opendevin/runtime/docker/local_box.py
deleted file mode 100644
index 5da1707939f8..000000000000
--- a/opendevin/runtime/docker/local_box.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import atexit
-import os
-import subprocess
-import sys
-
-from opendevin.core.config import SandboxConfig
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.schema import CancellableStream
-from opendevin.runtime.sandbox import Sandbox
-
-# ===============================================================================
-# ** WARNING **
-#
-# This sandbox should only be used when OpenDevin is running inside a container
-#
-# Sandboxes are generally isolated so that they cannot affect the host machine.
-# This Sandbox implementation does not provide isolation, and can inadvertently
-# run dangerous commands on the host machine, potentially rendering the host
-# machine unusable.
-#
-# This sandbox is meant for use with OpenDevin Quickstart
-#
-# DO NOT USE THIS SANDBOX IN A PRODUCTION ENVIRONMENT
-# ===============================================================================
-
-
-class LocalBox(Sandbox):
- def __init__(
- self,
- config: SandboxConfig,
- workspace_base: str,
- ):
- self.config = config
- os.makedirs(workspace_base, exist_ok=True)
- self.workspace_base = workspace_base
- atexit.register(self.cleanup)
- super().__init__(config)
-
- def execute(
- self, cmd: str, stream: bool = False, timeout: int | None = None
- ) -> tuple[int, str | CancellableStream]:
- try:
- completed_process = subprocess.run(
- cmd,
- shell=True,
- text=True,
- capture_output=True,
- timeout=self.config.timeout,
- cwd=self.workspace_base,
- env=self._env,
- )
- return completed_process.returncode, completed_process.stdout.strip()
- except subprocess.TimeoutExpired:
- return -1, 'Command timed out'
-
- def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
- # mkdir -p sandbox_dest if it doesn't exist
- res = subprocess.run(
- f'mkdir -p {sandbox_dest}',
- shell=True,
- text=True,
- cwd=self.workspace_base,
- env=self._env,
- )
- if res.returncode != 0:
- raise RuntimeError(f'Failed to create directory {sandbox_dest} in sandbox')
-
- if recursive:
- res = subprocess.run(
- f'cp -r {host_src} {sandbox_dest}',
- shell=True,
- text=True,
- cwd=self.workspace_base,
- env=self._env,
- )
- if res.returncode != 0:
- raise RuntimeError(
- f'Failed to copy {host_src} to {sandbox_dest} in sandbox'
- )
- else:
- res = subprocess.run(
- f'cp {host_src} {sandbox_dest}',
- shell=True,
- text=True,
- cwd=self.workspace_base,
- env=self._env,
- )
- if res.returncode != 0:
- raise RuntimeError(
- f'Failed to copy {host_src} to {sandbox_dest} in sandbox'
- )
-
- def close(self):
- pass
-
- def cleanup(self):
- self.close()
-
- def get_working_directory(self):
- return self.workspace_base
-
-
-if __name__ == '__main__':
- local_box = LocalBox(SandboxConfig(), '/tmp/opendevin')
- sys.stdout.flush()
- try:
- while True:
- try:
- user_input = input('>>> ')
- except EOFError:
- logger.info('Exiting...')
- break
- if user_input.lower() == 'exit':
- logger.info('Exiting...')
- break
- exit_code, output = local_box.execute(user_input)
- logger.info('exit code: %d', exit_code)
- logger.info(output)
- sys.stdout.flush()
- except KeyboardInterrupt:
- logger.info('Exiting...')
- local_box.close()
diff --git a/opendevin/runtime/docker/ssh_box.py b/opendevin/runtime/docker/ssh_box.py
deleted file mode 100644
index d7650cc76fe5..000000000000
--- a/opendevin/runtime/docker/ssh_box.py
+++ /dev/null
@@ -1,683 +0,0 @@
-import atexit
-import os
-import re
-import sys
-import tarfile
-import tempfile
-import time
-import uuid
-from glob import glob
-
-import docker
-from pexpect import exceptions, pxssh
-from tenacity import retry, stop_after_attempt, wait_fixed
-
-from opendevin.core.config import SandboxConfig
-from opendevin.core.const.guide_url import TROUBLESHOOTING_URL
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.schema import CancellableStream
-from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
-from opendevin.runtime.plugins.requirement import PluginRequirement
-from opendevin.runtime.sandbox import Sandbox
-from opendevin.runtime.utils import find_available_tcp_port, split_bash_commands
-from opendevin.runtime.utils.image_agnostic import get_od_sandbox_image
-
-
-class SSHExecCancellableStream(CancellableStream):
- def __init__(self, ssh, cmd, timeout):
- super().__init__(self.read_output())
- self.ssh = ssh
- self.cmd = cmd
- self.timeout = timeout
-
- def close(self):
- self.closed = True
-
- def exit_code(self):
- marker = f'EXIT_CODE_MARKER_{uuid.uuid4().hex}'
- self.ssh.sendline(f'echo "{marker}$?{marker}"')
-
- if not self.ssh.prompt(timeout=self.timeout):
- return None # Timeout occurred
-
- output = self.ssh.before
- match = re.search(f'{marker}(\\d+){marker}', output)
-
- if match:
- try:
- return int(match.group(1))
- except ValueError:
- # Log the unexpected format
- logger.error(f'Unexpected exit code format: {match.group(1)}')
- return None
- else:
- # If we can't find our marked exit code, log the output and return None
- logger.error(f'Could not find exit code in output: {output}')
- return None
-
- def read_output(self):
- st = time.time()
- buf = ''
- crlf = '\r\n'
- lf = '\n'
- prompt_len = len(self.ssh.PROMPT)
- while True:
- try:
- if self.closed:
- break
- _output = self.ssh.read_nonblocking(timeout=1)
- if not _output:
- continue
-
- buf += _output
-
- if len(buf) < prompt_len:
- continue
-
- match = re.search(self.ssh.PROMPT, buf)
- if match:
- idx, _ = match.span()
- yield buf[:idx].replace(crlf, lf)
- buf = ''
- break
-
- res = buf[:-prompt_len]
- if len(res) == 0 or res.find(crlf) == -1:
- continue
- buf = buf[-prompt_len:]
- yield res.replace(crlf, lf)
- except exceptions.TIMEOUT:
- if time.time() - st < self.timeout:
- match = re.search(self.ssh.PROMPT, buf)
- if match:
- idx, _ = match.span()
- yield buf[:idx].replace(crlf, lf)
- break
- continue
- else:
- yield buf.replace(crlf, lf)
- break
- except exceptions.EOF:
- break
-
-
-class DockerSSHBox(Sandbox):
- instance_id: str
- container_image: str
- container_name_prefix = 'opendevin-sandbox-'
- container_name: str
- container: docker.models.containers.Container
- docker_client: docker.DockerClient
-
- _ssh_password: str
- _ssh_port: int
- ssh: pxssh.pxssh | None = None
-
- def __init__(
- self,
- config: SandboxConfig,
- persist_sandbox: bool,
- workspace_mount_path: str,
- sandbox_workspace_dir: str,
- cache_dir: str,
- run_as_devin: bool,
- ssh_hostname: str = 'host.docker.internal',
- ssh_password: str | None = None,
- ssh_port: int = 22,
- sid: str | None = None,
- ):
- self.config = config
- self.workspace_mount_path = workspace_mount_path
- self.sandbox_workspace_dir = sandbox_workspace_dir
- self.cache_dir = cache_dir
- self.use_host_network = config.use_host_network
- self.run_as_devin = run_as_devin
- logger.info(
- f'SSHBox is running as {"opendevin" if self.run_as_devin else "root"} user with USER_ID={config.user_id} in the sandbox'
- )
- # Initialize docker client. Throws an exception if Docker is not reachable.
- try:
- self.docker_client = docker.from_env()
- except Exception as ex:
- logger.exception(
- f'Error creating controller. Please check Docker is running and visit `{TROUBLESHOOTING_URL}` for more debugging information.',
- exc_info=False,
- )
- raise ex
-
- if persist_sandbox:
- if not self.run_as_devin:
- raise Exception(
- 'Persistent sandbox is currently designed for opendevin user only. Please set run_as_devin=True in your config.toml'
- )
- self.instance_id = 'persisted'
- else:
- self.instance_id = (sid or '') + str(uuid.uuid4())
-
- self.container_image = get_od_sandbox_image(
- config.container_image, self.docker_client
- )
- self.container_name = self.container_name_prefix + self.instance_id
-
- # set up random user password
- self.persist_sandbox = persist_sandbox
- self.ssh_hostname = ssh_hostname
- if persist_sandbox:
- if not ssh_password:
- raise ValueError('ssh_password is required for persistent sandbox')
- self._ssh_password = ssh_password
- self._ssh_port = ssh_port
- else:
- self._ssh_password = str(uuid.uuid4())
- self._ssh_port = find_available_tcp_port()
- try:
- docker.DockerClient().containers.get(self.container_name)
- self.is_initial_session = False
- except docker.errors.NotFound:
- self.is_initial_session = True
- logger.info('Detected initial session.')
- if not persist_sandbox or self.is_initial_session:
- logger.info('Creating new Docker container')
- n_tries = 5
- while n_tries > 0:
- try:
- self.restart_docker_container()
- break
- except Exception as e:
- logger.exception(
- 'Failed to start Docker container, retrying...', exc_info=False
- )
- n_tries -= 1
- if n_tries == 0:
- raise e
- time.sleep(5)
- self.setup_user()
- else:
- self.container = self.docker_client.containers.get(self.container_name)
- logger.info('Using existing Docker container')
- self.start_docker_container()
- try:
- self.start_ssh_session()
- except Exception as e:
- self.close()
- raise e
- time.sleep(1)
-
- # make sure /tmp always exists
- self.execute('mkdir -p /tmp')
- # set git config
- self.execute('git config --global user.name "OpenDevin"')
- self.execute('git config --global user.email "opendevin@all-hands.dev"')
- atexit.register(self.close)
- super().__init__(config)
-
- def setup_user(self):
- # Make users sudoers passwordless
- # TODO(sandbox): add this line in the Dockerfile for next minor version of docker image
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(
- f'Failed to make all users passwordless sudoers in sandbox: {logs}'
- )
-
- # Check if the opendevin user exists
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', 'id -u opendevin'],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code == 0:
- # User exists, delete it
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', 'userdel -r opendevin'],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(f'Failed to remove opendevin user in sandbox: {logs}')
-
- if self.run_as_devin:
- # Create the opendevin user
- exit_code, logs = self.container.exec_run(
- [
- '/bin/bash',
- '-c',
- f'useradd -rm -d /home/opendevin -s /bin/bash -g root -G sudo -u {self.config.user_id} opendevin',
- ],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(f'Failed to create opendevin user in sandbox: {logs}')
- exit_code, logs = self.container.exec_run(
- [
- '/bin/bash',
- '-c',
- f"echo 'opendevin:{self._ssh_password}' | chpasswd",
- ],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(f'Failed to set password in sandbox: {logs}')
-
- # chown the home directory
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', 'chown opendevin:root /home/opendevin'],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(
- f'Failed to chown home directory for opendevin in sandbox: {logs}'
- )
- # check the miniforge3 directory exist
- exit_code, logs = self.container.exec_run(
- [
- '/bin/bash',
- '-c',
- '[ -d "/opendevin/miniforge3" ] && exit 0 || exit 1',
- ],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- if exit_code == 1:
- raise Exception(
- 'OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image: docker pull ghcr.io/opendevin/sandbox:main'
- )
- else:
- raise Exception(
- f'An error occurred while checking if miniforge3 directory exists: {logs}'
- )
- exit_code, logs = self.container.exec_run(
- [
- '/bin/bash',
- '-c',
- f'chown opendevin:root {self.sandbox_workspace_dir}',
- ],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- # This is not a fatal error, just a warning
- logger.warning(
- f'Failed to chown workspace directory for opendevin in sandbox: {logs}. But this should be fine if the {self.sandbox_workspace_dir=} is mounted by the app docker container.'
- )
- else:
- exit_code, logs = self.container.exec_run(
- # change password for root
- ['/bin/bash', '-c', f"echo 'root:{self._ssh_password}' | chpasswd"],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(f'Failed to set password for root in sandbox: {logs}')
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', "echo 'opendevin-sandbox' > /etc/hostname"],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
-
- # Use the retry decorator, with a maximum of 5 attempts and a fixed wait time of 5 seconds between attempts
- @retry(stop=stop_after_attempt(5), wait=wait_fixed(5))
- def __ssh_login(self):
- try:
- self.ssh = pxssh.pxssh(
- echo=False,
- timeout=self.config.timeout,
- encoding='utf-8',
- codec_errors='replace',
- )
- hostname = self.ssh_hostname
- username = 'opendevin' if self.run_as_devin else 'root'
- if self.persist_sandbox:
- password_msg = 'using your SSH password'
- else:
- password_msg = f"using the password '{self._ssh_password}'"
- logger.info('Connecting to SSH session...')
- hostname_to_log = hostname.replace('host.docker.internal', 'localhost')
- ssh_cmd = f'`ssh -v -p {self._ssh_port} {username}@{hostname_to_log}`'
- logger.info(
- f'You can debug the SSH connection by running: {ssh_cmd} {password_msg}'
- )
- self.ssh.login(hostname, username, self._ssh_password, port=self._ssh_port)
- logger.info('Connected to SSH session')
- except pxssh.ExceptionPxssh as e:
- logger.exception(
- 'Failed to login to SSH session, retrying...', exc_info=False
- )
- raise e
-
- def start_ssh_session(self):
- time.sleep(1)
- self.__ssh_login()
- assert self.ssh is not None
-
- # Fix: https://github.com/pexpect/pexpect/issues/669
- self.ssh.sendline("bind 'set enable-bracketed-paste off'")
- self.ssh.prompt()
- time.sleep(1)
-
- # cd to workspace
- self.ssh.sendline(f'cd {self.sandbox_workspace_dir}')
- self.ssh.prompt()
-
- def get_exec_cmd(self, cmd: str) -> list[str]:
- if self.run_as_devin:
- return ['su', 'opendevin', '-c', cmd]
- else:
- return ['/bin/bash', '-c', cmd]
-
- def _send_interrupt(
- self,
- cmd: str,
- prev_output: str = '',
- ignore_last_output: bool = False,
- ) -> tuple[int, str]:
- assert self.ssh is not None
- logger.exception(
- f'Command "{cmd}" timed out, killing process...', exc_info=False
- )
- # send a SIGINT to the process
- self.ssh.sendintr()
- self.ssh.prompt()
- command_output = prev_output
- if not ignore_last_output:
- command_output += '\n' + self.ssh.before
- return (
- -1,
- f'Command: "{cmd}" timed out. Sent SIGINT to the process: {command_output}',
- )
-
- def execute(
- self, cmd: str, stream: bool = False, timeout: int | None = None
- ) -> tuple[int, str | CancellableStream]:
- assert self.ssh is not None
- timeout = timeout or self.config.timeout
- commands = split_bash_commands(cmd)
- if len(commands) > 1:
- all_output = ''
- for command in commands:
- exit_code, output = self.execute(command)
- if all_output:
- all_output += '\r\n'
- all_output += str(output)
- if exit_code != 0:
- return exit_code, all_output
- return 0, all_output
-
- self.ssh.sendline(cmd)
- if stream:
- return 0, SSHExecCancellableStream(self.ssh, cmd, self.config.timeout)
- success = self.ssh.prompt(timeout=timeout)
- if not success:
- return self._send_interrupt(cmd)
- command_output = self.ssh.before
-
- # once out, make sure that we have *every* output, we while loop until we get an empty output
- while True:
- self.ssh.sendline('\n')
- timeout_not_reached = self.ssh.prompt(timeout=1)
- if not timeout_not_reached:
- logger.debug('TIMEOUT REACHED')
- break
- output = self.ssh.before
- if isinstance(output, str) and output.strip() == '':
- break
- command_output += output
- command_output = command_output.removesuffix('\r\n')
-
- # get the exit code
- self.ssh.sendline('echo $?')
- self.ssh.prompt()
- exit_code_str = self.ssh.before.strip()
- _start_time = time.time()
- while not exit_code_str:
- self.ssh.prompt(timeout=1)
- exit_code_str = self.ssh.before.strip()
- if time.time() - _start_time > timeout:
- return self._send_interrupt(
- cmd, command_output, ignore_last_output=True
- )
- cleaned_exit_code_str = exit_code_str.replace('echo $?', '').strip()
-
- try:
- exit_code = int(cleaned_exit_code_str)
- except ValueError:
- logger.error(f'Invalid exit code: {cleaned_exit_code_str}')
- # Handle the invalid exit code appropriately (e.g., raise an exception or set a default value)
- exit_code = -1 # or some other appropriate default value
-
- return exit_code, command_output
-
- def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
- # mkdir -p sandbox_dest if it doesn't exist
- exit_code, logs = self.container.exec_run(
- ['/bin/bash', '-c', f'mkdir -p {sandbox_dest}'],
- workdir=self.sandbox_workspace_dir,
- environment=self._env,
- )
- if exit_code != 0:
- raise Exception(
- f'Failed to create directory {sandbox_dest} in sandbox: {logs}'
- )
-
- # use temp directory to store the tar file to avoid
- # conflict of filename when running multi-processes
- with tempfile.TemporaryDirectory() as tmp_dir:
- if recursive:
- assert os.path.isdir(
- host_src
- ), 'Source must be a directory when recursive is True'
- files = glob(host_src + '/**/*', recursive=True)
- srcname = os.path.basename(host_src)
- tar_filename = os.path.join(tmp_dir, srcname + '.tar')
- with tarfile.open(tar_filename, mode='w') as tar:
- for file in files:
- tar.add(
- file,
- arcname=os.path.relpath(file, os.path.dirname(host_src)),
- )
- else:
- assert os.path.isfile(
- host_src
- ), 'Source must be a file when recursive is False'
- srcname = os.path.basename(host_src)
- tar_filename = os.path.join(tmp_dir, srcname + '.tar')
- with tarfile.open(tar_filename, mode='w') as tar:
- tar.add(host_src, arcname=srcname)
-
- with open(tar_filename, 'rb') as f:
- data = f.read()
- self.container.put_archive(os.path.dirname(sandbox_dest), data)
-
- def start_docker_container(self):
- try:
- container = self.docker_client.containers.get(self.container_name)
- logger.info('Container status: %s', container.status)
- if container.status != 'running':
- container.start()
- logger.info('Container started')
- elapsed = 0
- while container.status != 'running':
- time.sleep(1)
- elapsed += 1
- if elapsed > self.config.timeout:
- break
- container = self.docker_client.containers.get(self.container_name)
- except Exception:
- logger.exception('Failed to start container')
-
- def remove_docker_container(self):
- try:
- container = self.docker_client.containers.get(self.container_name)
- container.stop()
- logger.info('Container stopped')
- container.remove()
- logger.info('Container removed')
- elapsed = 0
- while container.status != 'exited':
- time.sleep(1)
- elapsed += 1
- if elapsed > self.config.timeout:
- break
- container = self.docker_client.containers.get(self.container_name)
- except docker.errors.NotFound:
- pass
-
- def get_working_directory(self):
- exit_code, result = self.execute('pwd')
- if exit_code != 0:
- raise Exception('Failed to get working directory')
- return str(result).strip()
-
- def is_container_running(self):
- try:
- container = self.docker_client.containers.get(self.container_name)
- if container.status == 'running':
- self.container = container
- return True
- return False
- except docker.errors.NotFound:
- return False
-
- @property
- def volumes(self):
- mount_dir = self.workspace_mount_path
- return {
- mount_dir: {'bind': self.sandbox_workspace_dir, 'mode': 'rw'},
- # mount cache directory to /home/opendevin/.cache for pip cache reuse
- self.cache_dir: {
- 'bind': (
- '/home/opendevin/.cache' if self.run_as_devin else '/root/.cache'
- ),
- 'mode': 'rw',
- },
- }
-
- def restart_docker_container(self):
- try:
- self.remove_docker_container()
- except docker.errors.DockerException as ex:
- logger.exception('Failed to remove container', exc_info=False)
- raise ex
-
- try:
- network_kwargs: dict[str, str | dict[str, int]] = {}
- if self.use_host_network:
- network_kwargs['network_mode'] = 'host'
- else:
- # FIXME: This is a temporary workaround for Windows where host network mode has bugs.
- # FIXME: Docker Desktop for Mac OS has experimental support for host network mode
- network_kwargs['ports'] = {f'{self._ssh_port}/tcp': self._ssh_port}
- logger.warning(
- (
- 'Using port forwarding till the enable host network mode of Docker is out of experimental mode.'
- 'Check the 897th issue on https://github.com/OpenDevin/OpenDevin/issues/ for more information.'
- )
- )
-
- # start the container
- logger.info(f'Mounting volumes: {self.volumes}')
- self.container = self.docker_client.containers.run(
- self.container_image,
- # allow root login
- command=f"/usr/sbin/sshd -D -p {self._ssh_port} -o 'PermitRootLogin=yes'",
- **network_kwargs,
- working_dir=self.sandbox_workspace_dir,
- name=self.container_name,
- detach=True,
- volumes=self.volumes,
- )
- logger.info('Container started')
- except Exception as ex:
- logger.exception('Failed to start container: ' + str(ex), exc_info=False)
- raise ex
-
- # wait for container to be ready
- elapsed = 0
- while self.container.status != 'running':
- if self.container.status == 'exited':
- logger.info('container exited')
- logger.info('container logs:')
- logger.info(self.container.logs())
- break
- time.sleep(1)
- elapsed += 1
- self.container = self.docker_client.containers.get(self.container_name)
- logger.info(
- f'waiting for container to start: {elapsed}, container status: {self.container.status}'
- )
- if elapsed > self.config.timeout:
- break
- if self.container.status != 'running':
- raise Exception('Failed to start container')
-
- # clean up the container, cannot do it in __del__ because the python interpreter is already shutting down
- def close(self):
- containers = self.docker_client.containers.list(all=True)
- for container in containers:
- try:
- if container.name.startswith(self.container_name):
- if self.persist_sandbox:
- container.stop()
- else:
- # only remove the container we created
- # otherwise all other containers with the same prefix will be removed
- # which will mess up with parallel evaluation
- container.remove(force=True)
- except docker.errors.NotFound:
- pass
- self.docker_client.close()
-
-
-if __name__ == '__main__':
- try:
- ssh_box = DockerSSHBox(
- config=SandboxConfig(),
- run_as_devin=False,
- workspace_mount_path='/path/to/workspace',
- cache_dir='/path/to/cache',
- sandbox_workspace_dir='/sandbox',
- persist_sandbox=False,
- )
- except Exception as e:
- logger.exception('Failed to start Docker container: %s', e)
- sys.exit(1)
-
- logger.info(
- "Interactive Docker container started. Type 'exit' or use Ctrl+C to exit."
- )
-
- # Initialize required plugins
- plugins: list[PluginRequirement] = [AgentSkillsRequirement(), JupyterRequirement()]
- ssh_box.init_plugins(plugins)
- logger.info(
- '--- AgentSkills COMMAND DOCUMENTATION ---\n'
- f'{AgentSkillsRequirement().documentation}\n'
- '---'
- )
-
- sys.stdout.flush()
- try:
- while True:
- try:
- user_input = input('$ ')
- except EOFError:
- logger.info('Exiting...')
- break
- if user_input.lower() == 'exit':
- logger.info('Exiting...')
- break
- exit_code, output = ssh_box.execute(user_input)
- logger.info('exit code: %d', exit_code)
- logger.info(output)
- sys.stdout.flush()
- except KeyboardInterrupt:
- logger.info('Exiting...')
- ssh_box.close()
diff --git a/opendevin/runtime/e2b/runtime.py b/opendevin/runtime/e2b/runtime.py
index 36d162b3d8d8..5d90408b5545 100644
--- a/opendevin/runtime/e2b/runtime.py
+++ b/opendevin/runtime/e2b/runtime.py
@@ -11,22 +11,26 @@
)
from opendevin.events.stream import EventStream
from opendevin.runtime import Sandbox
-from opendevin.runtime.server.files import insert_lines, read_lines
-from opendevin.runtime.server.runtime import ServerRuntime
+from opendevin.runtime.plugins import PluginRequirement
+from opendevin.runtime.runtime import Runtime
+from ..utils.files import insert_lines, read_lines
from .filestore import E2BFileStore
from .sandbox import E2BSandbox
-class E2BRuntime(ServerRuntime):
+class E2BRuntime(Runtime):
def __init__(
self,
config: AppConfig,
event_stream: EventStream,
sid: str = 'default',
+ plugins: list[PluginRequirement] | None = None,
sandbox: Sandbox | None = None,
):
- super().__init__(config, event_stream, sid, sandbox)
+ super().__init__(config, event_stream, sid, plugins)
+ if sandbox is None:
+ self.sandbox = E2BSandbox()
if not isinstance(self.sandbox, E2BSandbox):
raise ValueError('E2BRuntime requires an E2BSandbox')
self.file_store = E2BFileStore(self.sandbox.filesystem)
diff --git a/opendevin/runtime/plugins/__init__.py b/opendevin/runtime/plugins/__init__.py
index 9c4e01cc8a33..fac44a362544 100644
--- a/opendevin/runtime/plugins/__init__.py
+++ b/opendevin/runtime/plugins/__init__.py
@@ -1,19 +1,15 @@
# Requirements
from .agent_skills import AgentSkillsPlugin, AgentSkillsRequirement
from .jupyter import JupyterPlugin, JupyterRequirement
-from .mixin import PluginMixin
from .requirement import Plugin, PluginRequirement
-from .swe_agent_commands import SWEAgentCommandsRequirement
__all__ = [
'Plugin',
- 'PluginMixin',
'PluginRequirement',
'AgentSkillsRequirement',
'AgentSkillsPlugin',
'JupyterRequirement',
'JupyterPlugin',
- 'SWEAgentCommandsRequirement',
]
ALL_PLUGINS = {
diff --git a/opendevin/runtime/plugins/agent_skills/__init__.py b/opendevin/runtime/plugins/agent_skills/__init__.py
index 8b1a3c7335d5..e331e16096ae 100644
--- a/opendevin/runtime/plugins/agent_skills/__init__.py
+++ b/opendevin/runtime/plugins/agent_skills/__init__.py
@@ -1,4 +1,3 @@
-import os
from dataclasses import dataclass
from opendevin.runtime.plugins.agent_skills.agentskills import DOCUMENTATION
@@ -8,11 +7,6 @@
@dataclass
class AgentSkillsRequirement(PluginRequirement):
name: str = 'agent_skills'
- host_src: str = os.path.dirname(
- os.path.abspath(__file__)
- ) # The directory of this file (opendevin/runtime/plugins/agent_skills)
- sandbox_dest: str = '/opendevin/plugins/agent_skills'
- bash_script_path: str = 'setup.sh'
documentation: str = DOCUMENTATION
diff --git a/opendevin/runtime/plugins/agent_skills/agentskills.py b/opendevin/runtime/plugins/agent_skills/agentskills.py
index d6fdea9084eb..db45daef580c 100644
--- a/opendevin/runtime/plugins/agent_skills/agentskills.py
+++ b/opendevin/runtime/plugins/agent_skills/agentskills.py
@@ -21,6 +21,7 @@
import os
import re
import shutil
+import sys
import tempfile
from inspect import signature
from typing import Optional
@@ -41,30 +42,77 @@
WINDOW = 100
-ENABLE_AUTO_LINT = os.getenv('ENABLE_AUTO_LINT', 'false').lower() == 'true'
-
# This is also used in unit tests!
MSG_FILE_UPDATED = '[File updated (edited at line {line_number}). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]'
+
+# ==================================================================================================
# OPENAI
-OPENAI_API_KEY = os.getenv(
- 'OPENAI_API_KEY', os.getenv('SANDBOX_ENV_OPENAI_API_KEY', '')
-)
-OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL', 'https://api.openai.com/v1')
-OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-2024-05-13')
-MAX_TOKEN = os.getenv('MAX_TOKEN', 500)
+# TODO: Move this to EventStream Actions when EventStreamRuntime is fully implemented
+# NOTE: we need to read env vars inside these functions because they are set in
+# IPython AFTER agentskills is imported (as is the case for EventStreamRuntime)
+# ==================================================================================================
+def _get_openai_api_key():
+ return os.getenv('OPENAI_API_KEY', os.getenv('SANDBOX_ENV_OPENAI_API_KEY', ''))
+
+
+def _get_openai_base_url():
+ return os.getenv('OPENAI_BASE_URL', 'https://api.openai.com/v1')
+
+
+def _get_openai_model():
+ return os.getenv('OPENAI_MODEL', 'gpt-4o-2024-05-13')
-OPENAI_PROXY = f'{OPENAI_BASE_URL}/chat/completions'
-client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
+def _get_max_token():
+ return os.getenv('MAX_TOKEN', 500)
+
+
+def _get_openai_client():
+ client = OpenAI(api_key=_get_openai_api_key(), base_url=_get_openai_base_url())
+ return client
+
+
+# ==================================================================================================
# Define the decorator using the functionality of UpdatePwd
def update_pwd_decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
- old_pwd = os.getcwd()
jupyter_pwd = os.environ.get('JUPYTER_PWD', None)
+ try:
+ old_pwd = os.getcwd()
+ except FileNotFoundError:
+ import json
+ import subprocess
+
+ print(
+ f'DEBUGGING Environment variables: {json.dumps(dict(os.environ), indent=2)}'
+ )
+ if sys.platform != 'win32':
+ print(f'DEBUGGING User ID: {os.getuid()}, Group ID: {os.getgid()}')
+
+ out = subprocess.run(['pwd'], capture_output=True)
+ old_pwd = out.stdout.decode('utf-8').strip()
+ os.chdir(old_pwd)
+ print(f'DEBUGGING Change to working directory: {old_pwd}')
+
+ import tempfile
+
+ try:
+ tempfile.TemporaryFile(dir=old_pwd)
+ print(f'DEBUGGING Directory {old_pwd} is writable')
+ except Exception as e:
+ print(f'DEBUGGING Directory {old_pwd} is not writable: {str(e)}')
+
+ # ls -alh
+ out = subprocess.run(['ls', '-alh', old_pwd], capture_output=True)
+ print(
+ f'DEBUGGING OLD working directory contents: {out.stdout.decode("utf-8")}'
+ )
+ print(f'DEBUGGING Target JUPYTER pwd: {jupyter_pwd}')
+
if jupyter_pwd:
os.chdir(jupyter_pwd)
try:
@@ -506,7 +554,10 @@ def _edit_file_impl(
shutil.move(temp_file_path, src_abs_path)
# Handle linting
- if ENABLE_AUTO_LINT:
+    # NOTE: we need to read the env var inside this function
+    # because it is set AFTER agentskills is imported
+ enable_auto_lint = os.getenv('ENABLE_AUTO_LINT', 'false').lower() == 'true'
+ if enable_auto_lint:
# BACKUP the original file
original_file_backup_path = os.path.join(
os.path.dirname(file_name),
@@ -954,7 +1005,9 @@ def parse_audio(file_path: str, model: str = 'whisper-1') -> None:
try:
# TODO: record the COST of the API call
with open(file_path, 'rb') as audio_file:
- transcript = client.audio.translations.create(model=model, file=audio_file)
+ transcript = _get_openai_client().audio.translations.create(
+ model=model, file=audio_file
+ )
print(transcript.text)
except Exception as e:
@@ -975,10 +1028,10 @@ def parse_image(
# TODO: record the COST of the API call
try:
base64_image = _base64_img(file_path)
- response = client.chat.completions.create(
- model=OPENAI_MODEL,
+ response = _get_openai_client().chat.completions.create(
+ model=_get_openai_model(),
messages=_prepare_image_messages(task, base64_image),
- max_tokens=MAX_TOKEN,
+ max_tokens=_get_max_token(),
)
content = response.choices[0].message.content
print(content)
@@ -1021,10 +1074,10 @@ def parse_video(
print(f'Process the {file_path}, current No. {idx * frame_interval} frame...')
# TODO: record the COST of the API call
try:
- response = client.chat.completions.create(
- model=OPENAI_MODEL,
+ response = _get_openai_client().chat.completions.create(
+ model=_get_openai_model(),
messages=_prepare_image_messages(task, base64_frame),
- max_tokens=MAX_TOKEN,
+ max_tokens=_get_max_token(),
)
content = response.choices[0].message.content
@@ -1077,7 +1130,9 @@ def parse_pptx(file_path: str) -> None:
'parse_pptx',
]
-if OPENAI_API_KEY and OPENAI_BASE_URL:
+# This is called from OpenDevin's side
+# If SANDBOX_ENV_OPENAI_API_KEY is set, we will be able to use these tools in the sandbox environment
+if _get_openai_api_key() and _get_openai_base_url():
__all__ += ['parse_audio', 'parse_video', 'parse_image']
DOCUMENTATION = ''
diff --git a/opendevin/runtime/plugins/agent_skills/setup.sh b/opendevin/runtime/plugins/agent_skills/setup.sh
deleted file mode 100755
index 53e5cfe7df65..000000000000
--- a/opendevin/runtime/plugins/agent_skills/setup.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-set -e
-
-OPENDEVIN_PYTHON_INTERPRETER=/opendevin/miniforge3/bin/python
-# check if OPENDEVIN_PYTHON_INTERPRETER exists and it is usable
-if [ -z "$OPENDEVIN_PYTHON_INTERPRETER" ] || [ ! -x "$OPENDEVIN_PYTHON_INTERPRETER" ]; then
- echo "OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image!"
- exit 1
-fi
-
-# add agent_skills to PATH
-echo 'export PATH=/opendevin/plugins/agent_skills:$PATH' >> ~/.bashrc
-
-# add agent_skills to PYTHONPATH
-echo 'export PYTHONPATH=/opendevin/plugins/agent_skills:$PYTHONPATH' >> ~/.bashrc
-
-source ~/.bashrc
-
-$OPENDEVIN_PYTHON_INTERPRETER -m pip install flake8 python-docx PyPDF2 python-pptx pylatexenc openai opencv-python
-$OPENDEVIN_PYTHON_INTERPRETER -m pip install diskcache==5.6.3 grep-ast==0.3.2 tree-sitter==0.21.3 tree-sitter-languages==1.10.2
diff --git a/opendevin/runtime/plugins/jupyter/__init__.py b/opendevin/runtime/plugins/jupyter/__init__.py
index 1178a95fd0da..f9c33bfa19cc 100644
--- a/opendevin/runtime/plugins/jupyter/__init__.py
+++ b/opendevin/runtime/plugins/jupyter/__init__.py
@@ -1,10 +1,10 @@
-import os
import subprocess
import time
from dataclasses import dataclass
+from opendevin.core.logger import opendevin_logger as logger
from opendevin.events.action import Action, IPythonRunCellAction
-from opendevin.events.observation import IPythonRunCellObservation, Observation
+from opendevin.events.observation import IPythonRunCellObservation
from opendevin.runtime.plugins.requirement import Plugin, PluginRequirement
from opendevin.runtime.utils import find_available_tcp_port
@@ -14,48 +14,45 @@
@dataclass
class JupyterRequirement(PluginRequirement):
name: str = 'jupyter'
- host_src: str = os.path.dirname(
- os.path.abspath(__file__)
- ) # The directory of this file (opendevin/runtime/plugins/jupyter)
- sandbox_dest: str = '/opendevin/plugins/jupyter'
- bash_script_path: str = 'setup.sh'
-
- # ================================================================
- # Plugin methods, which will ONLY be used in the runtime client
- # running inside docker
- # ================================================================
class JupyterPlugin(Plugin):
name: str = 'jupyter'
- def initialize(self, kernel_id: str = 'opendevin-default'):
+ async def initialize(self, username: str, kernel_id: str = 'opendevin-default'):
self.kernel_gateway_port = find_available_tcp_port()
self.kernel_id = kernel_id
self.gateway_process = subprocess.Popen(
- [
- '/opendevin/miniforge3/bin/mamba',
- 'run',
- '-n',
- 'base',
- 'poetry',
- 'run',
- 'jupyter',
- 'kernelgateway',
- '--KernelGatewayApp.ip=0.0.0.0',
- f'--KernelGatewayApp.port={self.kernel_gateway_port}',
- ],
+ (
+ f"su - {username} -s /bin/bash << 'EOF'\n"
+ 'cd /opendevin/code\n'
+ 'export POETRY_VIRTUALENVS_PATH=/opendevin/poetry;\n'
+ 'export PYTHONPATH=/opendevin/code/opendevin/runtime/plugins/agent_skills:$PYTHONPATH;\n'
+ '/opendevin/miniforge3/bin/mamba run -n base '
+ 'poetry run jupyter kernelgateway '
+ '--KernelGatewayApp.ip=0.0.0.0 '
+ f'--KernelGatewayApp.port={self.kernel_gateway_port}\n'
+ 'EOF'
+ ),
stderr=subprocess.STDOUT,
+ shell=True,
)
# read stdout until the kernel gateway is ready
+ output = ''
while True and self.gateway_process.stdout is not None:
line = self.gateway_process.stdout.readline().decode('utf-8')
+ output += line
if 'at' in line:
break
time.sleep(1)
- print('Waiting for jupyter kernel gateway to start...')
+ logger.debug('Waiting for jupyter kernel gateway to start...')
+
+ logger.info(
+ f'Jupyter kernel gateway started at port {self.kernel_gateway_port}. Output: {output}'
+ )
- async def run(self, action: Action) -> Observation:
+ async def _run(self, action: Action) -> IPythonRunCellObservation:
+ """Internal method to run a code cell in the jupyter kernel."""
if not isinstance(action, IPythonRunCellAction):
raise ValueError(
f'Jupyter plugin only supports IPythonRunCellAction, but got {action}'
@@ -68,8 +65,12 @@ async def run(self, action: Action) -> Observation:
if not self.kernel.initialized:
await self.kernel.initialize()
- output = await self.kernel.execute(action.code)
+ output = await self.kernel.execute(action.code, timeout=action.timeout)
return IPythonRunCellObservation(
content=output,
code=action.code,
)
+
+ async def run(self, action: Action) -> IPythonRunCellObservation:
+ obs = await self._run(action)
+ return obs
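The plugin above now launches the kernel gateway as a specific user by piping a heredoc through `su`, then polls the process's stdout until the gateway prints the address it is listening at. An illustrative, self-contained version of that launch-and-wait pattern (with made-up username, port, and a simplified command, not the exact production invocation) looks roughly like this:

```python
# Launch a long-running process as another user via `su` + a heredoc,
# then block until its stdout contains a readiness marker.
import subprocess

username = 'opendevin'  # assumed example user
port = 12345            # assumed example port

proc = subprocess.Popen(
    (
        f"su - {username} -s /bin/bash << 'EOF'\n"
        'jupyter kernelgateway --KernelGatewayApp.ip=0.0.0.0 '
        f'--KernelGatewayApp.port={port}\n'
        'EOF'
    ),
    shell=True,  # required: the heredoc is interpreted by the shell
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)

assert proc.stdout is not None
while True:
    line = proc.stdout.readline().decode('utf-8')
    if not line:  # process exited before printing the marker
        break
    if 'at' in line:  # gateway prints the URL it is listening at
        break
```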
diff --git a/opendevin/runtime/plugins/jupyter/execute_cli b/opendevin/runtime/plugins/jupyter/execute_cli
deleted file mode 100755
index 9637290e3887..000000000000
--- a/opendevin/runtime/plugins/jupyter/execute_cli
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-# Run the Python script with the specified interpreter
-export JUPYTER_PWD=$(pwd)
-$OPENDEVIN_PYTHON_INTERPRETER /opendevin/plugins/jupyter/execute_cli.py
diff --git a/opendevin/runtime/plugins/jupyter/execute_cli.py b/opendevin/runtime/plugins/jupyter/execute_cli.py
deleted file mode 100755
index 0b77653e12b5..000000000000
--- a/opendevin/runtime/plugins/jupyter/execute_cli.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-import sys
-import time
-import traceback
-
-import requests
-
-# Read the Python code from STDIN
-code = sys.stdin.read()
-
-
-def execute_code(code, print_output=True):
- PORT = os.environ.get('JUPYTER_EXEC_SERVER_PORT')
- POST_URL = f'http://localhost:{PORT}/execute'
-
- # Set the default kernel ID
- kernel_id = 'default'
- output = ''
- for i in range(3):
- try:
- response = requests.post(
- POST_URL, json={'kernel_id': kernel_id, 'code': code}
- )
- output = response.text
- if '500: Internal Server Error' not in output:
- if print_output:
- print(output)
- break
- except requests.exceptions.ConnectionError:
- if i == 2:
- traceback.print_exc()
- time.sleep(2)
- else:
- if not output:
- with open('/opendevin/logs/jupyter_execute_server.log', 'r') as f:
- output = f.read()
- print('Failed to connect to the Jupyter server', output)
-
-
-if jupyter_pwd := os.environ.get('JUPYTER_PWD'):
- execute_code(
- f'import os\nos.environ["JUPYTER_PWD"] = "{jupyter_pwd}"\n', print_output=False
- )
-
-execute_code(code)
diff --git a/opendevin/runtime/plugins/jupyter/execute_server.py b/opendevin/runtime/plugins/jupyter/execute_server.py
index 1388875c9b5a..35824257479f 100755
--- a/opendevin/runtime/plugins/jupyter/execute_server.py
+++ b/opendevin/runtime/plugins/jupyter/execute_server.py
@@ -7,6 +7,7 @@
from uuid import uuid4
import tornado
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from tornado.escape import json_decode, json_encode, url_escape
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.ioloop import PeriodicCallback
@@ -73,8 +74,8 @@ async def initialize(self):
if os.path.exists('/opendevin/plugins/agent_skills/agentskills.py'):
self.tools_to_run.append('from agentskills import *')
for tool in self.tools_to_run:
- # logging.info(f'Tool initialized:\n{tool}')
- await self.execute(tool)
+ res = await self.execute(tool)
+ logging.info(f'Tool [{tool}] initialized:\n{res}')
self.initialized = True
async def _send_heartbeat(self):
@@ -134,13 +135,18 @@ async def _connect(self):
)
self.heartbeat_callback.start()
+ @retry(
+ retry=retry_if_exception_type(ConnectionRefusedError),
+ stop=stop_after_attempt(3),
+ wait=wait_fixed(2),
+ )
async def execute(self, code, timeout=120):
if not self.ws:
await self._connect()
msg_id = uuid4().hex
assert self.ws is not None
- self.ws.write_message(
+ res = await self.ws.write_message(
json_encode(
{
'header': {
@@ -164,6 +170,7 @@ async def execute(self, code, timeout=120):
}
)
)
+ logging.info(f'Executed code in jupyter kernel:\n{res}')
outputs = []
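The `@retry` decorator added to `execute()` above comes from the `tenacity` library: retry only on `ConnectionRefusedError`, give up after three attempts, and wait two seconds between tries. A tiny standalone usage sketch of that exact combination, with a deliberately failing placeholder function, is:

```python
from tenacity import (
    RetryError,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_fixed,
)


@retry(
    retry=retry_if_exception_type(ConnectionRefusedError),
    stop=stop_after_attempt(3),
    wait=wait_fixed(2),
)
def connect_once() -> None:
    # Placeholder standing in for the websocket connect/execute path.
    raise ConnectionRefusedError('kernel gateway not up yet')


try:
    connect_once()
except RetryError:
    # tenacity wraps the final failure in RetryError unless reraise=True is set.
    print('gave up after 3 attempts, 2 seconds apart')
```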
diff --git a/opendevin/runtime/plugins/jupyter/setup.sh b/opendevin/runtime/plugins/jupyter/setup.sh
deleted file mode 100755
index 765351880f11..000000000000
--- a/opendevin/runtime/plugins/jupyter/setup.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Hardcoded to use the Python interpreter from the OpenDevin runtime client
-OPENDEVIN_PYTHON_INTERPRETER=/opendevin/miniforge3/bin/python
-# check if OPENDEVIN_PYTHON_INTERPRETER exists and it is usable
-if [ -z "$OPENDEVIN_PYTHON_INTERPRETER" ] || [ ! -x "$OPENDEVIN_PYTHON_INTERPRETER" ]; then
- echo "OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image!"
- exit 1
-fi
-
-# Install dependencies
-$OPENDEVIN_PYTHON_INTERPRETER -m pip install jupyterlab notebook jupyter_kernel_gateway
-
-source ~/.bashrc
-# ADD /opendevin/plugins to PATH to make `jupyter_cli` available
-echo 'export PATH=$PATH:/opendevin/plugins/jupyter' >> ~/.bashrc
-export PATH=/opendevin/plugins/jupyter:$PATH
-
-# Temporarily add /opendevin/miniforge3/bin to PATH
-# will not persist after the end of the script
-# This fixes https://github.com/OpenDevin/OpenDevin/pull/2489#issuecomment-2223088169
-export PATH=/opendevin/miniforge3/bin:$PATH
-
-# if user name is `opendevin`, add '/home/opendevin/.local/bin' to PATH
-if [ "$USER" = "opendevin" ]; then
- echo 'export PATH=$PATH:/home/opendevin/.local/bin' >> ~/.bashrc
- echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
- export PATH=$PATH:/home/opendevin/.local/bin
- export PIP_CACHE_DIR=$HOME/.cache/pip
-fi
-# if user name is `root`, add '/root/.local/bin' to PATH
-if [ "$USER" = "root" ]; then
- echo 'export PATH=$PATH:/root/.local/bin' >> ~/.bashrc
- echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
- export PATH=$PATH:/root/.local/bin
- export PIP_CACHE_DIR=$HOME/.cache/pip
-
-fi
-
-# Run background process to start jupyter kernel gateway
-# write a bash function that finds a free port
-find_free_port() {
- local start_port="${1:-20000}"
- local end_port="${2:-65535}"
-
- for port in $(seq $start_port $end_port); do
- if ! ss -tuln | awk '{print $5}' | grep -q ":$port$"; then
- echo $port
- return
- fi
- done
-
- echo "No free ports found in the range $start_port to $end_port" >&2
- return 1
-}
-
-export JUPYTER_GATEWAY_PORT=$(find_free_port 20000 30000)
-$OPENDEVIN_PYTHON_INTERPRETER -m \
- jupyter kernelgateway --KernelGatewayApp.ip=0.0.0.0 --KernelGatewayApp.port=$JUPYTER_GATEWAY_PORT > /opendevin/logs/jupyter_kernel_gateway.log 2>&1 &
-
-export JUPYTER_GATEWAY_PID=$!
-echo "export JUPYTER_GATEWAY_PID=$JUPYTER_GATEWAY_PID" >> ~/.bashrc
-export JUPYTER_GATEWAY_KERNEL_ID="default"
-echo "export JUPYTER_GATEWAY_KERNEL_ID=$JUPYTER_GATEWAY_KERNEL_ID" >> ~/.bashrc
-echo "JupyterKernelGateway started with PID: $JUPYTER_GATEWAY_PID"
-
-# Start the jupyter_server
-export JUPYTER_EXEC_SERVER_PORT=$(find_free_port 30000 40000)
-echo "export JUPYTER_EXEC_SERVER_PORT=$JUPYTER_EXEC_SERVER_PORT" >> ~/.bashrc
-$OPENDEVIN_PYTHON_INTERPRETER /opendevin/plugins/jupyter/execute_server.py > /opendevin/logs/jupyter_execute_server.log 2>&1 &
-export JUPYTER_EXEC_SERVER_PID=$!
-echo "export JUPYTER_EXEC_SERVER_PID=$JUPYTER_EXEC_SERVER_PID" >> ~/.bashrc
-echo "Execution server started with PID: $JUPYTER_EXEC_SERVER_PID"
-
-# Wait until /opendevin/logs/jupyter_kernel_gateway.log contains "is available"
-while ! grep -q "at" /opendevin/logs/jupyter_kernel_gateway.log; do
- echo "Waiting for Jupyter kernel gateway to be available..."
- sleep 1
-done
-# Wait until /opendevin/logs/jupyter_execute_server.log contains "Jupyter kernel created for conversation"
-while ! grep -q "kernel created" /opendevin/logs/jupyter_execute_server.log; do
- echo "Waiting for Jupyter kernel to be created..."
- sleep 1
-done
-echo "Jupyter kernel ready."
diff --git a/opendevin/runtime/plugins/mixin.py b/opendevin/runtime/plugins/mixin.py
deleted file mode 100644
index 14088eb493fa..000000000000
--- a/opendevin/runtime/plugins/mixin.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-from typing import Protocol
-
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.schema import CancellableStream
-from opendevin.runtime.plugins.requirement import PluginRequirement
-
-
-class SandboxProtocol(Protocol):
- # https://stackoverflow.com/questions/51930339/how-do-i-correctly-add-type-hints-to-mixin-classes
-
- @property
- def initialize_plugins(self) -> bool: ...
-
- def execute(
- self, cmd: str, stream: bool = False
- ) -> tuple[int, str | CancellableStream]: ...
-
- def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False): ...
-
-
-def _source_bashrc(sandbox: SandboxProtocol):
- exit_code, output = sandbox.execute(
- 'source /opendevin/bash.bashrc && source ~/.bashrc'
- )
- if exit_code != 0:
- raise RuntimeError(
- f'Failed to source /opendevin/bash.bashrc and ~/.bashrc with exit code {exit_code} and output: {output}'
- )
- logger.info('Sourced /opendevin/bash.bashrc and ~/.bashrc successfully')
-
-
-class PluginMixin:
- """Mixin for Sandbox to support plugins."""
-
- def init_plugins(self: SandboxProtocol, requirements: list[PluginRequirement]):
- """Load a plugin into the sandbox."""
- if hasattr(self, 'plugin_initialized') and self.plugin_initialized:
- return
-
- if self.initialize_plugins:
- logger.info('Initializing plugins in the sandbox')
-
- # clean-up ~/.bashrc and touch ~/.bashrc
- exit_code, output = self.execute('rm -f ~/.bashrc && touch ~/.bashrc')
- if exit_code != 0:
- logger.warning(
- f'Failed to clean-up ~/.bashrc with exit code {exit_code} and output: {output}'
- )
-
- for requirement in requirements:
- # source bashrc file when plugin loads
- _source_bashrc(self)
-
- # copy over the files
- self.copy_to(
- requirement.host_src, requirement.sandbox_dest, recursive=True
- )
- logger.info(
- f'Copied files from [{requirement.host_src}] to [{requirement.sandbox_dest}] inside sandbox.'
- )
-
- # Execute the bash script
- abs_path_to_bash_script = os.path.join(
- requirement.sandbox_dest, requirement.bash_script_path
- )
- logger.info(
- f'Initializing plugin [{requirement.name}] by executing [{abs_path_to_bash_script}] in the sandbox.'
- )
- exit_code, output = self.execute(abs_path_to_bash_script, stream=True)
- if isinstance(output, CancellableStream):
- total_output = ''
- for line in output:
- # Removes any trailing whitespace, including \n and \r\n
- line = line.rstrip()
- # logger.debug(line)
- # Avoid text from lines running into each other
- total_output += line + ' '
- _exit_code = output.exit_code()
- output.close()
- if _exit_code != 0:
- raise RuntimeError(
- f'Failed to initialize plugin {requirement.name} with exit code {_exit_code} and output: {total_output.strip()}'
- )
- logger.info(f'Plugin {requirement.name} initialized successfully')
- else:
- if exit_code != 0:
- raise RuntimeError(
- f'Failed to initialize plugin {requirement.name} with exit code {exit_code} and output: {output}'
- )
- logger.info(f'Plugin {requirement.name} initialized successfully.')
- else:
- logger.info('Skipping plugin initialization in the sandbox')
-
- if len(requirements) > 0:
- _source_bashrc(self)
-
- self.plugin_initialized = True
diff --git a/opendevin/runtime/plugins/requirement.py b/opendevin/runtime/plugins/requirement.py
index 3971dad8c184..b6713fe2a3a5 100644
--- a/opendevin/runtime/plugins/requirement.py
+++ b/opendevin/runtime/plugins/requirement.py
@@ -14,7 +14,7 @@ class Plugin:
name: str
@abstractmethod
- def initialize(self):
+ async def initialize(self, username: str):
"""Initialize the plugin."""
pass
@@ -29,8 +29,3 @@ class PluginRequirement:
"""Requirement for a plugin."""
name: str
- # FOLDER/FILES to be copied to the sandbox
- host_src: str
- sandbox_dest: str
- # NOTE: bash_script_path should be relative to the `sandbox_dest` path
- bash_script_path: str
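With `host_src`, `sandbox_dest`, and `bash_script_path` removed, a plugin is now just a name plus an async `initialize(username)` that runs inside the runtime client. A hypothetical minimal plugin against this new interface might look like the following (the `Echo*` names are invented for illustration):

```python
# Toy plugin written against the new async interface shown above.
from dataclasses import dataclass

from opendevin.runtime.plugins.requirement import Plugin, PluginRequirement


@dataclass
class EchoRequirement(PluginRequirement):
    name: str = 'echo'


class EchoPlugin(Plugin):
    name: str = 'echo'

    async def initialize(self, username: str):
        # Nothing to set up for this toy plugin; real plugins start
        # processes here (see JupyterPlugin above).
        self.username = username
```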
diff --git a/opendevin/runtime/plugins/swe_agent_commands/__init__.py b/opendevin/runtime/plugins/swe_agent_commands/__init__.py
deleted file mode 100644
index 66316f36b20c..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/__init__.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-from dataclasses import dataclass, field
-
-from opendevin.runtime.plugins.requirement import PluginRequirement
-from opendevin.runtime.plugins.swe_agent_commands.parse_commands import (
- parse_command_file,
-)
-
-
-def _resolve_to_cur_dir(filename):
- return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
-
-
-def check_and_parse_command_file(filepath) -> str:
- if filepath is None:
- raise FileNotFoundError(f'File not found: {filepath}')
- return parse_command_file(filepath)
-
-
-DEFAULT_SCRIPT_FILEPATHS = [
- _resolve_to_cur_dir('defaults.sh'),
- _resolve_to_cur_dir('search.sh'),
- _resolve_to_cur_dir('edit_linting.sh'),
-]
-DEFAULT_DOCUMENTATION = ''.join(
- [
- check_and_parse_command_file(filepath)
- for filepath in DEFAULT_SCRIPT_FILEPATHS
- if filepath is not None
- ]
-)
-
-
-@dataclass
-class SWEAgentCommandsRequirement(PluginRequirement):
- name: str = 'swe_agent_commands'
- host_src: str = os.path.dirname(os.path.abspath(__file__))
- sandbox_dest: str = '/opendevin/plugins/swe_agent_commands'
- bash_script_path: str = 'setup_default.sh'
-
- scripts_filepaths: list[str | None] = field(
- default_factory=lambda: DEFAULT_SCRIPT_FILEPATHS
- )
- documentation: str = DEFAULT_DOCUMENTATION
-
-
-CURSOR_SCRIPT_FILEPATHS = [
- _resolve_to_cur_dir('cursors_defaults.sh'),
- _resolve_to_cur_dir('cursors_edit_linting.sh'),
- _resolve_to_cur_dir('search.sh'),
-]
-CURSOR_DOCUMENTATION = ''.join(
- [
- check_and_parse_command_file(filepath)
- for filepath in CURSOR_SCRIPT_FILEPATHS
- if filepath is not None
- ]
-)
-
-
-@dataclass
-class SWEAgentCursorCommandsRequirement(PluginRequirement):
- name: str = 'swe_agent_commands'
- host_src: str = os.path.dirname(os.path.abspath(__file__))
- sandbox_dest: str = '/opendevin/plugins/swe_agent_commands'
- bash_script_path: str = 'setup_cursor_mode.sh'
-
- scripts_filepaths: list[str | None] = field(
- default_factory=lambda: CURSOR_SCRIPT_FILEPATHS
- )
- documentation: str = CURSOR_DOCUMENTATION
diff --git a/opendevin/runtime/plugins/swe_agent_commands/_setup_cursor_mode_env.sh b/opendevin/runtime/plugins/swe_agent_commands/_setup_cursor_mode_env.sh
deleted file mode 100755
index 3f4e8bf9e020..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/_setup_cursor_mode_env.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-# Cursor Mode from SWE-Bench
-# https://github.com/princeton-nlp/SWE-agent/blob/ca54d5556b9db4f4f2be21f09530ce69a72c0305/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-last_5_history-1_demos.yaml
-export WINDOW=200;
-export OVERLAP=2;
-export CURRENT_LINE=0;
-export CURRENT_FILE='';
-export SEARCH_RESULTS=();
-export SEARCH_FILES=();
-export SEARCH_INDEX=0;
-export START_INDEX=0;
-export END_INDEX=0;
-export START_CURSOR=0;
-export END_CURSOR=0;
-export START_CURSOR_MARK='"<<<<< START CURSOR >>>>>"'; # these have to use double quotes
-export END_CURSOR_MARK='"<<<<< END CURSOR >>>>>"'; # these have to use double quotes
-
-state() {
- local working_dir="$PWD";
- if [ -z $CURRENT_FILE ]; then
- echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
- else
- echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
- fi
-};
diff --git a/opendevin/runtime/plugins/swe_agent_commands/_setup_default_env.sh b/opendevin/runtime/plugins/swe_agent_commands/_setup_default_env.sh
deleted file mode 100755
index fc0dbad7b870..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/_setup_default_env.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-# Default Mode from SWE-Bench
-# https://github.com/princeton-nlp/SWE-agent/blob/ca54d5556b9db4f4f2be21f09530ce69a72c0305/config/configs/default_sys-env_window100-detailed_cmd_format-last_5_history-1_demos.yaml
-export WINDOW=100;
-export OVERLAP=2;
-export CURRENT_LINE=0;
-export CURRENT_FILE='';
-export SEARCH_RESULTS=();
-export SEARCH_FILES=();
-export SEARCH_INDEX=0;
-
-state() {
- local working_dir="$PWD";
- if [ -z $CURRENT_FILE ]; then
- echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
- else
- echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
- fi
-};
diff --git a/opendevin/runtime/plugins/swe_agent_commands/_split_string b/opendevin/runtime/plugins/swe_agent_commands/_split_string
deleted file mode 100755
index 7525fdd2419e..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/_split_string
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python3
-import sys
-
-
-def print_flake8_output(input_string, show_line_numbers=False):
- for value in input_string.split('\n'):
- parts = value.split()
- if not show_line_numbers:
- print(f"- {' '.join(parts[1:])}")
- else:
- line_nums = ':'.join(parts[0].split(':')[1:])
- print(f"- {line_nums} {' '.join(parts[1:])}")
-
-
-if __name__ == '__main__':
- lint_output = sys.argv[1]
- print_flake8_output(lint_output)
diff --git a/opendevin/runtime/plugins/swe_agent_commands/cursors_defaults.sh b/opendevin/runtime/plugins/swe_agent_commands/cursors_defaults.sh
deleted file mode 100644
index 561900371b1e..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/cursors_defaults.sh
+++ /dev/null
@@ -1,306 +0,0 @@
-_reset_cursors() {
- export START_CURSOR=1
- export END_CURSOR=1
-}
-
-_constrain_cursors() {
- # constrain the cursors to be within the bounds of the file [0, total_lines+1]
- local total_lines=$(awk 'END {print NR}' "$CURRENT_FILE")
- total_lines=$((total_lines < 1 ? 1 : total_lines)) # if the file is empty, set total_lines to 1
- local start_line=$((CURRENT_LINE - WINDOW / 2))
- local end_line=$((CURRENT_LINE + WINDOW / 2))
- start_line=$((start_line < 1 ? 1 : start_line))
- end_line=$((end_line > total_lines ? total_lines : end_line))
- local warning_string=""
- if [ "$START_CURSOR" -lt "$start_line" ]; then
- warning_string+="START_CURSOR moved to $start_line\n"
- START_CURSOR=$start_line
- elif [ "$START_CURSOR" -gt "$end_line" ]; then
- START_CURSOR=$end_line
- warning_string+="START_CURSOR moved to $end_line\n"
- fi
- if [ "$END_CURSOR" -lt "$start_line" ]; then
- warning_string+="END_CURSOR moved to $start_line\n"
- END_CURSOR=$start_line
- elif [ "$END_CURSOR" -gt "$end_line" ]; then
- warning_string+="END_CURSOR moved to $end_line\n"
- END_CURSOR=$end_line
- fi
- export START_CURSOR END_CURSOR
- echo "$warning_string"
- echo $START_CURSOR $END_CURSOR
-}
-
-_print() {
- local cursor_warning=$(_constrain_cursors)
- local cursor_values=$(echo "$cursor_warning" | tail -n 1)
- cursor_warning=$(echo "$cursor_warning" | head -n -1)
- export START_CURSOR=$(echo "$cursor_values" | awk '{print $1}')
- export END_CURSOR=$(echo "$cursor_values" | awk '{print $2}')
- local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
- echo "[File: $(realpath "$CURRENT_FILE") ($total_lines lines total)]"
- local start_line=$((CURRENT_LINE - WINDOW / 2))
- local end_line=$((CURRENT_LINE + WINDOW / 2))
- start_line=$((start_line < 1 ? 1 : start_line))
- end_line=$((end_line > total_lines ? total_lines : end_line))
- local lines=()
- local i=0
- while IFS= read -r line; do
- lines[i++]="$line"
- done < <(awk -v start="$start_line" -v end="$end_line" 'NR>=start && NR<=end {print}' "$CURRENT_FILE")
- local num_lines=${#lines[@]}
- if [ $start_line -gt 1 ]; then
- echo "($((start_line - 1)) more lines above)"
- fi
- for ((i=0; i
-# @yaml
-# signature: set_cursors <start_line> <end_line>
-# docstring: sets the start and end cursors to the given line numbers
-# arguments:
-# start_line:
-# type: integer
-# description: the line number to set the start cursor to
-# required: true
-# end_line:
-# type: integer
-# description: the line number to set the end cursor to
-# required: true
-set_cursors() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- if [ $# -lt 2 ]
- then
- echo "Usage: set_cursors "
- return
- fi
- local start_line=$1
- local end_line=$2
- local re='^[0-9]+$'
- if ! [[ $start_line =~ $re ]]
- then
- echo "Usage: set_cursors "
- echo "Error: start_line must be a number"
- return
- fi
- if ! [[ $end_line =~ $re ]]
- then
- echo "Usage: set_cursors "
- echo "Error: end_line must be a number"
- return
- fi
- if [ $start_line -gt $end_line ]
- then
- echo "Usage: set_cursors "
- echo "Error: start_line must be less than or equal to end_line"
- return
- fi
- export START_CURSOR=$start_line
- export END_CURSOR=$end_line
- _print
-}
-
-# @yaml
-# signature: open <path> [<line_number>]
-# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be centered on that line
-# arguments:
-# path:
-# type: string
-# description: the path to the file to open
-# required: true
-# line_number:
-# type: integer
-# description: the line number to move the window to (if not provided, the window will start at the top of the file)
-# required: false
-open() {
- if [ -z "$1" ]
- then
- echo "Usage: open "
- return
- fi
- # Check if the second argument is provided
- if [ -n "$2" ]; then
- # Check if the provided argument is a valid number
- if ! [[ $2 =~ ^[0-9]+$ ]]; then
- echo "Usage: open []"
- echo "Error: must be a number"
- return # Exit if the line number is not valid
- fi
- local max_line=$(awk 'END {print NR}' $1)
- if [ $2 -gt $max_line ]; then
- echo "Warning: ($2) is greater than the number of lines in the file ($max_line)"
- echo "Warning: Setting to $max_line"
- local line_number=$(jq -n "$max_line") # Set line number to max if greater than max
- elif [ $2 -lt 1 ]; then
- echo "Warning: ($2) is less than 1"
- echo "Warning: Setting to 1"
- local line_number=$(jq -n "1") # Set line number to 1 if less than 1
- else
- local line_number=$(jq -n "$2") # Set line number if valid
- fi
- else
- local line_number=$(jq -n "$WINDOW/2") # Set default line number if not provided
- fi
-
- if [ -f "$1" ]; then
- export CURRENT_FILE=$(realpath $1)
- export CURRENT_LINE=$line_number
- _constrain_line
- _print
- else
- echo "File $1 not found"
- fi
-}
-
-# @yaml
-# signature: scroll_down
-# docstring: moves the window down {WINDOW} lines
-scroll_down() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: scroll_up
-# docstring: moves the window up {WINDOW} lines
-scroll_up() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: goto <line_number>
-# docstring: moves the window to show <line_number>
-# arguments:
-# line_number:
-# type: integer
-# description: the line number to move the window to
-# required: true
-goto() {
- if [ $# -gt 1 ]; then
- echo "goto allows only one line number at a time."
- return
- fi
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- if [ -z "$1" ]
- then
- echo "Usage: goto "
- return
- fi
- if ! [[ $1 =~ ^[0-9]+$ ]]
- then
- echo "Usage: goto "
- echo "Error: must be a number"
- return
- fi
- local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
- if [ $1 -gt $max_line ]
- then
- echo "Error: must be less than or equal to $max_line"
- return
- fi
- local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
- export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: create <filename>
-# docstring: creates and opens a new file with the given name
-# arguments:
-# filename:
-# type: string
-# description: the name of the file to create
-# required: true
-create() {
- if [ -z "$1" ]; then
- echo "Usage: create "
- return
- fi
-
- # Check if the file already exists
- if [ -e "$1" ]; then
- echo "Error: File '$1' already exists."
- open "$1"
- return
- fi
-
- # Create the file with an empty new line
- printf "\n" > "$1"
- # Use the existing open command to open the created file
- open "$1"
-}
-
-# @yaml
-# signature: submit
-# docstring: submits your current code and terminates the session
-submit() {
- cd $ROOT
-
- # Check if the patch file exists and is non-empty
- if [ -s "$SWE_CMD_WORK_DIR/test.patch" ]; then
- # Apply the patch in reverse
- git apply -R < "$SWE_CMD_WORK_DIR/test.patch"
- fi
-
- git add -A
- git diff --cached > model.patch
- echo "<>"
-}
diff --git a/opendevin/runtime/plugins/swe_agent_commands/cursors_edit_linting.sh b/opendevin/runtime/plugins/swe_agent_commands/cursors_edit_linting.sh
deleted file mode 100644
index a11f82f5aadd..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/cursors_edit_linting.sh
+++ /dev/null
@@ -1,100 +0,0 @@
-# @yaml
-# signature: |-
-# edit <<EOF
-# <replacement_text>
-# EOF
-# docstring: replaces *all* of the text between the START CURSOR and the END CURSOR with the replacement_text. The replacement text is delineated using heredoc syntax. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. To enter text at the beginning of the file, set START CURSOR and END CURSOR to 0. Use set_cursors to move the cursors around. Python files will be checked for syntax errors after the edit.
-# arguments:
-# replacement_text:
-# type: string
-# description: the text to replace the current selection with
-# required: true
-edit() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo 'No file is opened. Use the `open` command first.'
- return
- fi
- local start_line=$((START_CURSOR - 1))
- start_line=$((start_line < 0 ? 0 : start_line))
- local end_line=$((END_CURSOR))
- end_line=$((end_line < 0 ? 0 : end_line))
-
- local replacement=()
- while IFS= read -r line
- do
- replacement+=("$line")
- done
-
- local num_lines=${#replacement[@]}
- # Create a backup of the current file
- cp "$CURRENT_FILE" "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup"
- # Read the file line by line into an array
- mapfile -t lines < "$CURRENT_FILE"
- local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
- # Write the new stuff directly back into the original file
- printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
- # Run linter if enabled
- if [[ $CURRENT_FILE == *.py && -n "$ENABLE_AUTO_LINT" ]]; then
- lint_output=$(flake8 --isolated --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
- else
- # do nothing
- lint_output=""
- fi
- # if there is no output, then the file is good
- if [ -z "$lint_output" ]; then
- _constrain_line
- # set to START + num_lines - 1, unless num_lines is 0, then set to START
- export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
- export START_CURSOR=$START_CURSOR
- _print
- echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
- else
- echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
- echo ""
- echo "ERRORS:"
- _split_string "$lint_output"
- echo ""
-
- # Save original values
- original_current_line=$CURRENT_LINE
- original_window=$WINDOW
- original_end_cursor=$END_CURSOR
-
- # Update values
- export CURRENT_LINE=$(( (num_lines / 2) + start_line )) # Set to "center" of edit
- export WINDOW=$((num_lines + 10)) # Show +/- 5 lines around edit
- export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
-
- echo "This is how your edit would have looked if applied"
- echo "-------------------------------------------------"
- _constrain_line
- _print
- echo "-------------------------------------------------"
- echo ""
-
- # Restoring CURRENT_FILE to original contents.
- cp "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
-
- export CURRENT_LINE=$(( ((end_line - start_line) / 2) + start_line )) # Set to "center" of edit
- export WINDOW=$((end_line - start_line + 10))
- export END_CURSOR=$original_end_cursor
-
- echo "This is the original code before your edit"
- echo "-------------------------------------------------"
- _constrain_line
- _print
- echo "-------------------------------------------------"
-
- # Restore original values
- export CURRENT_LINE=$original_current_line
- export WINDOW=$original_window
- export END_CURSOR=$original_end_cursor
-
- echo "Your changes have NOT been applied. Please fix your edit command and try again."
- echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
- echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
- fi
- # Remove backup file
- rm -f "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup"
-}
diff --git a/opendevin/runtime/plugins/swe_agent_commands/defaults.sh b/opendevin/runtime/plugins/swe_agent_commands/defaults.sh
deleted file mode 100644
index 706368d72c5d..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/defaults.sh
+++ /dev/null
@@ -1,195 +0,0 @@
-_print() {
- local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
- echo "[File: $(realpath $CURRENT_FILE) ($total_lines lines total)]"
- lines_above=$(jq -n "$CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | floor')
- lines_below=$(jq -n "$total_lines - $CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | round')
- if [ $lines_above -gt 0 ]; then
- echo "($lines_above more lines above)"
- fi
- cat $CURRENT_FILE | grep -n $ | head -n $(jq -n "[$CURRENT_LINE + $WINDOW/2, $WINDOW/2] | max | floor") | tail -n $(jq -n "$WINDOW")
- if [ $lines_below -gt 0 ]; then
- echo "($lines_below more lines below)"
- fi
-}
-
-_constrain_line() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
- local half_window=$(jq -n "$WINDOW/2" | jq 'floor')
- export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $max_line - $half_window] | min")
- export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $half_window] | max")
-}
-
-# @yaml
-# signature: open <path> [<line_number>]
-# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line
-# arguments:
-# path:
-# type: string
-# description: the path to the file to open
-# required: true
-# line_number:
-# type: integer
-# description: the line number to move the window to (if not provided, the window will start at the top of the file)
-# required: false
-open() {
- if [ -z "$1" ]
- then
- echo "Usage: open "
- return
- fi
- # Check if the second argument is provided
- if [ -n "$2" ]; then
- # Check if the provided argument is a valid number
- if ! [[ $2 =~ ^[0-9]+$ ]]; then
- echo "Usage: open []"
- echo "Error: must be a number"
- return # Exit if the line number is not valid
- fi
- local max_line=$(awk 'END {print NR}' $1)
- if [ $2 -gt $max_line ]; then
- echo "Warning: ($2) is greater than the number of lines in the file ($max_line)"
- echo "Warning: Setting to $max_line"
- local line_number=$(jq -n "$max_line") # Set line number to max if greater than max
- elif [ $2 -lt 1 ]; then
- echo "Warning: ($2) is less than 1"
- echo "Warning: Setting to 1"
- local line_number=$(jq -n "1") # Set line number to 1 if less than 1
- else
- local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
- local line_number=$(jq -n "[$2 + $WINDOW/2 - $OFFSET, 1] | max | floor")
- fi
- else
- local line_number=$(jq -n "$WINDOW/2") # Set default line number if not provided
- fi
-
- if [ -f "$1" ]; then
- export CURRENT_FILE=$(realpath $1)
- export CURRENT_LINE=$line_number
- _constrain_line
- _print
- elif [ -d "$1" ]; then
- echo "Error: $1 is a directory. You can only open files. Use cd or ls to navigate directories."
- else
- echo "File $1 not found"
- fi
-}
-
-# @yaml
-# signature: goto <line_number>
-# docstring: moves the window to show <line_number>
-# arguments:
-# line_number:
-# type: integer
-# description: the line number to move the window to
-# required: true
-goto() {
- if [ $# -gt 1 ]; then
- echo "goto allows only one line number at a time."
- return
- fi
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- if [ -z "$1" ]
- then
- echo "Usage: goto "
- return
- fi
- if ! [[ $1 =~ ^[0-9]+$ ]]
- then
- echo "Usage: goto "
- echo "Error: must be a number"
- return
- fi
- local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
- if [ $1 -gt $max_line ]
- then
- echo "Error: must be less than or equal to $max_line"
- return
- fi
- local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
- export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: scroll_down
-# docstring: moves the window down {WINDOW} lines
-scroll_down() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: scroll_up
-# docstring: moves the window up {WINDOW} lines
-scroll_up() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo "No file open. Use the open command first."
- return
- fi
- export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
- _constrain_line
- _print
-}
-
-# @yaml
-# signature: create <filename>
-# docstring: creates and opens a new file with the given name
-# arguments:
-# filename:
-# type: string
-# description: the name of the file to create
-# required: true
-create() {
- if [ -z "$1" ]; then
- echo "Usage: create "
- return
- fi
-
- # Check if the file already exists
- if [ -e "$1" ]; then
- echo "Error: File '$1' already exists."
- open "$1"
- return
- fi
-
- # Create the file with an empty new line
- printf "\n" > "$1"
- # Use the existing open command to open the created file
- open "$1"
-}
-
-# @yaml
-# signature: submit
-# docstring: submits your current code and terminates the session
-submit() {
- cd $ROOT
-
- # Check if the patch file exists and is non-empty
- if [ -s "$SWE_CMD_WORK_DIR/test.patch" ]; then
- # Apply the patch in reverse
- git apply -R < "$SWE_CMD_WORK_DIR/test.patch"
- fi
-
- git add -A
- git diff --cached > model.patch
- echo "<>"
-}
diff --git a/opendevin/runtime/plugins/swe_agent_commands/edit_linting.sh b/opendevin/runtime/plugins/swe_agent_commands/edit_linting.sh
deleted file mode 100644
index 8341c3cd8c8e..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/edit_linting.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-# @yaml
-# signature: |-
-# edit <start_line>:<end_line> <<EOF
-# <replacement_text>
-# EOF
-# docstring: replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is delineated using heredoc syntax. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again. Remember, the file must be open before editing.
-# arguments:
-# start_line:
-# type: integer
-# description: the line number to start the edit at
-# required: true
-# end_line:
-# type: integer
-# description: the line number to end the edit at (inclusive)
-# required: true
-# replacement_text:
-# type: string
-# description: the text to replace the current selection with
-# required: true
-edit() {
- if [ -z "$CURRENT_FILE" ]
- then
- echo 'No file open. Use the `open` command first.'
- return
- fi
-
- local start_line="$(echo $1: | cut -d: -f1)"
- local end_line="$(echo $1: | cut -d: -f2)"
-
- if [ -z "$start_line" ] || [ -z "$end_line" ]
- then
- echo "Usage: edit :"
- return
- fi
-
- local re='^[0-9]+$'
- if ! [[ $start_line =~ $re ]]; then
- echo "Usage: edit :"
- echo "Error: start_line must be a number"
- return
- fi
- if ! [[ $end_line =~ $re ]]; then
- echo "Usage: edit :"
- echo "Error: end_line must be a number"
- return
- fi
-
- # Bash array starts at 0, so let's adjust
- local start_line=$((start_line - 1))
- local end_line=$((end_line))
-
- local line_count=0
- local replacement=()
- while IFS= read -r line
- do
- replacement+=("$line")
- ((line_count++))
- done
-
- # Create a backup of the current file
- cp "$CURRENT_FILE" "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup"
-
- # Read the file line by line into an array
- mapfile -t lines < "$CURRENT_FILE"
- local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
- # Write the new stuff directly back into the original file
- printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
-
- # Run linter if enabled
- if [[ $CURRENT_FILE == *.py && -n "$ENABLE_AUTO_LINT" ]]; then
- lint_output=$(flake8 --isolated --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
- else
- # do nothing
- lint_output=""
- fi
-
- # if there is no output, then the file is good
- if [ -z "$lint_output" ]; then
- export CURRENT_LINE=$start_line
- _constrain_line
- _print
-
- echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
- else
- echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
- echo ""
- echo "ERRORS:"
- _split_string "$lint_output"
- echo ""
-
- # Save original values
- original_current_line=$CURRENT_LINE
- original_window=$WINDOW
-
- # Update values
- export CURRENT_LINE=$(( (line_count / 2) + start_line )) # Set to "center" of edit
- export WINDOW=$((line_count + 10)) # Show +/- 5 lines around edit
-
- echo "This is how your edit would have looked if applied"
- echo "-------------------------------------------------"
- _constrain_line
- _print
- echo "-------------------------------------------------"
- echo ""
-
- # Restoring CURRENT_FILE to original contents.
- cp "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
-
- export CURRENT_LINE=$(( ((end_line - start_line + 1) / 2) + start_line ))
- export WINDOW=$((end_line - start_line + 10))
-
- echo "This is the original code before your edit"
- echo "-------------------------------------------------"
- _constrain_line
- _print
- echo "-------------------------------------------------"
-
- # Restore original values
- export CURRENT_LINE=$original_current_line
- export WINDOW=$original_window
-
- echo "Your changes have NOT been applied. Please fix your edit command and try again."
- echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
- echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
- fi
-
- # Remove backup file
- rm -f "$SWE_CMD_WORK_DIR/$(basename "$CURRENT_FILE")_backup"
-}
diff --git a/opendevin/runtime/plugins/swe_agent_commands/parse_commands.py b/opendevin/runtime/plugins/swe_agent_commands/parse_commands.py
deleted file mode 100644
index 49aa9489646c..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/parse_commands.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from dataclasses import dataclass
-
-import yaml
-
-
-@dataclass()
-class Command:
- name: str
- docstring: str | None = None
- signature: str | None = None
-
-
-def parse_command_file(filepath: str) -> str:
- content = open(filepath, 'r').read()
- lines = content.split('\n')
- commands: list[Command] = []
- idx = 0
- docs: list[str] = []
- while idx < len(lines):
- line = lines[idx]
- idx += 1
- if line.startswith('# '):
- docs.append(line[2:])
- elif line.strip().endswith('() {'):
- name = line.split()[0][:-2]
- while lines[idx].strip() != '}':
- idx += 1
- docstring, signature = None, name
- docs_dict = yaml.safe_load('\n'.join(docs).replace('@yaml', ''))
- if docs_dict is not None:
- docstring = docs_dict.get('docstring')
- arguments = docs_dict.get('arguments', None)
- if 'signature' in docs_dict:
- signature = docs_dict['signature']
- else:
- if arguments is not None:
- for param, settings in arguments.items():
- if 'required' in settings:
- signature += f' <{param}>'
- else:
- signature += f' [<{param}>]'
- command = Command(name, docstring, signature)
- commands.append(command)
- docs = []
- function_docs = ''
- for cmd in commands:
- if cmd.docstring is not None:
- function_docs += f'{cmd.signature or cmd.name} - {cmd.docstring}\n'
- return function_docs
-
-
-if __name__ == '__main__':
- import sys
-
- if len(sys.argv) < 2:
- print('Usage: python parse_commands.py <filepath>')
- sys.exit(1)
- filepath = sys.argv[1]
- filepaths = filepath.split(',')
- for filepath in filepaths:
- docs = parse_command_file(filepath)
- print(docs)
diff --git a/opendevin/runtime/plugins/swe_agent_commands/search.sh b/opendevin/runtime/plugins/swe_agent_commands/search.sh
deleted file mode 100644
index c09566bdcb1c..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/search.sh
+++ /dev/null
@@ -1,155 +0,0 @@
-# @yaml
-# signature: search_dir <search_term> [<dir>]
-# docstring: searches for search_term in all files in dir. If dir is not provided, searches in the current directory
-# arguments:
-# search_term:
-# type: string
-# description: the term to search for
-# required: true
-# dir:
-# type: string
-# description: the directory to search in (if not provided, searches in the current directory)
-# required: false
-search_dir() {
- if [ $# -eq 1 ]; then
- local search_term="$1"
- local dir="./"
- elif [ $# -eq 2 ]; then
- local search_term="$1"
- if [ -d "$2" ]; then
- local dir="$2"
- else
- echo "Directory $2 not found"
- return
- fi
- else
- echo "Usage: search_dir []"
- return
- fi
- dir=$(realpath "$dir")
- local matches=$(find "$dir" -type f ! -path '*/.*' -exec grep -nIH -- "$search_term" {} + | cut -d: -f1 | sort | uniq -c)
- # if no matches, return
- if [ -z "$matches" ]; then
- echo "No matches found for \"$search_term\" in $dir"
- return
- fi
- # Calculate total number of matches
- local num_matches=$(echo "$matches" | awk '{sum+=$1} END {print sum}')
- # calculate total number of files matched
- local num_files=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
- # if num_files is > 100, print an error
- if [ $num_files -gt 100 ]; then
- echo "More than $num_files files matched for \"$search_term\" in $dir. Please narrow your search."
- return
- fi
-
- echo "Found $num_matches matches for \"$search_term\" in $dir:"
- echo "$matches" | awk '{$2=$2; gsub(/^\.+\/+/, "./", $2); print $2 " ("$1" matches)"}'
- echo "End of matches for \"$search_term\" in $dir"
-}
-
-# @yaml
-# signature: search_file <search_term> [<file>]
-# docstring: searches for search_term in file. If file is not provided, searches in the current open file
-# arguments:
-# search_term:
-# type: string
-# description: the term to search for
-# required: true
-# file:
-# type: string
-# description: the file to search in (if not provided, searches in the current open file)
-# required: false
-search_file() {
- # Check if the first argument is provided
- if [ -z "$1" ]; then
- echo "Usage: search_file []"
- return
- fi
- # Check if the second argument is provided
- if [ -n "$2" ]; then
- # Check if the provided argument is a valid file
- if [ -f "$2" ]; then
- local file="$2" # Set file if valid
- else
- echo "Usage: search_file []"
- echo "Error: File name $2 not found. Please provide a valid file name."
- return # Exit if the file is not valid
- fi
- else
- # Check if a file is open
- if [ -z "$CURRENT_FILE" ]; then
- echo "No file open. Use the open command first."
- return # Exit if no file is open
- fi
- local file="$CURRENT_FILE" # Set file to the current open file
- fi
- local search_term="$1"
- file=$(realpath "$file")
- # Use grep to directly get the desired formatted output
- local matches=$(grep -nH -- "$search_term" "$file")
- # Check if no matches were found
- if [ -z "$matches" ]; then
- echo "No matches found for \"$search_term\" in $file"
- return
- fi
- # Calculate total number of matches
- local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
-
- # calculate total number of lines matched
- local num_lines=$(echo "$matches" | cut -d: -f1 | sort | uniq | wc -l | awk '{$1=$1; print $0}')
- # if num_lines is > 100, print an error
- if [ $num_lines -gt 100 ]; then
- echo "More than $num_lines lines matched for \"$search_term\" in $file. Please narrow your search."
- return
- fi
-
- # Print the total number of matches and the matches themselves
- echo "Found $num_matches matches for \"$search_term\" in $file:"
- echo "$matches" | cut -d: -f1-2 | sort -u -t: -k2,2n | while IFS=: read -r filename line_number; do
- echo "Line $line_number:$(sed -n "${line_number}p" "$file")"
- done
- echo "End of matches for \"$search_term\" in $file"
-}
-
-# @yaml
-# signature: find_file <file_name> [<dir>]
-# docstring: finds all files with the given name in dir. If dir is not provided, searches in the current directory
-# arguments:
-# file_name:
-# type: string
-# description: the name of the file to search for
-# required: true
-# dir:
-# type: string
-# description: the directory to search in (if not provided, searches in the current directory)
-# required: false
-find_file() {
- if [ $# -eq 1 ]; then
- local file_name="$1"
- local dir="./"
- elif [ $# -eq 2 ]; then
- local file_name="$1"
- if [ -d "$2" ]; then
- local dir="$2"
- else
- echo "Directory $2 not found"
- return
- fi
- else
- echo "Usage: find_file []"
- return
- fi
-
- dir=$(realpath "$dir")
- local matches=$(find "$dir" -type f -name "$file_name")
- # if no matches, return
- if [ -z "$matches" ]; then
- echo "No matches found for \"$file_name\" in $dir"
- return
- fi
- # Calculate total number of matches
- local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
- echo "Found $num_matches matches for \"$file_name\" in $dir:"
- echo "$matches" | awk '{print $0}'
-}
diff --git a/opendevin/runtime/plugins/swe_agent_commands/setup_cursor_mode.sh b/opendevin/runtime/plugins/swe_agent_commands/setup_cursor_mode.sh
deleted file mode 100755
index 8788e3947d73..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/setup_cursor_mode.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-export PIP_CACHE_DIR=$HOME/.cache/pip
-pip install flake8
-
-# Cursor Mode from SWE-Bench
-# https://github.com/princeton-nlp/SWE-agent/blob/ca54d5556b9db4f4f2be21f09530ce69a72c0305/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-last_5_history-1_demos.yaml#L108-L111
-echo 'source /opendevin/plugins/swe_agent_commands/_setup_cursor_mode_env.sh' >> ~/.bashrc
-
-# make _split_string (py) available
-echo 'export PATH=$PATH:/opendevin/plugins/swe_agent_commands' >> ~/.bashrc
-
-echo 'source /opendevin/plugins/swe_agent_commands/cursors_defaults.sh' >> ~/.bashrc
-echo 'source /opendevin/plugins/swe_agent_commands/cursors_edit_linting.sh' >> ~/.bashrc
-echo 'source /opendevin/plugins/swe_agent_commands/search.sh' >> ~/.bashrc
-
-echo 'export SWE_CMD_WORK_DIR="/opendevin/plugins/swe_agent_commands/workdir"' >> ~/.bashrc
-sudo mkdir -p /opendevin/plugins/swe_agent_commands/workdir
-sudo chmod 777 /opendevin/plugins/swe_agent_commands/workdir
diff --git a/opendevin/runtime/plugins/swe_agent_commands/setup_default.sh b/opendevin/runtime/plugins/swe_agent_commands/setup_default.sh
deleted file mode 100755
index 465a83e12f9a..000000000000
--- a/opendevin/runtime/plugins/swe_agent_commands/setup_default.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-export PIP_CACHE_DIR=$HOME/.cache/pip
-pip install flake8
-
-# Default Mode from SWE-Bench
-# https://github.com/princeton-nlp/SWE-agent/blob/ca54d5556b9db4f4f2be21f09530ce69a72c0305/config/configs/default_sys-env_window100-detailed_cmd_format-last_5_history-1_demos.yaml#L103-L106
-echo 'source /opendevin/plugins/swe_agent_commands/_setup_default_env.sh' >> ~/.bashrc
-
-# make _split_string (py) available
-echo 'export PATH=$PATH:/opendevin/plugins/swe_agent_commands' >> ~/.bashrc
-
-echo 'source /opendevin/plugins/swe_agent_commands/defaults.sh' >> ~/.bashrc
-echo 'source /opendevin/plugins/swe_agent_commands/search.sh' >> ~/.bashrc
-echo 'source /opendevin/plugins/swe_agent_commands/edit_linting.sh' >> ~/.bashrc
-
-echo 'export SWE_CMD_WORK_DIR="/opendevin/plugins/swe_agent_commands/workdir"' >> ~/.bashrc
-sudo mkdir -p /opendevin/plugins/swe_agent_commands/workdir
-sudo chmod 777 /opendevin/plugins/swe_agent_commands/workdir
diff --git a/opendevin/runtime/runtime.py b/opendevin/runtime/runtime.py
index 0be6d97d0af8..688227cb1bab 100644
--- a/opendevin/runtime/runtime.py
+++ b/opendevin/runtime/runtime.py
@@ -4,11 +4,10 @@
import json
import os
from abc import abstractmethod
-from typing import Any, Optional
from opendevin.core.config import AppConfig, SandboxConfig
from opendevin.core.logger import opendevin_logger as logger
-from opendevin.events import EventStream, EventStreamSubscriber
+from opendevin.events import EventSource, EventStream, EventStreamSubscriber
from opendevin.events.action import (
Action,
ActionConfirmationStatus,
@@ -25,21 +24,19 @@
ErrorObservation,
NullObservation,
Observation,
- RejectObservation,
+ UserRejectObservation,
)
from opendevin.events.serialization.action import ACTION_TYPE_TO_CLASS
-from opendevin.runtime.plugins import PluginRequirement
-from opendevin.runtime.tools import RuntimeTool
-from opendevin.storage import FileStore
+from opendevin.runtime.plugins import JupyterRequirement, PluginRequirement
-def _default_env_vars(config: SandboxConfig) -> dict[str, str]:
+def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
ret = {}
for key in os.environ:
if key.startswith('SANDBOX_ENV_'):
sandbox_key = key.removeprefix('SANDBOX_ENV_')
ret[sandbox_key] = os.environ[key]
- if config.enable_auto_lint:
+ if sandbox_config.enable_auto_lint:
ret['ENABLE_AUTO_LINT'] = 'true'
return ret
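As `_default_env_vars` above shows, any host environment variable prefixed with `SANDBOX_ENV_` is forwarded into the sandbox with the prefix stripped (plus `ENABLE_AUTO_LINT` when auto-linting is on). A quick standalone illustration of that mapping, using made-up values:

```python
# Forward SANDBOX_ENV_* variables into the sandbox without the prefix.
import os

os.environ['SANDBOX_ENV_GITHUB_TOKEN'] = 'ghp_example'    # hypothetical value
os.environ['SANDBOX_ENV_OPENAI_API_KEY'] = 'sk-example'   # hypothetical value

forwarded = {
    key.removeprefix('SANDBOX_ENV_'): value
    for key, value in os.environ.items()
    if key.startswith('SANDBOX_ENV_')
}
print(forwarded)  # {'GITHUB_TOKEN': 'ghp_example', 'OPENAI_API_KEY': 'sk-example'}
```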
@@ -52,7 +49,7 @@ class Runtime:
"""
sid: str
- file_store: FileStore
+ config: AppConfig
DEFAULT_ENV_VARS: dict[str, str]
def __init__(
@@ -60,13 +57,17 @@ def __init__(
config: AppConfig,
event_stream: EventStream,
sid: str = 'default',
+ plugins: list[PluginRequirement] | None = None,
):
self.sid = sid
self.event_stream = event_stream
self.event_stream.subscribe(EventStreamSubscriber.RUNTIME, self.on_event)
+ self.plugins = plugins if plugins is not None and len(plugins) > 0 else []
+
self.config = copy.deepcopy(config)
self.DEFAULT_ENV_VARS = _default_env_vars(config.sandbox)
atexit.register(self.close_sync)
+ logger.debug(f'Runtime `{sid}` config:\n{self.config}')
async def ainit(self, env_vars: dict[str, str] | None = None) -> None:
"""
@@ -97,26 +98,20 @@ def close_sync(self) -> None:
else:
loop.run_until_complete(self.close())
- # ====================================================================
- # Methods we plan to deprecate when we move to new EventStreamRuntime
- # ====================================================================
-
- def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
- # TODO: deprecate this method when we move to the new EventStreamRuntime
- raise NotImplementedError('This method is not implemented in the base class.')
-
- def init_runtime_tools(
- self,
- runtime_tools: list[RuntimeTool],
- runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
- is_async: bool = True,
- ) -> None:
- # TODO: deprecate this method when we move to the new EventStreamRuntime
- raise NotImplementedError('This method is not implemented in the base class.')
-
# ====================================================================
async def add_env_vars(self, env_vars: dict[str, str]) -> None:
+ # Add env vars to the IPython shell (if Jupyter is used)
+ if any(isinstance(plugin, JupyterRequirement) for plugin in self.plugins):
+ code = 'import os\n'
+ for key, value in env_vars.items():
+ # Note: json.dumps gives us nice escaping for free
+ code += f'os.environ["{key}"] = {json.dumps(value)}\n'
+ code += '\n'
+ obs = await self.run_ipython(IPythonRunCellAction(code))
+ logger.info(f'Added env vars to IPython: code={code}, obs={obs}')
+
+ # Add env vars to the Bash shell
cmd = ''
for key, value in env_vars.items():
# Note: json.dumps gives us nice escaping for free
@@ -125,7 +120,7 @@ async def add_env_vars(self, env_vars: dict[str, str]) -> None:
return
cmd = cmd.strip()
logger.debug(f'Adding env var: {cmd}')
- obs: Observation = await self.run(CmdRunAction(cmd))
+ obs = await self.run(CmdRunAction(cmd))
if not isinstance(obs, CmdOutputObservation) or obs.exit_code != 0:
raise RuntimeError(
f'Failed to add env vars [{env_vars}] to environment: {obs.content}'
@@ -133,9 +128,14 @@ async def add_env_vars(self, env_vars: dict[str, str]) -> None:
async def on_event(self, event: Event) -> None:
if isinstance(event, Action):
+ # set timeout to default if not set
+ if event.timeout is None:
+ event.timeout = self.config.sandbox.timeout
+ assert event.timeout is not None
observation = await self.run_action(event)
observation._cause = event.id # type: ignore[attr-defined]
- self.event_stream.add_event(observation, event.source) # type: ignore[arg-type]
+ source = event.source if event.source else EventSource.AGENT
+ self.event_stream.add_event(observation, source) # type: ignore[arg-type]
async def run_action(self, action: Action) -> Observation:
"""Run an action and return the resulting observation.
@@ -160,15 +160,14 @@ async def run_action(self, action: Action) -> Observation:
hasattr(action, 'is_confirmed')
and action.is_confirmed == ActionConfirmationStatus.REJECTED
):
- return RejectObservation(
+ return UserRejectObservation(
'Action has been rejected by the user! Waiting for further user input.'
)
observation = await getattr(self, action_type)(action)
- observation._parent = action.id # type: ignore[attr-defined]
return observation
# ====================================================================
- # Implement these methods in the subclass
+ # Action execution
# ====================================================================
@abstractmethod
@@ -194,3 +193,19 @@ async def browse(self, action: BrowseURLAction) -> Observation:
@abstractmethod
async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
pass
+
+ # ====================================================================
+ # File operations
+ # ====================================================================
+
+ @abstractmethod
+ async def copy_to(self, host_src: str, sandbox_dest: str, recursive: bool = False):
+ raise NotImplementedError('This method is not implemented in the base class.')
+
+ @abstractmethod
+ async def list_files(self, path: str | None = None) -> list[str]:
+ """List files in the sandbox.
+
+ If path is None, list files in the sandbox's initial working directory (e.g., /workspace).
+ """
+ raise NotImplementedError('This method is not implemented in the base class.')
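
A note on the env-var plumbing above: `add_env_vars` now mirrors each variable into the IPython kernel (when the Jupyter plugin is present) and into the bash shell, leaning on `json.dumps` for quoting. The sketch below only illustrates that quoting idea; the helper name and the exact export format are illustrative, not the upstream command string.

```python
import json


def build_export_cmd(env_vars: dict[str, str]) -> str:
    """Sketch of the quoting trick used above: json.dumps yields a double-quoted,
    escaped literal that is safe to splice into a bash `export` statement.
    The real Runtime.add_env_vars may format the command differently."""
    return ' && '.join(
        f'export {key}={json.dumps(value)}' for key, value in env_vars.items()
    )


print(build_export_cmd({'GITHUB_TOKEN': 'abc"123', 'ENABLE_AUTO_LINT': 'true'}))
# export GITHUB_TOKEN="abc\"123" && export ENABLE_AUTO_LINT="true"
```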
diff --git a/opendevin/runtime/sandbox.py b/opendevin/runtime/sandbox.py
index 5009308a9276..c8ebd87d3783 100644
--- a/opendevin/runtime/sandbox.py
+++ b/opendevin/runtime/sandbox.py
@@ -3,10 +3,9 @@
from opendevin.core.config import SandboxConfig
from opendevin.core.schema import CancellableStream
-from opendevin.runtime.plugins.mixin import PluginMixin
-class Sandbox(ABC, PluginMixin):
+class Sandbox(ABC):
_env: dict[str, str] = {}
is_initial_session: bool = True
diff --git a/opendevin/runtime/server/runtime.py b/opendevin/runtime/server/runtime.py
deleted file mode 100644
index 04b6ee57fe05..000000000000
--- a/opendevin/runtime/server/runtime.py
+++ /dev/null
@@ -1,224 +0,0 @@
-from typing import Any, Optional
-
-from opendevin.core.config import AppConfig
-from opendevin.core.exceptions import BrowserInitException
-from opendevin.core.logger import opendevin_logger as logger
-from opendevin.events.action import (
- BrowseInteractiveAction,
- BrowseURLAction,
- CmdRunAction,
- FileReadAction,
- FileWriteAction,
- IPythonRunCellAction,
-)
-from opendevin.events.observation import (
- CmdOutputObservation,
- ErrorObservation,
- IPythonRunCellObservation,
- Observation,
-)
-from opendevin.events.stream import EventStream
-from opendevin.runtime import (
- DockerSSHBox,
- E2BBox,
- LocalBox,
- Sandbox,
-)
-from opendevin.runtime.browser.browser_env import BrowserEnv
-from opendevin.runtime.plugins import PluginRequirement
-from opendevin.runtime.runtime import Runtime
-from opendevin.runtime.tools import RuntimeTool
-from opendevin.storage.local import LocalFileStore
-
-from ..browser import browse
-from .files import read_file, write_file
-
-
-class ServerRuntime(Runtime):
- def __init__(
- self,
- config: AppConfig,
- event_stream: EventStream,
- sid: str = 'default',
- sandbox: Sandbox | None = None,
- ):
- super().__init__(config, event_stream, sid)
- self.file_store = LocalFileStore(config.workspace_base)
- if sandbox is None:
- self.sandbox = self.create_sandbox(sid, config.sandbox.box_type)
- self._is_external_sandbox = False
- else:
- self.sandbox = sandbox
- self._is_external_sandbox = True
- self.browser: BrowserEnv | None = None
-
- def create_sandbox(self, sid: str = 'default', box_type: str = 'ssh') -> Sandbox:
- if box_type == 'local':
- return LocalBox(
- config=self.config.sandbox, workspace_base=self.config.workspace_base
- )
- elif box_type == 'ssh':
- return DockerSSHBox(
- config=self.config.sandbox,
- persist_sandbox=self.config.persist_sandbox,
- workspace_mount_path=self.config.workspace_mount_path,
- sandbox_workspace_dir=self.config.workspace_mount_path_in_sandbox,
- cache_dir=self.config.cache_dir,
- run_as_devin=self.config.run_as_devin,
- ssh_hostname=self.config.ssh_hostname,
- ssh_password=self.config.ssh_password,
- ssh_port=self.config.ssh_port,
- sid=sid,
- )
- elif box_type == 'e2b':
- return E2BBox(
- config=self.config.sandbox,
- e2b_api_key=self.config.e2b_api_key,
- )
- else:
- raise ValueError(f'Invalid sandbox type: {box_type}')
-
- async def ainit(self, env_vars: dict[str, str] | None = None):
- # MUST call super().ainit() to initialize both default env vars
- # AND the ones in env vars!
- await super().ainit(env_vars)
-
- async def close(self):
- if hasattr(self, '_is_external_sandbox') and not self._is_external_sandbox:
- self.sandbox.close()
- if hasattr(self, 'browser') and self.browser is not None:
- self.browser.close()
-
- def init_sandbox_plugins(self, plugins: list[PluginRequirement]) -> None:
- self.sandbox.init_plugins(plugins)
-
- def init_runtime_tools(
- self,
- runtime_tools: list[RuntimeTool],
- runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
- is_async: bool = True,
- ) -> None:
- # if browser in runtime_tools, init it
- if RuntimeTool.BROWSER in runtime_tools:
- if runtime_tools_config is None:
- runtime_tools_config = {}
- browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
- try:
- self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
- except BrowserInitException:
- logger.warn(
- 'Failed to start browser environment, web browsing functionality will not work'
- )
-
- async def run(self, action: CmdRunAction) -> Observation:
- return self._run_command(action.command)
-
- async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
- self._run_command(
- ("cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n" f'{action.code}\n' 'EOL'),
- )
-
- # run the code
- obs = self._run_command('cat /tmp/opendevin_jupyter_temp.py | execute_cli')
- output = obs.content
- if 'pip install' in action.code:
- print(output)
- package_names = action.code.split(' ', 2)[-1]
- is_single_package = ' ' not in package_names
-
- if 'Successfully installed' in output:
- restart_kernel = 'import IPython\nIPython.Application.instance().kernel.do_shutdown(True)'
- if (
- 'Note: you may need to restart the kernel to use updated packages.'
- in output
- ):
- self._run_command(
- (
- "cat > /tmp/opendevin_jupyter_temp.py <<'EOL'\n"
- f'{restart_kernel}\n'
- 'EOL'
- )
- )
- obs = self._run_command(
- 'cat /tmp/opendevin_jupyter_temp.py | execute_cli'
- )
- output = '[Package installed successfully]'
- if "{'status': 'ok', 'restart': True}" != obs.content.strip():
- print(obs.content)
- output += (
- '\n[But failed to restart the kernel to load the package]'
- )
- else:
- output += (
- '\n[Kernel restarted successfully to load the package]'
- )
-
- # re-init the kernel after restart
- if action.kernel_init_code:
- self._run_command(
- (
- f"cat > /tmp/opendevin_jupyter_init.py <<'EOL'\n"
- f'{action.kernel_init_code}\n'
- 'EOL'
- ),
- )
- obs = self._run_command(
- 'cat /tmp/opendevin_jupyter_init.py | execute_cli',
- )
- elif (
- is_single_package
- and f'Requirement already satisfied: {package_names}' in output
- ):
- output = '[Package already installed]'
- return IPythonRunCellObservation(content=output, code=action.code)
-
- async def read(self, action: FileReadAction) -> Observation:
- # TODO: use self.file_store
- working_dir = self.sandbox.get_working_directory()
- return await read_file(
- action.path,
- working_dir,
- self.config.workspace_base,
- self.config.workspace_mount_path_in_sandbox,
- action.start,
- action.end,
- )
-
- async def write(self, action: FileWriteAction) -> Observation:
- # TODO: use self.file_store
- working_dir = self.sandbox.get_working_directory()
- return await write_file(
- action.path,
- working_dir,
- self.config.workspace_base,
- self.config.workspace_mount_path_in_sandbox,
- action.content,
- action.start,
- action.end,
- )
-
- async def browse(self, action: BrowseURLAction) -> Observation:
- return await browse(action, self.browser)
-
- async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
- return await browse(action, self.browser)
-
- def _run_command(self, command: str) -> Observation:
- try:
- exit_code, output = self.sandbox.execute(command)
- if 'pip install' in command:
- package_names = command.split(' ', 2)[-1]
- is_single_package = ' ' not in package_names
- print(output)
- if 'Successfully installed' in output:
- output = '[Package installed successfully]'
- elif (
- is_single_package
- and f'Requirement already satisfied: {package_names}' in output
- ):
- output = '[Package already installed]'
- return CmdOutputObservation(
- command_id=-1, content=str(output), command=command, exit_code=exit_code
- )
- except UnicodeDecodeError:
- return ErrorObservation('Command output could not be decoded as utf-8')
diff --git a/opendevin/runtime/utils/bash.py b/opendevin/runtime/utils/bash.py
index 9dd7a60422c5..6de80c38840a 100644
--- a/opendevin/runtime/utils/bash.py
+++ b/opendevin/runtime/utils/bash.py
@@ -1,87 +1,52 @@
-def split_bash_commands(commands):
- # States
- NORMAL = 0
- IN_SINGLE_QUOTE = 1
- IN_DOUBLE_QUOTE = 2
- IN_HEREDOC = 3
-
- state = NORMAL
- heredoc_trigger = None
- result = []
- current_command: list[str] = []
-
- i = 0
- while i < len(commands):
- char = commands[i]
-
- if state == NORMAL:
- if char == "'":
- state = IN_SINGLE_QUOTE
- elif char == '"':
- state = IN_DOUBLE_QUOTE
- elif char == '\\':
- # Check if this is escaping a newline
- if i + 1 < len(commands) and commands[i + 1] == '\n':
- i += 1 # Skip the newline
- # Continue with the next line as part of the same command
- i += 1 # Move to the first character of the next line
- continue
- elif char == '\n':
- if not heredoc_trigger and current_command:
- result.append(''.join(current_command).strip())
- current_command = []
- elif char == '<' and commands[i : i + 2] == '<<':
- # Detect heredoc
- state = IN_HEREDOC
- i += 2 # Skip '<<'
- while commands[i] == ' ':
- i += 1
- start = i
- while commands[i] not in [' ', '\n']:
- i += 1
- heredoc_trigger = commands[start:i]
- current_command.append(commands[start - 2 : i]) # Include '<<'
- continue # Skip incrementing i at the end of the loop
- current_command.append(char)
-
- elif state == IN_SINGLE_QUOTE:
- current_command.append(char)
- if char == "'" and commands[i - 1] != '\\':
- state = NORMAL
+import bashlex
- elif state == IN_DOUBLE_QUOTE:
- current_command.append(char)
- if char == '"' and commands[i - 1] != '\\':
- state = NORMAL
+from opendevin.core.logger import opendevin_logger as logger
- elif state == IN_HEREDOC:
- current_command.append(char)
- if (
- char == '\n'
- and heredoc_trigger
- and commands[i + 1 : i + 1 + len(heredoc_trigger) + 1]
- == heredoc_trigger + '\n'
- ):
- # Check if the next line starts with the heredoc trigger followed by a newline
- i += (
- len(heredoc_trigger) + 1
- ) # Move past the heredoc trigger and newline
- current_command.append(
- heredoc_trigger + '\n'
- ) # Include the heredoc trigger and newline
- result.append(''.join(current_command).strip())
- current_command = []
- heredoc_trigger = None
- state = NORMAL
- continue
-
- i += 1
-
- # Add the last command if any
- if current_command:
- result.append(''.join(current_command).strip())
-
- # Remove any empty strings from the result
- result = [cmd for cmd in result if cmd]
+def split_bash_commands(commands):
+ try:
+ parsed = bashlex.parse(commands)
+ except bashlex.errors.ParsingError as e:
+ logger.debug(
+ f'Failed to parse bash commands\n'
+ f'[input]: {commands}\n'
+ f'[warning]: {e}\n'
+ f'The original command will be returned as is.'
+ )
+ # If parsing fails, return the original commands
+ return [commands]
+
+ result: list[str] = []
+ last_end = 0
+
+ for node in parsed:
+ start, end = node.pos
+
+ # Include any text between the last command and this one
+ if start > last_end:
+ between = commands[last_end:start]
+ logger.debug(f'BASH PARSING between: {between}')
+ if result:
+ result[-1] += between.rstrip()
+ elif between.strip():
+ # THIS SHOULD NOT HAPPEN
+ result.append(between.rstrip())
+
+ # Extract the command, preserving original formatting
+ command = commands[start:end].rstrip()
+ logger.debug(f'BASH PARSING command: {command}')
+ result.append(command)
+
+ last_end = end
+
+ # Add any remaining text after the last command to the last command
+ remaining = commands[last_end:].rstrip()
+ logger.debug(f'BASH PARSING remaining: {remaining}')
+ if last_end < len(commands) and result:
+ result[-1] += remaining
+ logger.debug(f'BASH PARSING result[-1] += remaining: {result[-1]}')
+ elif last_end < len(commands):
+ if remaining:
+ result.append(remaining)
+ logger.debug(f'BASH PARSING result.append(remaining): {result[-1]}')
return result
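
With the hand-rolled state machine replaced by `bashlex`, `split_bash_commands` now splits only at top-level statement boundaries and falls back to returning the input unchanged on a parse error. A small usage sketch (the expected output is shown as a comment and assumes `bashlex` parses the script into two top-level nodes):

```python
from opendevin.runtime.utils.bash import split_bash_commands

# Two top-level statements separated by a newline should come back as two
# entries, while the `&&` compound stays intact; on a bashlex parse error the
# whole input is returned as a single-element list.
commands = 'echo hello && ls -la\npwd'
print(split_bash_commands(commands))
# expected: ['echo hello && ls -la', 'pwd']
```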
diff --git a/opendevin/runtime/server/files.py b/opendevin/runtime/utils/files.py
similarity index 100%
rename from opendevin/runtime/server/files.py
rename to opendevin/runtime/utils/files.py
diff --git a/opendevin/runtime/utils/image_agnostic.py b/opendevin/runtime/utils/image_agnostic.py
deleted file mode 100644
index d44c9c4ebb39..000000000000
--- a/opendevin/runtime/utils/image_agnostic.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""This module contains functions for building and managing the agnostic sandbox image.
-
-This WILL BE DEPRECATED when EventStreamRuntime is fully implemented and adopted.
-"""
-
-import tempfile
-
-import docker
-
-from opendevin.core.logger import opendevin_logger as logger
-
-
-def generate_dockerfile(base_image: str) -> str:
- """Generate the Dockerfile content for the agnostic sandbox image based on user-provided base image.
-
- NOTE: This is only tested on debian yet.
- """
- # FIXME: Remove the requirement of ssh in future version
- dockerfile_content = (
- f'FROM {base_image}\n'
- 'RUN apt update && apt install -y openssh-server wget sudo\n'
- 'RUN mkdir -p -m0755 /var/run/sshd\n'
- 'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs\n'
- 'RUN echo "" > /opendevin/bash.bashrc\n'
- 'RUN if [ ! -d /opendevin/miniforge3 ]; then \\\n'
- ' wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \\\n'
- ' bash Miniforge3.sh -b -p /opendevin/miniforge3 && \\\n'
- ' rm Miniforge3.sh && \\\n'
- ' chmod -R g+w /opendevin/miniforge3 && \\\n'
- ' bash -c ". /opendevin/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"; \\\n'
- ' fi\n'
- 'RUN /opendevin/miniforge3/bin/pip install --upgrade pip\n'
- 'RUN /opendevin/miniforge3/bin/pip install jupyterlab notebook jupyter_kernel_gateway flake8\n'
- 'RUN /opendevin/miniforge3/bin/pip install python-docx PyPDF2 python-pptx pylatexenc openai\n'
- ).strip()
- return dockerfile_content
-
-
-def _build_sandbox_image(
- base_image: str, target_image_name: str, docker_client: docker.DockerClient
-):
- try:
- with tempfile.TemporaryDirectory() as temp_dir:
- dockerfile_content = generate_dockerfile(base_image)
-
- logger.info(f'Building agnostic sandbox image: {target_image_name}')
- logger.info(
- (
- f'===== Dockerfile content =====\n'
- f'{dockerfile_content}\n'
- f'==============================='
- )
- )
- with open(f'{temp_dir}/Dockerfile', 'w') as file:
- file.write(dockerfile_content)
-
- api_client = docker_client.api
- build_logs = api_client.build(
- path=temp_dir, tag=target_image_name, rm=True, decode=True
- )
-
- for log in build_logs:
- if 'stream' in log:
- print(log['stream'].strip())
- elif 'error' in log:
- logger.error(log['error'].strip())
- else:
- logger.info(str(log))
-
- logger.info(f'Image {target_image_name} built successfully')
- except docker.errors.BuildError as e:
- logger.error(f'Sandbox image build failed: {e}')
- raise e
- except Exception as e:
- logger.error(f'An error occurred during sandbox image build: {e}')
- raise e
-
-
-def _get_new_image_name(base_image: str) -> str:
- prefix = 'od_sandbox'
- if ':' not in base_image:
- base_image = base_image + ':latest'
-
- [repo, tag] = base_image.split(':')
- repo = repo.replace('/', '___')
- return f'{prefix}:{repo}__{tag}'
-
-
-def get_od_sandbox_image(base_image: str, docker_client: docker.DockerClient) -> str:
- """Return the sandbox image name based on user-provided base image.
-
- The returned sandbox image is assumed to contains all the required dependencies for OpenDevin.
- If the sandbox image is not found, it will be built.
- """
- # OpenDevin's offcial sandbox already contains the required dependencies for OpenDevin.
- if 'ghcr.io/opendevin/sandbox' in base_image:
- return base_image
-
- new_image_name = _get_new_image_name(base_image)
-
- # Detect if the sandbox image is built
- images = docker_client.images.list()
- for image in images:
- if new_image_name in image.tags:
- logger.info('Found existing od_sandbox image, reuse:' + new_image_name)
- return new_image_name
-
- # If the sandbox image is not found, build it
- logger.info(
- f'od_sandbox image is not found for {base_image}, will build: {new_image_name}'
- )
- _build_sandbox_image(base_image, new_image_name, docker_client)
- return new_image_name
diff --git a/opendevin/runtime/utils/runtime_build.py b/opendevin/runtime/utils/runtime_build.py
index acb4ac6d02fa..9d3596eb669e 100644
--- a/opendevin/runtime/utils/runtime_build.py
+++ b/opendevin/runtime/utils/runtime_build.py
@@ -6,11 +6,16 @@
import docker
import toml
+from dirhash import dirhash
from jinja2 import Environment, FileSystemLoader
import opendevin
from opendevin.core.logger import opendevin_logger as logger
+RUNTIME_IMAGE_REPO = os.getenv(
+ 'OD_RUNTIME_RUNTIME_IMAGE_REPO', 'ghcr.io/opendevin/od_runtime'
+)
+
def _get_package_version():
"""Read the version from pyproject.toml as the other one may be outdated."""
@@ -29,6 +34,8 @@ def _create_project_source_dist():
logger.info(f'Using project root: {project_root}')
# run "python -m build -s" on project_root
+ # ensure the `build` package is available

+ subprocess.run(['python', '-m', 'pip', 'install', 'build'])
result = subprocess.run(['python', '-m', 'build', '-s', project_root])
if result.returncode != 0:
logger.error(f'Build failed: {result}')
@@ -47,7 +54,8 @@ def _create_project_source_dist():
return tarball_path
-def _put_source_code_to_dir(temp_dir: str) -> str:
+def _put_source_code_to_dir(temp_dir: str):
+ """Put the source code of OpenDevin to the temp_dir/code."""
tarball_path = _create_project_source_dist()
filename = os.path.basename(tarball_path)
filename = filename.removesuffix('.tar.gz')
@@ -59,11 +67,20 @@ def _put_source_code_to_dir(temp_dir: str) -> str:
logger.info(
f'Source distribution moved to {os.path.join(temp_dir, "project.tar.gz")}'
)
- return filename
+
+ # unzip the tarball
+ shutil.unpack_archive(os.path.join(temp_dir, 'project.tar.gz'), temp_dir)
+ # remove the tarball
+ os.remove(os.path.join(temp_dir, 'project.tar.gz'))
+ # rename the directory to 'code'
+ os.rename(os.path.join(temp_dir, filename), os.path.join(temp_dir, 'code'))
+ logger.info(f'Unpacked source code directory: {os.path.join(temp_dir, "code")}')
def _generate_dockerfile(
- base_image: str, source_code_dirname: str, skip_init: bool = False
+ base_image: str,
+ skip_init: bool = False,
+ extra_deps: str | None = None,
) -> str:
"""Generate the Dockerfile content for the eventstream runtime image based on user-provided base image."""
env = Environment(
@@ -74,8 +91,8 @@ def _generate_dockerfile(
template = env.get_template('Dockerfile.j2')
dockerfile_content = template.render(
base_image=base_image,
- source_code_dirname=source_code_dirname,
skip_init=skip_init,
+ extra_deps=extra_deps if extra_deps is not None else '',
)
return dockerfile_content
@@ -84,11 +101,17 @@ def prep_docker_build_folder(
dir_path: str,
base_image: str,
skip_init: bool = False,
-):
- """Prepares the docker build folder by copying the source code and generating the Dockerfile."""
- source_code_dirname = _put_source_code_to_dir(dir_path)
+ extra_deps: str | None = None,
+) -> str:
+ """Prepares the docker build folder by copying the source code and generating the Dockerfile.
+
+ Return the MD5 hash of the directory.
+ """
+ _put_source_code_to_dir(dir_path)
dockerfile_content = _generate_dockerfile(
- base_image, source_code_dirname, skip_init=skip_init
+ base_image,
+ skip_init=skip_init,
+ extra_deps=extra_deps,
)
logger.info(
(
@@ -100,78 +123,111 @@ def prep_docker_build_folder(
with open(os.path.join(dir_path, 'Dockerfile'), 'w') as file:
file.write(dockerfile_content)
+ hash = dirhash(dir_path, 'md5')
+ logger.info(
+ f'Input base image: {base_image}\n'
+ f'Skip init: {skip_init}\n'
+ f'Extra deps: {extra_deps}\n'
+ f'Hash for docker build directory [{dir_path}] (contents: {os.listdir(dir_path)}): {hash}\n'
+ )
+ return hash
+
def _build_sandbox_image(
- base_image: str,
- target_image_name: str,
+ docker_folder: str,
docker_client: docker.DockerClient,
- skip_init: bool = False,
-):
+ target_image_repo: str,
+ target_image_hash_tag: str,
+ target_image_tag: str,
+) -> str:
+ """Build the sandbox image.
+
+ The image will be tagged as both:
+ - target_image_repo:target_image_hash_tag
+ - target_image_repo:target_image_tag
+
+ Args:
+ docker_folder: str: the path to the docker build folder
+ docker_client: docker.DockerClient: the docker client
+ target_image_repo: str: the repository name for the target image
+ target_image_hash_tag: str: the *hash* tag for the target image that is calculated based
+ on the contents of the docker build folder (source code and Dockerfile)
+ e.g., ubuntu:latest -> od_runtime:1234567890abcdef
+ target_image_tag: str: the tag for the target image that's generic and based on the base image name
+ e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest
+ """
+ # 1. Always directly build and tag using the dir_hash
+ target_image_hash_name = f'{target_image_repo}:{target_image_hash_tag}'
try:
- with tempfile.TemporaryDirectory() as temp_dir:
- if skip_init:
- logger.info(
- f'Reusing existing od_sandbox image [{target_image_name}] but will update the source code in it.'
- )
- else:
- logger.info(f'Building agnostic sandbox image: {target_image_name}')
- prep_docker_build_folder(temp_dir, base_image, skip_init=skip_init)
- api_client = docker_client.api
- build_logs = api_client.build(
- path=temp_dir,
- tag=target_image_name,
- rm=True,
- decode=True,
- # do not use cache when skip_init is True (i.e., when we want to update the source code in the existing image)
- nocache=skip_init,
- )
-
- if skip_init:
- logger.info(
- f'Rebuilding existing od_sandbox image [{target_image_name}] to update the source code.'
- )
- for log in build_logs:
- if 'stream' in log:
- print(log['stream'].strip())
- elif 'error' in log:
- logger.error(log['error'].strip())
- else:
- logger.info(str(log))
-
- # check if the image is built successfully
- image = docker_client.images.get(target_image_name)
- if image is None:
- raise RuntimeError(f'Build failed: Image {target_image_name} not found')
- logger.info(f'Image {target_image_name} built successfully')
+ build_logs = docker_client.api.build(
+ path=docker_folder,
+ tag=target_image_hash_name,
+ rm=True,
+ decode=True,
+ )
except docker.errors.BuildError as e:
logger.error(f'Sandbox image build failed: {e}')
raise e
+ for log in build_logs:
+ if 'stream' in log:
+ print(log['stream'].strip())
+ elif 'error' in log:
+ logger.error(log['error'].strip())
+ else:
+ logger.info(str(log))
+
+ # 2. Re-tag the image with a more generic tag (roughly a "latest" tag)
+ logger.info(f'Image [{target_image_hash_name}] build finished.')
+ image = docker_client.images.get(target_image_hash_name)
+ image.tag(target_image_repo, target_image_tag)
+ logger.info(
+ f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
+ )
-def get_new_image_name(base_image: str, dev_mode: bool = False) -> str:
- if dev_mode:
- if 'od_runtime' not in base_image:
- raise ValueError(
- f'Base image {base_image} must be a valid od_runtime image to be used for dev mode.'
- )
- # remove the 'od_runtime' prefix from the base_image
- return base_image.replace('od_runtime', 'od_runtime_dev')
- elif 'od_runtime' in base_image:
- # if the base image is a valid od_runtime image, we will use it as is
- logger.info(f'Using existing od_runtime image [{base_image}]')
- return base_image
+ # check if the image is built successfully
+ image = docker_client.images.get(target_image_hash_name)
+ if image is None:
+ raise RuntimeError(
+ f'Build failed: Image [{target_image_repo}:{target_image_hash_tag}] not found'
+ )
+ logger.info(
+ f'Image [{target_image_repo}:{target_image_hash_tag}] (also tagged [{target_image_tag}]) built successfully'
+ )
+ return target_image_hash_name
+
+
+def get_runtime_image_repo_and_tag(base_image: str) -> tuple[str, str]:
+ if RUNTIME_IMAGE_REPO in base_image:
+ logger.info(
+ f'The provided image [{base_image}] is already a valid od_runtime image.\n'
+ f'Will try to reuse it as is.'
+ )
+ if ':' not in base_image:
+ base_image = base_image + ':latest'
+ repo, tag = base_image.split(':')
+ return repo, tag
else:
- prefix = 'od_runtime'
if ':' not in base_image:
base_image = base_image + ':latest'
[repo, tag] = base_image.split(':')
repo = repo.replace('/', '___')
-
od_version = _get_package_version()
- return f'{prefix}:od_v{od_version}_image_{repo}_tag_{tag}'
+ return RUNTIME_IMAGE_REPO, f'od_v{od_version}_image_{repo}_tag_{tag}'
def _check_image_exists(image_name: str, docker_client: docker.DockerClient) -> bool:
+ """Check if the image exists in the registry (try to pull it first) AND in the local store.
+
+ image_name is f'{repo}:{tag}'
+ """
+ # Try to pull the new image from the registry
+ try:
+ logger.info(f'Pulling image {image_name} directly...')
+ docker_client.images.pull(image_name)
+ except Exception:
+ logger.info(f'Cannot pull image {image_name} directly')
+
images = docker_client.images.list()
if images:
for image in images:
@@ -183,92 +239,135 @@ def _check_image_exists(image_name: str, docker_client: docker.DockerClient) ->
def build_runtime_image(
base_image: str,
docker_client: docker.DockerClient,
- update_source_code: bool = False,
- save_to_local_store: bool = False, # New parameter to control saving to local store
+ extra_deps: str | None = None,
+ docker_build_folder: str | None = None,
+ dry_run: bool = False,
+ force_rebuild: bool = False,
) -> str:
"""Build the runtime image for the OpenDevin runtime.
- This is only used for **eventstream runtime**.
+ See https://docs.all-hands.dev/modules/usage/runtime for more details.
"""
- new_image_name = get_new_image_name(base_image)
- if base_image == new_image_name:
- logger.info(
- f'Using existing od_runtime image [{base_image}]. Will NOT build a new image.'
+ runtime_image_repo, runtime_image_tag = get_runtime_image_repo_and_tag(base_image)
+
+ # Calculate the hash for the docker build folder (source code and Dockerfile)
+ with tempfile.TemporaryDirectory() as temp_dir:
+ from_scratch_hash = prep_docker_build_folder(
+ temp_dir,
+ base_image=base_image,
+ skip_init=False,
+ extra_deps=extra_deps,
)
- else:
- logger.info(f'New image name: {new_image_name}')
- # Ensure new_image_name contains a colon
- if ':' not in new_image_name:
- raise ValueError(
- f'Invalid image name: {new_image_name}. Expected format "repository:tag".'
- )
+ # hash-based image name: if the hash matches, the image was already built
+ # from scratch with the *exact same* source code and Dockerfile
+ hash_runtime_image_name = f'{runtime_image_repo}:{from_scratch_hash}'
- # Try to pull the new image from the registry
- try:
- docker_client.images.pull(new_image_name)
- except Exception:
- logger.info(f'Cannot pull image {new_image_name} directly')
-
- # Detect if the sandbox image is built
- image_exists = _check_image_exists(new_image_name, docker_client)
- if image_exists:
- logger.info(f'Image {new_image_name} exists')
- else:
- logger.info(f'Image {new_image_name} does not exist')
-
- skip_init = False
- if image_exists and not update_source_code:
- # If (1) Image exists & we are not updating the source code, we can reuse the existing production image
- logger.info('No image build done (not updating source code)')
- return new_image_name
- elif image_exists and update_source_code:
- # If (2) Image exists & we plan to update the source code (in dev mode), we need to rebuild the image
- # and give it a special name
- # e.g., od_runtime:ubuntu_tag_latest -> od_runtime_dev:ubuntu_tag_latest
- logger.info('Image exists, but updating source code requested')
- base_image = new_image_name
- new_image_name = get_new_image_name(base_image, dev_mode=True)
-
- skip_init = True # since we only need to update the source code
- else:
- # If (3) Image does not exist, we need to build it from scratch
- # e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest
- # This snippet would allow to load from archive:
- # tar_path = f'{new_image_name.replace(":", "_")}.tar'
- # if os.path.exists(tar_path):
- # logger.info(f'Loading image from {tar_path}')
- # load_command = ['docker', 'load', '-i', tar_path]
- # subprocess.run(load_command, check=True)
- # logger.info(f'Image {new_image_name} loaded from {tar_path}')
- # return new_image_name
- skip_init = False
+ # non-hash generic image name; it may contain *similar* dependencies
+ # but *might* not exactly match the state of the source code.
+ # It resembles the "latest" tag in the Docker image naming convention for
+ # a particular {repo}:{tag} pair (e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest).
+ # We will build from IT to save time if the `from_scratch_hash` is not found
+ generic_runtime_image_name = f'{runtime_image_repo}:{runtime_image_tag}'
- if not skip_init:
- logger.info(f'Building image [{new_image_name}] from scratch')
+ # 1. If the image exists with the same hash, we will reuse it as is
+ if _check_image_exists(hash_runtime_image_name, docker_client):
+ logger.info(
+ f'Image [{hash_runtime_image_name}] exists with matched hash for Docker build folder.\n'
+ 'Will reuse it as is.'
+ )
+ return hash_runtime_image_name
+
+ # 2. If the exact hash is not found, we will FIRST try to re-build it
+ # from the non-hash `generic_runtime_image_name` to avoid re-installing
+ # the dependencies (e.g., poetry install, apt install)
+ elif (
+ _check_image_exists(generic_runtime_image_name, docker_client)
+ and not force_rebuild
+ ):
+ logger.info(
+ f'Cannot find matched hash for image [{hash_runtime_image_name}]\n'
+ f'Will try to re-build it from the latest [{generic_runtime_image_name}] image to potentially save '
+ f'time on dependency installation.\n'
+ )
- _build_sandbox_image(base_image, new_image_name, docker_client, skip_init=skip_init)
+ cur_docker_build_folder = docker_build_folder or tempfile.mkdtemp()
+ _skip_init_hash = prep_docker_build_folder(
+ cur_docker_build_folder,
+ # we want to use the existing generic image as base
+ # so that we can leverage existing dependencies already installed in the image
+ base_image=generic_runtime_image_name,
+ skip_init=True, # skip init since we are re-using the existing image
+ extra_deps=extra_deps,
+ )
+ assert (
+ _skip_init_hash != from_scratch_hash
+ ), f'The skip_init hash [{_skip_init_hash}] should not match the existing hash [{from_scratch_hash}]'
+
+ if not dry_run:
+ _build_sandbox_image(
+ docker_folder=cur_docker_build_folder,
+ docker_client=docker_client,
+ target_image_repo=runtime_image_repo,
+ # NOTE: WE ALWAYS use the "from_scratch_hash" tag for the target image
+ # otherwise, even if the source code is exactly the same, the image *might* be re-built
+ # because the same source code will generate a different hash when skip_init=True/False
+ # since the Dockerfile is slightly different
+ target_image_hash_tag=from_scratch_hash,
+ target_image_tag=runtime_image_tag,
+ )
+ else:
+ logger.info(
+ f'Dry run: Skipping image build for [{generic_runtime_image_name}]'
+ )
+ if docker_build_folder is None:
+ shutil.rmtree(cur_docker_build_folder)
- # Only for development: allow to save image as archive:
- if not image_exists and save_to_local_store:
- tar_path = f'{new_image_name.replace(":", "_")}.tar'
- save_command = ['docker', 'save', '-o', tar_path, new_image_name]
- subprocess.run(save_command, check=True)
- logger.info(f'Image saved to {tar_path}')
+ # 3. If the image is not found AND we cannot re-use the latest relevant non-hash image,
+ # we will build it completely from scratch
+ else:
+ if force_rebuild:
+ logger.info(
+ f'Force re-build: Will try to re-build image [{generic_runtime_image_name}] from scratch.\n'
+ )
+ cur_docker_build_folder = docker_build_folder or tempfile.mkdtemp()
+ _new_from_scratch_hash = prep_docker_build_folder(
+ cur_docker_build_folder,
+ base_image,
+ skip_init=False,
+ extra_deps=extra_deps,
+ )
+ assert (
+ _new_from_scratch_hash == from_scratch_hash
+ ), f'The new from scratch hash [{_new_from_scratch_hash}] does not match the existing hash [{from_scratch_hash}]'
+
+ if not dry_run:
+ _build_sandbox_image(
+ docker_folder=cur_docker_build_folder,
+ docker_client=docker_client,
+ target_image_repo=runtime_image_repo,
+ # NOTE: WE ALWAYS use the "from_scratch_hash" tag for the target image
+ target_image_hash_tag=from_scratch_hash,
+ target_image_tag=runtime_image_tag,
+ )
+ else:
+ logger.info(
+ f'Dry run: Skipping image build for [{generic_runtime_image_name}]'
+ )
- load_command = ['docker', 'load', '-i', tar_path]
- subprocess.run(load_command, check=True)
- logger.info(f'Image {new_image_name} loaded back into Docker from {tar_path}')
+ if docker_build_folder is None:
+ shutil.rmtree(cur_docker_build_folder)
- return new_image_name
+ return f'{runtime_image_repo}:{from_scratch_hash}'
if __name__ == '__main__':
parser = argparse.ArgumentParser()
- parser.add_argument('--base_image', type=str, default='ubuntu:22.04')
- parser.add_argument('--update_source_code', action='store_true')
- parser.add_argument('--save_to_local_store', action='store_true')
+ parser.add_argument(
+ '--base_image', type=str, default='nikolaik/python-nodejs:python3.11-nodejs22'
+ )
parser.add_argument('--build_folder', type=str, default=None)
+ parser.add_argument('--force_rebuild', action='store_true', default=False)
args = parser.parse_args()
if args.build_folder is not None:
@@ -279,29 +378,41 @@ def build_runtime_image(
logger.info(
f'Will prepare a build folder by copying the source code and generating the Dockerfile: {build_folder}'
)
- new_image_path = get_new_image_name(args.base_image)
- prep_docker_build_folder(
- build_folder, args.base_image, skip_init=args.update_source_code
+ runtime_image_repo, runtime_image_tag = get_runtime_image_repo_and_tag(
+ args.base_image
+ )
+ with tempfile.TemporaryDirectory() as temp_dir:
+ runtime_image_hash_name = build_runtime_image(
+ args.base_image,
+ docker_client=docker.from_env(),
+ docker_build_folder=temp_dir,
+ dry_run=True,
+ force_rebuild=args.force_rebuild,
+ )
+ _runtime_image_repo, runtime_image_hash_tag = runtime_image_hash_name.split(
+ ':'
+ )
+ # Move contents of temp_dir to build_folder
+ shutil.copytree(temp_dir, build_folder, dirs_exist_ok=True)
+ logger.info(
+ f'Build folder [{build_folder}] is ready: {os.listdir(build_folder)}'
)
- new_image_name, new_image_tag = new_image_path.split(':')
+
with open(os.path.join(build_folder, 'config.sh'), 'a') as file:
file.write(
(
- f'DOCKER_IMAGE={new_image_name}\n'
- f'DOCKER_IMAGE_TAG={new_image_tag}\n'
+ f'\n'
+ f'DOCKER_IMAGE={runtime_image_repo}\n'
+ f'DOCKER_IMAGE_TAG={runtime_image_tag}\n'
+ f'DOCKER_IMAGE_HASH_TAG={runtime_image_hash_tag}\n'
)
)
logger.info(
- f'`config.sh` is updated with the new image name [{new_image_name}] and tag [{new_image_tag}]'
+ f'`config.sh` is updated with the new image name [{runtime_image_repo}] and tags [{runtime_image_tag}, {runtime_image_hash_tag}]'
)
logger.info(f'Dockerfile and source distribution are ready in {build_folder}')
else:
logger.info('Building image in a temporary folder')
client = docker.from_env()
- image_name = build_runtime_image(
- args.base_image,
- client,
- update_source_code=args.update_source_code,
- save_to_local_store=args.save_to_local_store,
- )
+ image_name = build_runtime_image(args.base_image, client)
print(f'\nBUILT Image: {image_name}\n')
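
The rebuild logic above keys images on a `dirhash` of the prepared build folder: an exact hash match is reused as is, an existing generic tag is used as a faster base for a `skip_init` rebuild, and only otherwise is the image built from scratch. For reference, `get_runtime_image_repo_and_tag('ubuntu:22.04')` would return the repo from `OD_RUNTIME_RUNTIME_IMAGE_REPO` (default `ghcr.io/opendevin/od_runtime`) and a tag of the form `od_v<version>_image_ubuntu_tag_22.04`. Below is a minimal dry-run sketch, assuming Docker is available and you are inside an OpenDevin checkout (the source tree is packaged to compute the content hash even on a dry run):

```python
import docker

from opendevin.runtime.utils.runtime_build import build_runtime_image

# Dry run: prepares the build folder, computes its content hash, and reports
# the image name that a real build would produce, without building anything.
client = docker.from_env()
image_name = build_runtime_image(
    'nikolaik/python-nodejs:python3.11-nodejs22',  # default base image of the CLI above
    client,
    dry_run=True,
)
print(image_name)  # e.g. 'ghcr.io/opendevin/od_runtime:<md5-of-build-folder>'
```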
diff --git a/opendevin/runtime/utils/runtime_templates/Dockerfile.j2 b/opendevin/runtime/utils/runtime_templates/Dockerfile.j2
index 2502518b88e8..b7d04d246529 100644
--- a/opendevin/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/opendevin/runtime/utils/runtime_templates/Dockerfile.j2
@@ -5,6 +5,7 @@ FROM {{ base_image }}
# START: Build Runtime Image from Scratch
# ================================================================
FROM {{ base_image }}
+
{% if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) %}
{% set LIBGL_MESA = 'libgl1' %}
{% else %}
@@ -13,15 +14,16 @@ FROM {{ base_image }}
# Install necessary packages and clean up in one layer
RUN apt-get update && \
- apt-get install -y wget sudo apt-utils {{ LIBGL_MESA }} libasound2-plugins && \
- apt-get clean \
- && rm -rf /var/lib/apt/lists/*
+ apt-get install -y wget sudo apt-utils {{ LIBGL_MESA }} libasound2-plugins git && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
# Create necessary directories
RUN mkdir -p /opendevin && \
mkdir -p /opendevin/logs && \
- chmod 777 /opendevin/logs && \
- echo "" > /opendevin/bash.bashrc
+ mkdir -p /opendevin/poetry
+
+ENV POETRY_VIRTUALENVS_PATH=/opendevin/poetry
RUN if [ ! -d /opendevin/miniforge3 ]; then \
wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \
@@ -32,8 +34,7 @@ RUN if [ ! -d /opendevin/miniforge3 ]; then \
fi
# Install Python and Poetry
-RUN /opendevin/miniforge3/bin/mamba install python=3.11 -y
-RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry -y
+RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry python=3.11 -y
# ================================================================
# END: Build Runtime Image from Scratch
# ================================================================
@@ -42,10 +43,8 @@ RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry -y
# ================================================================
# START: Copy Project and Install/Update Dependencies
# ================================================================
-COPY project.tar.gz /opendevin
RUN if [ -d /opendevin/code ]; then rm -rf /opendevin/code; fi
-RUN cd /opendevin && tar -xzvf project.tar.gz && rm project.tar.gz
-RUN mv /opendevin/{{ source_code_dirname }} /opendevin/code
+COPY ./code /opendevin/code
# Install/Update Dependencies
# 1. Install pyproject.toml via poetry
@@ -53,11 +52,14 @@ RUN mv /opendevin/{{ source_code_dirname }} /opendevin/code
# 3. Clear poetry, apt, mamba caches
RUN cd /opendevin/code && \
/opendevin/miniforge3/bin/mamba run -n base poetry env use python3.11 && \
- /opendevin/miniforge3/bin/mamba run -n base poetry install --no-interaction --no-root && \
+ /opendevin/miniforge3/bin/mamba run -n base poetry install --only main,runtime --no-interaction --no-root && \
apt-get update && \
/opendevin/miniforge3/bin/mamba run -n base poetry run pip install playwright && \
/opendevin/miniforge3/bin/mamba run -n base poetry run playwright install --with-deps chromium && \
+ export OD_INTERPRETER_PATH=$(/opendevin/miniforge3/bin/mamba run -n base poetry run python -c "import sys; print(sys.executable)") && \
+ {{ extra_deps }} {% if extra_deps %} && {% endif %} \
/opendevin/miniforge3/bin/mamba run -n base poetry cache clear --all . && \
+ {% if not skip_init %}chmod -R g+rws /opendevin/poetry && {% endif %} \
apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
/opendevin/miniforge3/bin/mamba clean --all
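
Since the template no longer takes `source_code_dirname` and instead receives `skip_init` and `extra_deps`, it can be rendered directly for inspection. A sketch, with the template directory inferred from the file path in this diff and illustrative parameter values:

```python
import os

from jinja2 import Environment, FileSystemLoader

import opendevin

# Render the runtime Dockerfile template the same way _generate_dockerfile does.
templates_dir = os.path.join(
    os.path.dirname(opendevin.__file__), 'runtime', 'utils', 'runtime_templates'
)
env = Environment(loader=FileSystemLoader(searchpath=templates_dir))
dockerfile = env.get_template('Dockerfile.j2').render(
    base_image='nikolaik/python-nodejs:python3.11-nodejs22',  # illustrative
    skip_init=False,
    extra_deps='',
)
print(dockerfile[:300])  # inspect the generated Dockerfile head
```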
diff --git a/opendevin/server/README.md b/opendevin/server/README.md
index fa4d6249e662..991f0fe7c03b 100644
--- a/opendevin/server/README.md
+++ b/opendevin/server/README.md
@@ -2,9 +2,23 @@
This is a WebSocket server that executes tasks using an agent.
+## Recommended Prerequisites
+
+- [Initialize the frontend code](../../frontend/README.md)
+- Install Python 3.12 (`brew install python` if you use Homebrew)
+- Install pipx (`brew install pipx`, then `pipx ensurepath`)
+- Install Poetry (`pipx install poetry`)
+
## Install
-Follow the instructions in the base README.md to install dependencies and set up.
+First, build a distribution of the frontend code (from the project root directory):
+```
+cd frontend
+npm install
+npm run build
+cd ..
+```
+Next, run `poetry shell` so you don't have to prefix every command with `poetry run`.
## Start the Server
diff --git a/opendevin/server/listen.py b/opendevin/server/listen.py
index 3bbba95cd1f9..4453047702ba 100644
--- a/opendevin/server/listen.py
+++ b/opendevin/server/listen.py
@@ -1,11 +1,10 @@
import os
import re
+import tempfile
import uuid
import warnings
import requests
-from pathspec import PathSpec
-from pathspec.patterns import GitWildMatchPattern
from opendevin.server.data_models.feedback import FeedbackDataModel, store_feedback
from opendevin.storage import get_file_store
@@ -33,13 +32,22 @@
from opendevin.core.config import LLMConfig, load_app_config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.schema import AgentState # Add this import
-from opendevin.events.action import ChangeAgentStateAction, NullAction
+from opendevin.events.action import (
+ ChangeAgentStateAction,
+ FileReadAction,
+ FileWriteAction,
+ NullAction,
+)
from opendevin.events.observation import (
AgentStateChangedObservation,
+ ErrorObservation,
+ FileReadObservation,
+ FileWriteObservation,
NullObservation,
)
from opendevin.events.serialization import event_to_dict
from opendevin.llm import bedrock
+from opendevin.runtime.runtime import Runtime
from opendevin.server.auth import get_sid_from_token, sign_token
from opendevin.server.session import SessionManager
@@ -207,7 +215,7 @@ async def websocket_endpoint(websocket: WebSocket):
```
- Send a message:
```json
- {"action": "message", "args": {"content": "Hello, how are you?"}}
+ {"action": "message", "args": {"content": "Hello, how are you?", "images_urls": ["base64_url1", "base64_url2"]}}
```
- Write contents to a file:
```json
@@ -355,7 +363,7 @@ async def get_agents():
@app.get('/api/list-files')
-def list_files(request: Request, path: str = '/'):
+async def list_files(request: Request, path: str | None = None):
"""List files in the specified path.
This function retrieves a list of files from the agent's runtime file store,
@@ -368,7 +376,7 @@ def list_files(request: Request, path: str = '/'):
Args:
request (Request): The incoming request object.
- path (str, optional): The path to list files from. Defaults to '/'.
+ path (str, optional): The path to list files from. Defaults to None.
Returns:
list: A list of file names in the specified path.
@@ -381,90 +389,13 @@ def list_files(request: Request, path: str = '/'):
status_code=status.HTTP_404_NOT_FOUND,
content={'error': 'Runtime not yet initialized'},
)
-
- try:
- # Get the full path of the requested directory
- full_path = (
- request.state.session.agent_session.runtime.file_store.get_full_path(path)
- )
-
- # Check if the directory exists
- if not os.path.exists(full_path) or not os.path.isdir(full_path):
- return []
-
- # Check if .gitignore exists
- gitignore_path = os.path.join(full_path, '.gitignore')
- if os.path.exists(gitignore_path):
- # Use PathSpec to parse .gitignore
- with open(gitignore_path, 'r') as f:
- spec = PathSpec.from_lines(GitWildMatchPattern, f.readlines())
- else:
- # Fallback to default exclude list if .gitignore doesn't exist
- default_exclude = [
- '.git',
- '.DS_Store',
- '.svn',
- '.hg',
- '.idea',
- '.vscode',
- '.settings',
- '.pytest_cache',
- '__pycache__',
- 'node_modules',
- 'vendor',
- 'build',
- 'dist',
- 'bin',
- 'logs',
- 'log',
- 'tmp',
- 'temp',
- 'coverage',
- 'venv',
- 'env',
- ]
- spec = PathSpec.from_lines(GitWildMatchPattern, default_exclude)
-
- entries = request.state.session.agent_session.runtime.file_store.list(path)
-
- # Filter entries using PathSpec
- filtered_entries = [
- entry
- for entry in entries
- if not spec.match_file(os.path.relpath(entry, str(full_path)))
- ]
-
- # Separate directories and files
- directories = []
- files = []
- for entry in filtered_entries:
- # Remove leading slash and any parent directory components
- entry_relative = entry.lstrip('/').split('/')[-1]
-
- # Construct the full path by joining the base path with the relative entry path
- full_entry_path = os.path.join(full_path, entry_relative)
- if os.path.exists(full_entry_path):
- is_dir = os.path.isdir(full_entry_path)
- if is_dir:
- directories.append(entry)
- else:
- files.append(entry)
-
- # Sort directories and files separately
- directories.sort(key=lambda s: s.lower())
- files.sort(key=lambda s: s.lower())
-
- # Combine sorted directories and files
- sorted_entries = directories + files
- return sorted_entries
-
- except Exception as e:
- logger.error(f'Error listing files: {e}', exc_info=True)
- return []
+ runtime: Runtime = request.state.session.agent_session.runtime
+ file_list = await runtime.list_files(path)
+ return file_list
@app.get('/api/select-file')
-def select_file(file: str, request: Request):
+async def select_file(file: str, request: Request):
"""Retrieve the content of a specified file.
To select a file:
@@ -474,6 +405,7 @@ def select_file(file: str, request: Request):
Args:
file (str): The path of the file to be retrieved.
+ The path must be absolute inside the runtime.
request (Request): The incoming request object.
Returns:
@@ -482,16 +414,27 @@ def select_file(file: str, request: Request):
Raises:
HTTPException: If there's an error opening the file.
"""
- try:
- content = request.state.session.agent_session.runtime.file_store.read(file)
- except Exception as e:
- logger.error(f'Error opening file {file}: {e}', exc_info=False)
- error_msg = f'Error opening file: {e}'
+ runtime: Runtime = request.state.session.agent_session.runtime
+
+ # the file path must be absolute inside the runtime
+ if not os.path.isabs(file):
+ return JSONResponse(
+ status_code=status.HTTP_400_BAD_REQUEST,
+ content={'error': 'File path must be absolute'},
+ )
+
+ read_action = FileReadAction(file)
+ observation = await runtime.run_action(read_action)
+
+ if isinstance(observation, FileReadObservation):
+ content = observation.content
+ return {'code': content}
+ elif isinstance(observation, ErrorObservation):
+ logger.error(f'Error opening file {file}: {observation}', exc_info=False)
return JSONResponse(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- content={'error': error_msg},
+ content={'error': f'Error opening file: {observation}'},
)
- return {'code': content}
def sanitize_filename(filename):
@@ -552,9 +495,17 @@ async def upload_file(request: Request, files: list[UploadFile]):
)
continue
- request.state.session.agent_session.runtime.file_store.write(
- safe_filename, file_contents
- )
+ # copy the file to the runtime
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ tmp_file_path = os.path.join(tmp_dir, safe_filename)
+ with open(tmp_file_path, 'wb') as tmp_file:
+ tmp_file.write(file_contents)
+ tmp_file.flush()
+
+ runtime: Runtime = request.state.session.agent_session.runtime
+ await runtime.copy_to(
+ tmp_file_path, runtime.config.workspace_mount_path_in_sandbox
+ )
uploaded_files.append(safe_filename)
response_content = {
@@ -709,13 +660,32 @@ async def save_file(request: Request):
if not file_path or content is None:
raise HTTPException(status_code=400, detail='Missing filePath or content')
+ # Make sure file_path is absolute
+ if not os.path.isabs(file_path):
+ return JSONResponse(
+ status_code=status.HTTP_400_BAD_REQUEST,
+ content={'error': 'File path must be absolute'},
+ )
+
# Save the file to the agent's runtime file store
- request.state.session.agent_session.runtime.file_store.write(file_path, content)
+ runtime: Runtime = request.state.session.agent_session.runtime
+ write_action = FileWriteAction(file_path, content)
+ observation = await runtime.run_action(write_action)
- # Return a success response
- return JSONResponse(
- status_code=200, content={'message': 'File saved successfully'}
- )
+ if isinstance(observation, FileWriteObservation):
+ return JSONResponse(
+ status_code=200, content={'message': 'File saved successfully'}
+ )
+ elif isinstance(observation, ErrorObservation):
+ return JSONResponse(
+ status_code=500,
+ content={'error': f'Failed to save file: {observation}'},
+ )
+ else:
+ return JSONResponse(
+ status_code=500,
+ content={'error': f'Unexpected observation: {observation}'},
+ )
except Exception as e:
# Log the error and return a 500 response
logger.error(f'Error saving file: {e}', exc_info=True)
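
The file endpoints above now route through the runtime's own action execution instead of a local file store, and they require absolute paths inside the sandbox. A rough client-side sketch follows; the host/port and the auth header are assumptions, while the routes, the `file` query parameter, and the absolute-path rule come from this diff:

```python
import requests

BASE = 'http://localhost:3000'  # assumed host/port
HEADERS = {'Authorization': 'Bearer <session-token>'}  # hypothetical auth scheme

# List files in the sandbox's working directory.
files = requests.get(f'{BASE}/api/list-files', headers=HEADERS).json()
print(files)

# Fetch a single file; the path must be absolute inside the runtime.
resp = requests.get(
    f'{BASE}/api/select-file',
    params={'file': '/workspace/app.py'},
    headers=HEADERS,
)
print(resp.json().get('code'))  # file contents on success, {'error': ...} otherwise
```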
diff --git a/opendevin/server/session/agent.py b/opendevin/server/session/agent.py
index 616a99501bff..bbe5d5e98a4f 100644
--- a/opendevin/server/session/agent.py
+++ b/opendevin/server/session/agent.py
@@ -1,15 +1,13 @@
from typing import Optional
-from agenthub.codeact_agent.codeact_agent import CodeActAgent
from opendevin.controller import AgentController
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import AppConfig, LLMConfig
from opendevin.core.logger import opendevin_logger as logger
from opendevin.events.stream import EventStream
-from opendevin.runtime import DockerSSHBox, get_runtime_cls
+from opendevin.runtime import get_runtime_cls
from opendevin.runtime.runtime import Runtime
-from opendevin.runtime.server.runtime import ServerRuntime
from opendevin.storage.files import FileStore
@@ -22,6 +20,7 @@ class AgentSession:
sid: str
event_stream: EventStream
+ file_store: FileStore
controller: Optional[AgentController] = None
runtime: Optional[Runtime] = None
_closed: bool = False
@@ -51,7 +50,7 @@ async def start(
raise Exception(
'Session already started. You need to close this session and start a new one.'
)
- await self._create_runtime(runtime_name, config)
+ await self._create_runtime(runtime_name, config, agent)
await self._create_controller(
agent,
confirmation_mode,
@@ -71,7 +70,7 @@ async def close(self):
await self.runtime.close()
self._closed = True
- async def _create_runtime(self, runtime_name: str, config: AppConfig):
+ async def _create_runtime(self, runtime_name: str, config: AppConfig, agent: Agent):
"""Creates a runtime instance."""
if self.runtime is not None:
raise Exception('Runtime already created')
@@ -79,7 +78,10 @@ async def _create_runtime(self, runtime_name: str, config: AppConfig):
logger.info(f'Using runtime: {runtime_name}')
runtime_cls = get_runtime_cls(runtime_name)
self.runtime = runtime_cls(
- config=config, event_stream=self.event_stream, sid=self.sid
+ config=config,
+ event_stream=self.event_stream,
+ sid=self.sid,
+ plugins=agent.sandbox_plugins,
)
await self.runtime.ainit()
@@ -98,17 +100,6 @@ async def _create_controller(
raise Exception('Runtime must be initialized before the agent controller')
logger.info(f'Creating agent {agent.name} using LLM {agent.llm.config.model}')
- if isinstance(agent, CodeActAgent):
- if not self.runtime or not (
- isinstance(self.runtime, ServerRuntime)
- and isinstance(self.runtime.sandbox, DockerSSHBox)
- ):
- logger.warning(
- 'CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful'
- ' LocalBox will not work properly.'
- )
- self.runtime.init_sandbox_plugins(agent.sandbox_plugins)
- self.runtime.init_runtime_tools(agent.runtime_tools)
self.controller = AgentController(
sid=self.sid,
diff --git a/opendevin/server/session/session.py b/opendevin/server/session/session.py
index d8434bffc7c8..7d0b850d6f89 100644
--- a/opendevin/server/session/session.py
+++ b/opendevin/server/session/session.py
@@ -10,13 +10,14 @@
from opendevin.core.schema import AgentState
from opendevin.core.schema.action import ActionType
from opendevin.core.schema.config import ConfigType
-from opendevin.events.action import Action, ChangeAgentStateAction, NullAction
+from opendevin.events.action import ChangeAgentStateAction, MessageAction, NullAction
from opendevin.events.event import Event, EventSource
from opendevin.events.observation import (
AgentStateChangedObservation,
CmdOutputObservation,
NullObservation,
)
+from opendevin.events.observation.browse import BrowserOutputObservation
from opendevin.events.serialization import event_from_dict, event_to_dict
from opendevin.events.stream import EventStreamSubscriber
from opendevin.llm.llm import LLM
@@ -135,7 +136,7 @@ async def on_event(self, event: Event):
if event.source == EventSource.AGENT:
await self.send(event_to_dict(event))
elif event.source == EventSource.USER and isinstance(
- event, CmdOutputObservation
+ event, (CmdOutputObservation, BrowserOutputObservation)
):
await self.send(event_to_dict(event))
@@ -145,11 +146,15 @@ async def dispatch(self, data: dict):
await self._initialize_agent(data)
return
event = event_from_dict(data.copy())
+ # This checks if the model supports images
+ if isinstance(event, MessageAction) and event.images_urls:
+ controller = self.agent_session.controller
+ if controller and not controller.agent.llm.supports_vision():
+ await self.send_error(
+ 'Model does not support image upload; change to a different model or try without an image.'
+ )
+ return
self.agent_session.event_stream.add_event(event, EventSource.USER)
- if isinstance(event, Action):
- logger.info(
- event, extra={'msg_type': 'ACTION', 'event_source': EventSource.USER}
- )
async def send(self, data: dict[str, object]) -> bool:
try:
diff --git a/poetry.lock b/poetry.lock
index 19b60549653b..65a38908108f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -398,6 +398,17 @@ files = [
{file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
]
+[[package]]
+name = "bashlex"
+version = "0.18"
+description = "Python parser for bash"
+optional = false
+python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4"
+files = [
+ {file = "bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa"},
+ {file = "bashlex-0.18.tar.gz", hash = "sha256:5bb03a01c6d5676338c36fd1028009c8ad07e7d61d8a1ce3f513b7fff52796ee"},
+]
+
[[package]]
name = "bcrypt"
version = "4.1.3"
@@ -508,17 +519,17 @@ files = [
[[package]]
name = "boto3"
-version = "1.34.149"
+version = "1.34.157"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "boto3-1.34.149-py3-none-any.whl", hash = "sha256:11edeeacdd517bda3b7615b754d8440820cdc9ddd66794cc995a9693ddeaa3be"},
- {file = "boto3-1.34.149.tar.gz", hash = "sha256:f4e6489ba9dc7fb37d53e0e82dbc97f2cb0a4969ef3970e2c88b8f94023ae81a"},
+ {file = "boto3-1.34.157-py3-none-any.whl", hash = "sha256:3cc357156df5482154a016f138d1953061a181b4c594f8b6302c9d6c024bd950"},
+ {file = "boto3-1.34.157.tar.gz", hash = "sha256:7ef19ed38cba9863b58430fb4a66a72a5c250304f234bd1c16b860f9bf25677b"},
]
[package.dependencies]
-botocore = ">=1.34.149,<1.35.0"
+botocore = ">=1.34.157,<1.35.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.10.0,<0.11.0"
@@ -527,13 +538,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
-version = "1.34.149"
+version = "1.34.157"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.8"
files = [
- {file = "botocore-1.34.149-py3-none-any.whl", hash = "sha256:ae6c4be52eeee96f68c116b27d252bab069cd046d61a17cfe8e9da411cf22906"},
- {file = "botocore-1.34.149.tar.gz", hash = "sha256:2e1eb5ef40102a3d796bb3dd05f2ac5e8fb43fe1ff114b4f6d33153437f5a372"},
+ {file = "botocore-1.34.157-py3-none-any.whl", hash = "sha256:c6cba6de8eb86ca4d2f934e009b37adbe1e7fdcfa52fbab74783f4c30676e07d"},
+ {file = "botocore-1.34.157.tar.gz", hash = "sha256:5628a36cec123cdc8c1158d05a7b06aa5e53649ad73796c50ef3fb51199785fb"},
]
[package.dependencies]
@@ -542,7 +553,7 @@ python-dateutil = ">=2.1,<3.0.0"
urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}
[package.extras]
-crt = ["awscrt (==0.20.11)"]
+crt = ["awscrt (==0.21.2)"]
[[package]]
name = "browsergym"
@@ -1367,6 +1378,20 @@ files = [
graph = ["objgraph (>=1.7.2)"]
profile = ["gprof2dot (>=2022.7.29)"]
+[[package]]
+name = "dirhash"
+version = "0.5.0"
+description = "Python module and CLI for hashing of file system directories."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "dirhash-0.5.0-py3-none-any.whl", hash = "sha256:523dfd6b058c64f45b31604376926c6e2bd2ea301d0df23095d4055674e38b09"},
+ {file = "dirhash-0.5.0.tar.gz", hash = "sha256:e60760f0ab2e935d8cb088923ea2c6492398dca42cec785df778985fd4cd5386"},
+]
+
+[package.dependencies]
+scantree = ">=0.0.4"
+
[[package]]
name = "dirtyjson"
version = "1.0.8"
@@ -1400,26 +1425,6 @@ files = [
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
-[[package]]
-name = "dnspython"
-version = "2.6.1"
-description = "DNS toolkit"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "dnspython-2.6.1-py3-none-any.whl", hash = "sha256:5ef3b9680161f6fa89daf8ad451b5f1a33b18ae8a1c6778cdf4b43f08c0a6e50"},
- {file = "dnspython-2.6.1.tar.gz", hash = "sha256:e8f0f9c23a7b7cb99ded64e6c3a6f3e701d78f50c55e002b839dea7225cff7cc"},
-]
-
-[package.extras]
-dev = ["black (>=23.1.0)", "coverage (>=7.0)", "flake8 (>=7)", "mypy (>=1.8)", "pylint (>=3)", "pytest (>=7.4)", "pytest-cov (>=4.1.0)", "sphinx (>=7.2.0)", "twine (>=4.0.0)", "wheel (>=0.42.0)"]
-dnssec = ["cryptography (>=41)"]
-doh = ["h2 (>=4.1.0)", "httpcore (>=1.0.0)", "httpx (>=0.26.0)"]
-doq = ["aioquic (>=0.9.25)"]
-idna = ["idna (>=3.6)"]
-trio = ["trio (>=0.23)"]
-wmi = ["wmi (>=1.5.1)"]
-
[[package]]
name = "docker"
version = "7.1.0"
@@ -1475,21 +1480,6 @@ typing-extensions = ">=4.8.0"
urllib3 = ">=1.25.3"
websockets = ">=11.0.3"
-[[package]]
-name = "email-validator"
-version = "2.2.0"
-description = "A robust email address syntax and deliverability validation library."
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631"},
- {file = "email_validator-2.2.0.tar.gz", hash = "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7"},
-]
-
-[package.dependencies]
-dnspython = ">=2.0.0"
-idna = ">=2.0.0"
-
[[package]]
name = "english-words"
version = "2.0.1"
@@ -1576,45 +1566,23 @@ files = [
[[package]]
name = "fastapi"
-version = "0.111.1"
+version = "0.112.0"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = false
python-versions = ">=3.8"
files = [
- {file = "fastapi-0.111.1-py3-none-any.whl", hash = "sha256:4f51cfa25d72f9fbc3280832e84b32494cf186f50158d364a8765aabf22587bf"},
- {file = "fastapi-0.111.1.tar.gz", hash = "sha256:ddd1ac34cb1f76c2e2d7f8545a4bcb5463bce4834e81abf0b189e0c359ab2413"},
+ {file = "fastapi-0.112.0-py3-none-any.whl", hash = "sha256:3487ded9778006a45834b8c816ec4a48d522e2631ca9e75ec5a774f1b052f821"},
+ {file = "fastapi-0.112.0.tar.gz", hash = "sha256:d262bc56b7d101d1f4e8fc0ad2ac75bb9935fec504d2b7117686cec50710cf05"},
]
[package.dependencies]
-email_validator = ">=2.0.0"
-fastapi-cli = ">=0.0.2"
-httpx = ">=0.23.0"
-jinja2 = ">=2.11.2"
pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
-python-multipart = ">=0.0.7"
starlette = ">=0.37.2,<0.38.0"
typing-extensions = ">=4.8.0"
-uvicorn = {version = ">=0.12.0", extras = ["standard"]}
-
-[package.extras]
-all = ["email_validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
-
-[[package]]
-name = "fastapi-cli"
-version = "0.0.4"
-description = "Run and manage FastAPI apps from the command line with FastAPI CLI. ๐"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "fastapi_cli-0.0.4-py3-none-any.whl", hash = "sha256:a2552f3a7ae64058cdbb530be6fa6dbfc975dc165e4fa66d224c3d396e25e809"},
- {file = "fastapi_cli-0.0.4.tar.gz", hash = "sha256:e2e9ffaffc1f7767f488d6da34b6f5a377751c996f397902eb6abb99a67bde32"},
-]
-
-[package.dependencies]
-typer = ">=0.12.3"
[package.extras]
-standard = ["fastapi", "uvicorn[standard] (>=0.15.0)"]
+all = ["email_validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
+standard = ["email_validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"]
[[package]]
name = "fastcore"
@@ -1665,13 +1633,13 @@ typing = ["typing-extensions (>=4.8)"]
[[package]]
name = "flake8"
-version = "7.1.0"
+version = "7.1.1"
description = "the modular source code checker: pep8 pyflakes and co"
optional = false
python-versions = ">=3.8.1"
files = [
- {file = "flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a"},
- {file = "flake8-7.1.0.tar.gz", hash = "sha256:48a07b626b55236e0fb4784ee69a465fbf59d79eec1f5b4785c3d3bc57d17aa5"},
+ {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"},
+ {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"},
]
[package.dependencies]
@@ -1915,6 +1883,36 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
tqdm = ["tqdm"]
+[[package]]
+name = "func-timeout"
+version = "4.3.5"
+description = "Python module which allows you to specify timeouts when calling any existing function. Also provides support for stoppable-threads"
+optional = false
+python-versions = "*"
+files = [
+ {file = "func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd"},
+]
+
+[[package]]
+name = "gdown"
+version = "5.2.0"
+description = "Google Drive Public File/Folder Downloader"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "gdown-5.2.0-py3-none-any.whl", hash = "sha256:33083832d82b1101bdd0e9df3edd0fbc0e1c5f14c9d8c38d2a35bf1683b526d6"},
+ {file = "gdown-5.2.0.tar.gz", hash = "sha256:2145165062d85520a3cd98b356c9ed522c5e7984d408535409fd46f94defc787"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+filelock = "*"
+requests = {version = "*", extras = ["socks"]}
+tqdm = "*"
+
+[package.extras]
+test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-requests", "types-setuptools"]
+
[[package]]
name = "gevent"
version = "24.2.1"
@@ -2129,13 +2127,13 @@ httplib2 = ">=0.19.0"
[[package]]
name = "google-cloud-aiplatform"
-version = "1.60.0"
+version = "1.61.0"
description = "Vertex AI API client library"
optional = false
python-versions = ">=3.8"
files = [
- {file = "google-cloud-aiplatform-1.60.0.tar.gz", hash = "sha256:782c7f1ec0e77a7c7daabef3b65bfd506ed2b4b1dc2186753c43cd6faf8dd04e"},
- {file = "google_cloud_aiplatform-1.60.0-py2.py3-none-any.whl", hash = "sha256:5f14159c9575f4b46335027e3ceb8fa57bd5eaa76a07f858105b8c6c034ec0d6"},
+ {file = "google-cloud-aiplatform-1.61.0.tar.gz", hash = "sha256:648e3cd7bb75be706d3c31d852a3d4d8a2e616ad4db4cf520ef4430615cf8ad9"},
+ {file = "google_cloud_aiplatform-1.61.0-py2.py3-none-any.whl", hash = "sha256:57b36d5fa085e68197e9fc576c43263a7cad320483aa3b166bcd1fdc7e8f49e7"},
]
[package.dependencies]
@@ -2147,7 +2145,7 @@ google-cloud-resource-manager = ">=1.3.3,<3.0.0dev"
google-cloud-storage = ">=1.32.0,<3.0.0dev"
packaging = ">=14.3"
proto-plus = ">=1.22.3,<2.0.0dev"
-protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev"
+protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev"
pydantic = "<3"
shapely = "<3.0.0dev"
@@ -2480,13 +2478,13 @@ test = ["objgraph", "psutil"]
[[package]]
name = "grep-ast"
-version = "0.3.2"
+version = "0.3.3"
description = "A tool to grep through the AST of a source file"
optional = false
python-versions = "*"
files = [
- {file = "grep_ast-0.3.2-py3-none-any.whl", hash = "sha256:b7ceb84743983c3f4f5bca82f3374534cd9dbd759792d0dedf5648fedbb6f3fc"},
- {file = "grep_ast-0.3.2.tar.gz", hash = "sha256:d53bc7d25dfefafe77643fec189ab38e3cbd839d546c070a950ebedad82ee164"},
+ {file = "grep_ast-0.3.3-py3-none-any.whl", hash = "sha256:515cb889bffefefa26c4ab1377b9a75b3fc678aa5fa02bf9aa4f8f20999a83ad"},
+ {file = "grep_ast-0.3.3.tar.gz", hash = "sha256:42b8887d57301dc55634368f8d549e9c49c913dafb4d19c9b54c3ddb604fccf4"},
]
[package.dependencies]
@@ -2992,6 +2990,76 @@ MarkupSafe = ">=2.0"
[package.extras]
i18n = ["Babel (>=2.7)"]
+[[package]]
+name = "jiter"
+version = "0.5.0"
+description = "Fast iterable JSON parser."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "jiter-0.5.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b599f4e89b3def9a94091e6ee52e1d7ad7bc33e238ebb9c4c63f211d74822c3f"},
+ {file = "jiter-0.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a063f71c4b06225543dddadbe09d203dc0c95ba352d8b85f1221173480a71d5"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acc0d5b8b3dd12e91dd184b87273f864b363dfabc90ef29a1092d269f18c7e28"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c22541f0b672f4d741382a97c65609332a783501551445ab2df137ada01e019e"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63314832e302cc10d8dfbda0333a384bf4bcfce80d65fe99b0f3c0da8945a91a"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a25fbd8a5a58061e433d6fae6d5298777c0814a8bcefa1e5ecfff20c594bd749"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:503b2c27d87dfff5ab717a8200fbbcf4714516c9d85558048b1fc14d2de7d8dc"},
+ {file = "jiter-0.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d1f3d27cce923713933a844872d213d244e09b53ec99b7a7fdf73d543529d6d"},
+ {file = "jiter-0.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c95980207b3998f2c3b3098f357994d3fd7661121f30669ca7cb945f09510a87"},
+ {file = "jiter-0.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:afa66939d834b0ce063f57d9895e8036ffc41c4bd90e4a99631e5f261d9b518e"},
+ {file = "jiter-0.5.0-cp310-none-win32.whl", hash = "sha256:f16ca8f10e62f25fd81d5310e852df6649af17824146ca74647a018424ddeccf"},
+ {file = "jiter-0.5.0-cp310-none-win_amd64.whl", hash = "sha256:b2950e4798e82dd9176935ef6a55cf6a448b5c71515a556da3f6b811a7844f1e"},
+ {file = "jiter-0.5.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d4c8e1ed0ef31ad29cae5ea16b9e41529eb50a7fba70600008e9f8de6376d553"},
+ {file = "jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c6f16e21276074a12d8421692515b3fd6d2ea9c94fd0734c39a12960a20e85f3"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5280e68e7740c8c128d3ae5ab63335ce6d1fb6603d3b809637b11713487af9e6"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:583c57fc30cc1fec360e66323aadd7fc3edeec01289bfafc35d3b9dcb29495e4"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:26351cc14507bdf466b5f99aba3df3143a59da75799bf64a53a3ad3155ecded9"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4829df14d656b3fb87e50ae8b48253a8851c707da9f30d45aacab2aa2ba2d614"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a42a4bdcf7307b86cb863b2fb9bb55029b422d8f86276a50487982d99eed7c6e"},
+ {file = "jiter-0.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04d461ad0aebf696f8da13c99bc1b3e06f66ecf6cfd56254cc402f6385231c06"},
+ {file = "jiter-0.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e6375923c5f19888c9226582a124b77b622f8fd0018b843c45eeb19d9701c403"},
+ {file = "jiter-0.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2cec323a853c24fd0472517113768c92ae0be8f8c384ef4441d3632da8baa646"},
+ {file = "jiter-0.5.0-cp311-none-win32.whl", hash = "sha256:aa1db0967130b5cab63dfe4d6ff547c88b2a394c3410db64744d491df7f069bb"},
+ {file = "jiter-0.5.0-cp311-none-win_amd64.whl", hash = "sha256:aa9d2b85b2ed7dc7697597dcfaac66e63c1b3028652f751c81c65a9f220899ae"},
+ {file = "jiter-0.5.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9f664e7351604f91dcdd557603c57fc0d551bc65cc0a732fdacbf73ad335049a"},
+ {file = "jiter-0.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:044f2f1148b5248ad2c8c3afb43430dccf676c5a5834d2f5089a4e6c5bbd64df"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:702e3520384c88b6e270c55c772d4bd6d7b150608dcc94dea87ceba1b6391248"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:528d742dcde73fad9d63e8242c036ab4a84389a56e04efd854062b660f559544"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8cf80e5fe6ab582c82f0c3331df27a7e1565e2dcf06265afd5173d809cdbf9ba"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:44dfc9ddfb9b51a5626568ef4e55ada462b7328996294fe4d36de02fce42721f"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c451f7922992751a936b96c5f5b9bb9312243d9b754c34b33d0cb72c84669f4e"},
+ {file = "jiter-0.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:308fce789a2f093dca1ff91ac391f11a9f99c35369117ad5a5c6c4903e1b3e3a"},
+ {file = "jiter-0.5.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7f5ad4a7c6b0d90776fdefa294f662e8a86871e601309643de30bf94bb93a64e"},
+ {file = "jiter-0.5.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ea189db75f8eca08807d02ae27929e890c7d47599ce3d0a6a5d41f2419ecf338"},
+ {file = "jiter-0.5.0-cp312-none-win32.whl", hash = "sha256:e3bbe3910c724b877846186c25fe3c802e105a2c1fc2b57d6688b9f8772026e4"},
+ {file = "jiter-0.5.0-cp312-none-win_amd64.whl", hash = "sha256:a586832f70c3f1481732919215f36d41c59ca080fa27a65cf23d9490e75b2ef5"},
+ {file = "jiter-0.5.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f04bc2fc50dc77be9d10f73fcc4e39346402ffe21726ff41028f36e179b587e6"},
+ {file = "jiter-0.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f433a4169ad22fcb550b11179bb2b4fd405de9b982601914ef448390b2954f3"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad4a6398c85d3a20067e6c69890ca01f68659da94d74c800298581724e426c7e"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6baa88334e7af3f4d7a5c66c3a63808e5efbc3698a1c57626541ddd22f8e4fbf"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ece0a115c05efca597c6d938f88c9357c843f8c245dbbb53361a1c01afd7148"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:335942557162ad372cc367ffaf93217117401bf930483b4b3ebdb1223dbddfa7"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:649b0ee97a6e6da174bffcb3c8c051a5935d7d4f2f52ea1583b5b3e7822fbf14"},
+ {file = "jiter-0.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4be354c5de82157886ca7f5925dbda369b77344b4b4adf2723079715f823989"},
+ {file = "jiter-0.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5206144578831a6de278a38896864ded4ed96af66e1e63ec5dd7f4a1fce38a3a"},
+ {file = "jiter-0.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8120c60f8121ac3d6f072b97ef0e71770cc72b3c23084c72c4189428b1b1d3b6"},
+ {file = "jiter-0.5.0-cp38-none-win32.whl", hash = "sha256:6f1223f88b6d76b519cb033a4d3687ca157c272ec5d6015c322fc5b3074d8a5e"},
+ {file = "jiter-0.5.0-cp38-none-win_amd64.whl", hash = "sha256:c59614b225d9f434ea8fc0d0bec51ef5fa8c83679afedc0433905994fb36d631"},
+ {file = "jiter-0.5.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:0af3838cfb7e6afee3f00dc66fa24695199e20ba87df26e942820345b0afc566"},
+ {file = "jiter-0.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:550b11d669600dbc342364fd4adbe987f14d0bbedaf06feb1b983383dcc4b961"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:489875bf1a0ffb3cb38a727b01e6673f0f2e395b2aad3c9387f94187cb214bbf"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b250ca2594f5599ca82ba7e68785a669b352156260c5362ea1b4e04a0f3e2389"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ea18e01f785c6667ca15407cd6dabbe029d77474d53595a189bdc813347218e"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:462a52be85b53cd9bffd94e2d788a09984274fe6cebb893d6287e1c296d50653"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92cc68b48d50fa472c79c93965e19bd48f40f207cb557a8346daa020d6ba973b"},
+ {file = "jiter-0.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1c834133e59a8521bc87ebcad773608c6fa6ab5c7a022df24a45030826cf10bc"},
+ {file = "jiter-0.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ab3a71ff31cf2d45cb216dc37af522d335211f3a972d2fe14ea99073de6cb104"},
+ {file = "jiter-0.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cccd3af9c48ac500c95e1bcbc498020c87e1781ff0345dd371462d67b76643eb"},
+ {file = "jiter-0.5.0-cp39-none-win32.whl", hash = "sha256:368084d8d5c4fc40ff7c3cc513c4f73e02c85f6009217922d0823a48ee7adf61"},
+ {file = "jiter-0.5.0-cp39-none-win_amd64.whl", hash = "sha256:ce03f7b4129eb72f1687fa11300fbf677b02990618428934662406d2a76742a1"},
+ {file = "jiter-0.5.0.tar.gz", hash = "sha256:1d916ba875bcab5c5f7d927df998c4cb694d27dceddf3392e58beaf10563368a"},
+]
+
[[package]]
name = "jmespath"
version = "1.0.1"
@@ -3016,13 +3084,13 @@ files = [
[[package]]
name = "json-repair"
-version = "0.25.3"
+version = "0.27.0"
description = "A package to repair broken json strings"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "json_repair-0.25.3-py3-none-any.whl", hash = "sha256:f00b510dd21b31ebe72581bdb07e66381df2883d6f640c89605e482882c12b17"},
- {file = "json_repair-0.25.3.tar.gz", hash = "sha256:4ee970581a05b0b258b749eb8bcac21de380edda97c3717a4edfafc519ec21a4"},
+ {file = "json_repair-0.27.0-py3-none-any.whl", hash = "sha256:20763c8cf1c3096e33ce7c09c2b8e6c471a4acdce468688c96052fc7cccbad7f"},
+ {file = "json_repair-0.27.0.tar.gz", hash = "sha256:f4e14c5ad2b3f17290a361c3c90915536b462c36f69989e915867e81663dd467"},
]
[[package]]
@@ -3498,13 +3566,13 @@ types-tqdm = "*"
[[package]]
name = "litellm"
-version = "1.42.3"
+version = "1.43.4"
description = "Library to easily interface with LLM API providers"
optional = false
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
files = [
- {file = "litellm-1.42.3-py3-none-any.whl", hash = "sha256:08060c35dd2261a7955a9da30aca294f921adad4074157876be9a1e901bd107c"},
- {file = "litellm-1.42.3.tar.gz", hash = "sha256:69b4a4c748d9dfd21016f7738b8ad31768271c8c37491d7a1a3c3ffae0e47ac2"},
+ {file = "litellm-1.43.4-py3-none-any.whl", hash = "sha256:619cfaab189f921f66ff50c2b7a0e965e562c2a95b17c2ee24649826ba35da11"},
+ {file = "litellm-1.43.4.tar.gz", hash = "sha256:949c51ad494b935d80da1cd18c3567e4ed181f8eb531ef4706e3be72afb0c43c"},
]
[package.dependencies]
@@ -3513,7 +3581,7 @@ click = "*"
importlib-metadata = ">=6.8.0"
jinja2 = ">=3.1.2,<4.0.0"
jsonschema = ">=4.22.0,<5.0.0"
-openai = ">=1.27.0"
+openai = ">=1.40.0"
pydantic = ">=2.0.0,<3.0.0"
python-dotenv = ">=0.2.0"
requests = ">=2.31.0,<3.0.0"
@@ -3666,13 +3734,13 @@ sentence-transformers = ">=2.6.1"
[[package]]
name = "llama-index-embeddings-ollama"
-version = "0.1.2"
+version = "0.1.3"
description = "llama-index embeddings ollama integration"
optional = false
-python-versions = ">=3.8.1,<4.0"
+python-versions = "<4.0,>=3.8.1"
files = [
- {file = "llama_index_embeddings_ollama-0.1.2-py3-none-any.whl", hash = "sha256:ac7afabfa1134059af351b021e05e256bf86dd15e5176ffa5ab0305bcf03b33f"},
- {file = "llama_index_embeddings_ollama-0.1.2.tar.gz", hash = "sha256:a9e0809bddd2e4ad888f249519edc7e3d339c74e4e03fc5a40c3060dc41d47a9"},
+ {file = "llama_index_embeddings_ollama-0.1.3-py3-none-any.whl", hash = "sha256:b960a8c744e2e56ce1fd75a34753614fed3ad81558570ae9958b90b9062afb6a"},
+ {file = "llama_index_embeddings_ollama-0.1.3.tar.gz", hash = "sha256:4bd1dd3230c9be04cfa45b28c3a8066e46c1654d4360fcbecdc1718ac9013eca"},
]
[package.dependencies]
@@ -4571,38 +4639,38 @@ dill = ">=0.3.8"
[[package]]
name = "mypy"
-version = "1.11.0"
+version = "1.11.1"
description = "Optional static typing for Python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "mypy-1.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3824187c99b893f90c845bab405a585d1ced4ff55421fdf5c84cb7710995229"},
- {file = "mypy-1.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:96f8dbc2c85046c81bcddc246232d500ad729cb720da4e20fce3b542cab91287"},
- {file = "mypy-1.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a5d8d8dd8613a3e2be3eae829ee891b6b2de6302f24766ff06cb2875f5be9c6"},
- {file = "mypy-1.11.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72596a79bbfb195fd41405cffa18210af3811beb91ff946dbcb7368240eed6be"},
- {file = "mypy-1.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:35ce88b8ed3a759634cb4eb646d002c4cef0a38f20565ee82b5023558eb90c00"},
- {file = "mypy-1.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:98790025861cb2c3db8c2f5ad10fc8c336ed2a55f4daf1b8b3f877826b6ff2eb"},
- {file = "mypy-1.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25bcfa75b9b5a5f8d67147a54ea97ed63a653995a82798221cca2a315c0238c1"},
- {file = "mypy-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bea2a0e71c2a375c9fa0ede3d98324214d67b3cbbfcbd55ac8f750f85a414e3"},
- {file = "mypy-1.11.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2b3d36baac48e40e3064d2901f2fbd2a2d6880ec6ce6358825c85031d7c0d4d"},
- {file = "mypy-1.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:d8e2e43977f0e09f149ea69fd0556623919f816764e26d74da0c8a7b48f3e18a"},
- {file = "mypy-1.11.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1d44c1e44a8be986b54b09f15f2c1a66368eb43861b4e82573026e04c48a9e20"},
- {file = "mypy-1.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cea3d0fb69637944dd321f41bc896e11d0fb0b0aa531d887a6da70f6e7473aba"},
- {file = "mypy-1.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a83ec98ae12d51c252be61521aa5731f5512231d0b738b4cb2498344f0b840cd"},
- {file = "mypy-1.11.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c7b73a856522417beb78e0fb6d33ef89474e7a622db2653bc1285af36e2e3e3d"},
- {file = "mypy-1.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:f2268d9fcd9686b61ab64f077be7ffbc6fbcdfb4103e5dd0cc5eaab53a8886c2"},
- {file = "mypy-1.11.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:940bfff7283c267ae6522ef926a7887305945f716a7704d3344d6d07f02df850"},
- {file = "mypy-1.11.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:14f9294528b5f5cf96c721f231c9f5b2733164e02c1c018ed1a0eff8a18005ac"},
- {file = "mypy-1.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7b54c27783991399046837df5c7c9d325d921394757d09dbcbf96aee4649fe9"},
- {file = "mypy-1.11.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65f190a6349dec29c8d1a1cd4aa71284177aee5949e0502e6379b42873eddbe7"},
- {file = "mypy-1.11.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbe286303241fea8c2ea5466f6e0e6a046a135a7e7609167b07fd4e7baf151bf"},
- {file = "mypy-1.11.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:104e9c1620c2675420abd1f6c44bab7dd33cc85aea751c985006e83dcd001095"},
- {file = "mypy-1.11.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f006e955718ecd8d159cee9932b64fba8f86ee6f7728ca3ac66c3a54b0062abe"},
- {file = "mypy-1.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:becc9111ca572b04e7e77131bc708480cc88a911adf3d0239f974c034b78085c"},
- {file = "mypy-1.11.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6801319fe76c3f3a3833f2b5af7bd2c17bb93c00026a2a1b924e6762f5b19e13"},
- {file = "mypy-1.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:c1a184c64521dc549324ec6ef7cbaa6b351912be9cb5edb803c2808a0d7e85ac"},
- {file = "mypy-1.11.0-py3-none-any.whl", hash = "sha256:56913ec8c7638b0091ef4da6fcc9136896914a9d60d54670a75880c3e5b99ace"},
- {file = "mypy-1.11.0.tar.gz", hash = "sha256:93743608c7348772fdc717af4aeee1997293a1ad04bc0ea6efa15bf65385c538"},
+ {file = "mypy-1.11.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a32fc80b63de4b5b3e65f4be82b4cfa362a46702672aa6a0f443b4689af7008c"},
+ {file = "mypy-1.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c1952f5ea8a5a959b05ed5f16452fddadbaae48b5d39235ab4c3fc444d5fd411"},
+ {file = "mypy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1e30dc3bfa4e157e53c1d17a0dad20f89dc433393e7702b813c10e200843b03"},
+ {file = "mypy-1.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2c63350af88f43a66d3dfeeeb8d77af34a4f07d760b9eb3a8697f0386c7590b4"},
+ {file = "mypy-1.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:a831671bad47186603872a3abc19634f3011d7f83b083762c942442d51c58d58"},
+ {file = "mypy-1.11.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7b6343d338390bb946d449677726edf60102a1c96079b4f002dedff375953fc5"},
+ {file = "mypy-1.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4fe9f4e5e521b458d8feb52547f4bade7ef8c93238dfb5bbc790d9ff2d770ca"},
+ {file = "mypy-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:886c9dbecc87b9516eff294541bf7f3655722bf22bb898ee06985cd7269898de"},
+ {file = "mypy-1.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fca4a60e1dd9fd0193ae0067eaeeb962f2d79e0d9f0f66223a0682f26ffcc809"},
+ {file = "mypy-1.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0bd53faf56de9643336aeea1c925012837432b5faf1701ccca7fde70166ccf72"},
+ {file = "mypy-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f39918a50f74dc5969807dcfaecafa804fa7f90c9d60506835036cc1bc891dc8"},
+ {file = "mypy-1.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bc71d1fb27a428139dd78621953effe0d208aed9857cb08d002280b0422003a"},
+ {file = "mypy-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b868d3bcff720dd7217c383474008ddabaf048fad8d78ed948bb4b624870a417"},
+ {file = "mypy-1.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a707ec1527ffcdd1c784d0924bf5cb15cd7f22683b919668a04d2b9c34549d2e"},
+ {file = "mypy-1.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:64f4a90e3ea07f590c5bcf9029035cf0efeae5ba8be511a8caada1a4893f5525"},
+ {file = "mypy-1.11.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:749fd3213916f1751fff995fccf20c6195cae941dc968f3aaadf9bb4e430e5a2"},
+ {file = "mypy-1.11.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b639dce63a0b19085213ec5fdd8cffd1d81988f47a2dec7100e93564f3e8fb3b"},
+ {file = "mypy-1.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c956b49c5d865394d62941b109728c5c596a415e9c5b2be663dd26a1ff07bc0"},
+ {file = "mypy-1.11.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45df906e8b6804ef4b666af29a87ad9f5921aad091c79cc38e12198e220beabd"},
+ {file = "mypy-1.11.1-cp38-cp38-win_amd64.whl", hash = "sha256:d44be7551689d9d47b7abc27c71257adfdb53f03880841a5db15ddb22dc63edb"},
+ {file = "mypy-1.11.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2684d3f693073ab89d76da8e3921883019ea8a3ec20fa5d8ecca6a2db4c54bbe"},
+ {file = "mypy-1.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:79c07eb282cb457473add5052b63925e5cc97dfab9812ee65a7c7ab5e3cb551c"},
+ {file = "mypy-1.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11965c2f571ded6239977b14deebd3f4c3abd9a92398712d6da3a772974fad69"},
+ {file = "mypy-1.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a2b43895a0f8154df6519706d9bca8280cda52d3d9d1514b2d9c3e26792a0b74"},
+ {file = "mypy-1.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:1a81cf05975fd61aec5ae16501a091cfb9f605dc3e3c878c0da32f250b74760b"},
+ {file = "mypy-1.11.1-py3-none-any.whl", hash = "sha256:0624bdb940255d2dd24e829d99a13cfeb72e4e9031f9492148f410ed30bcab54"},
+ {file = "mypy-1.11.1.tar.gz", hash = "sha256:f404a0b069709f18bbdb702eb3dcfe51910602995de00bd39cea3050b5772d08"},
]
[package.dependencies]
@@ -5058,23 +5126,24 @@ sympy = "*"
[[package]]
name = "openai"
-version = "1.37.1"
+version = "1.40.2"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.7.1"
files = [
- {file = "openai-1.37.1-py3-none-any.whl", hash = "sha256:9a6adda0d6ae8fce02d235c5671c399cfa40d6a281b3628914c7ebf244888ee3"},
- {file = "openai-1.37.1.tar.gz", hash = "sha256:faf87206785a6b5d9e34555d6a3242482a6852bc802e453e2a891f68ee04ce55"},
+ {file = "openai-1.40.2-py3-none-any.whl", hash = "sha256:38068f858f310b4fd4b0ea8734c3efcfde3c15a2978311e1453bd84817231b96"},
+ {file = "openai-1.40.2.tar.gz", hash = "sha256:2180e9070bd36084328248b3ce668964e8ddd2e9019e1d426e31dc54cc117bb5"},
]
[package.dependencies]
anyio = ">=3.5.0,<5"
distro = ">=1.7.0,<2"
httpx = ">=0.23.0,<1"
+jiter = ">=0.4.0,<1"
pydantic = ">=1.9.0,<3"
sniffio = "*"
tqdm = ">4"
-typing-extensions = ">=4.7,<5"
+typing-extensions = ">=4.11,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
@@ -5551,13 +5620,13 @@ xmp = ["defusedxml"]
[[package]]
name = "pip"
-version = "24.1.1"
+version = "24.2"
description = "The PyPA recommended tool for installing Python packages."
optional = false
python-versions = ">=3.8"
files = [
- {file = "pip-24.1.1-py3-none-any.whl", hash = "sha256:efca15145a95e95c00608afeab66311d40bfb73bb2266a855befd705e6bb15a0"},
- {file = "pip-24.1.1.tar.gz", hash = "sha256:5aa64f65e1952733ee0a9a9b1f52496ebdb3f3077cc46f80a16d983b58d1180a"},
+ {file = "pip-24.2-py3-none-any.whl", hash = "sha256:2cd581cf58ab7fcfca4ce8efa6dcacd0de5bf8d0a3eb9ec927e07405f4d9e2a2"},
+ {file = "pip-24.2.tar.gz", hash = "sha256:5b5e490b5e9cb275c879595064adce9ebd31b854e3e803740b72f9ccf34a45b8"},
]
[[package]]
@@ -5655,13 +5724,13 @@ test = ["coverage", "flake8", "freezegun (==0.3.15)", "mock (>=2.0.0)", "pylint"
[[package]]
name = "pre-commit"
-version = "3.7.1"
+version = "3.8.0"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
files = [
- {file = "pre_commit-3.7.1-py2.py3-none-any.whl", hash = "sha256:fae36fd1d7ad7d6a5a1c0b0d5adb2ed1a3bda5a21bf6c3e5372073d7a11cd4c5"},
- {file = "pre_commit-3.7.1.tar.gz", hash = "sha256:8ca3ad567bc78a4972a3f1a477e94a79d4597e8140a6e0b651c5e33899c3654a"},
+ {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"},
+ {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"},
]
[package.dependencies]
@@ -6124,13 +6193,13 @@ windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pyjwt"
-version = "2.8.0"
+version = "2.9.0"
description = "JSON Web Token implementation in Python"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"},
- {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"},
+ {file = "PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850"},
+ {file = "pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c"},
]
[package.dependencies]
@@ -6138,8 +6207,8 @@ cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryp
[package.extras]
crypto = ["cryptography (>=3.4.0)"]
-dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
-docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
+dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"]
+docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
[[package]]
@@ -6234,6 +6303,18 @@ files = [
{file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
]
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"},
+ {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"},
+ {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
+]
+
[[package]]
name = "pytest"
version = "8.3.2"
@@ -6375,18 +6456,19 @@ dev = ["atomicwrites (==1.4.1)", "attrs (==23.2.0)", "coverage (==7.4.1)", "hatc
[[package]]
name = "python-pptx"
-version = "0.6.23"
-description = "Generate and manipulate Open XML PowerPoint (.pptx) files"
+version = "1.0.2"
+description = "Create, read, and update PowerPoint 2007+ (.pptx) files."
optional = false
-python-versions = "*"
+python-versions = ">=3.8"
files = [
- {file = "python-pptx-0.6.23.tar.gz", hash = "sha256:587497ff28e779ab18dbb074f6d4052893c85dedc95ed75df319364f331fedee"},
- {file = "python_pptx-0.6.23-py3-none-any.whl", hash = "sha256:dd0527194627a2b7cc05f3ba23ecaa2d9a0d5ac9b6193a28ed1b7a716f4217d4"},
+ {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"},
+ {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"},
]
[package.dependencies]
lxml = ">=3.1.0"
Pillow = ">=3.3.2"
+typing-extensions = ">=4.9.0"
XlsxWriter = ">=0.5.7"
[[package]]
@@ -6736,6 +6818,7 @@ files = [
certifi = ">=2017.4.17"
charset-normalizer = ">=2,<4"
idna = ">=2.5,<4"
+PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
urllib3 = ">=1.21.1,<3"
[package.extras]
@@ -6942,29 +7025,29 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
-version = "0.5.5"
+version = "0.5.7"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
- {file = "ruff-0.5.5-py3-none-linux_armv6l.whl", hash = "sha256:605d589ec35d1da9213a9d4d7e7a9c761d90bba78fc8790d1c5e65026c1b9eaf"},
- {file = "ruff-0.5.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00817603822a3e42b80f7c3298c8269e09f889ee94640cd1fc7f9329788d7bf8"},
- {file = "ruff-0.5.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:187a60f555e9f865a2ff2c6984b9afeffa7158ba6e1eab56cb830404c942b0f3"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe26fc46fa8c6e0ae3f47ddccfbb136253c831c3289bba044befe68f467bfb16"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4ad25dd9c5faac95c8e9efb13e15803cd8bbf7f4600645a60ffe17c73f60779b"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f70737c157d7edf749bcb952d13854e8f745cec695a01bdc6e29c29c288fc36e"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:cfd7de17cef6ab559e9f5ab859f0d3296393bc78f69030967ca4d87a541b97a0"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a09b43e02f76ac0145f86a08e045e2ea452066f7ba064fd6b0cdccb486f7c3e7"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0b856cb19c60cd40198be5d8d4b556228e3dcd545b4f423d1ad812bfdca5884"},
- {file = "ruff-0.5.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3687d002f911e8a5faf977e619a034d159a8373514a587249cc00f211c67a091"},
- {file = "ruff-0.5.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ac9dc814e510436e30d0ba535f435a7f3dc97f895f844f5b3f347ec8c228a523"},
- {file = "ruff-0.5.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:af9bdf6c389b5add40d89b201425b531e0a5cceb3cfdcc69f04d3d531c6be74f"},
- {file = "ruff-0.5.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d40a8533ed545390ef8315b8e25c4bb85739b90bd0f3fe1280a29ae364cc55d8"},
- {file = "ruff-0.5.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:cab904683bf9e2ecbbe9ff235bfe056f0eba754d0168ad5407832928d579e7ab"},
- {file = "ruff-0.5.5-py3-none-win32.whl", hash = "sha256:696f18463b47a94575db635ebb4c178188645636f05e934fdf361b74edf1bb2d"},
- {file = "ruff-0.5.5-py3-none-win_amd64.whl", hash = "sha256:50f36d77f52d4c9c2f1361ccbfbd09099a1b2ea5d2b2222c586ab08885cf3445"},
- {file = "ruff-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3191317d967af701f1b73a31ed5788795936e423b7acce82a2b63e26eb3e89d6"},
- {file = "ruff-0.5.5.tar.gz", hash = "sha256:cc5516bdb4858d972fbc31d246bdb390eab8df1a26e2353be2dbc0c2d7f5421a"},
+ {file = "ruff-0.5.7-py3-none-linux_armv6l.whl", hash = "sha256:548992d342fc404ee2e15a242cdbea4f8e39a52f2e7752d0e4cbe88d2d2f416a"},
+ {file = "ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be"},
+ {file = "ruff-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a01c34400097b06cf8a6e61b35d6d456d5bd1ae6961542de18ec81eaf33b4cb8"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcc8054f1a717e2213500edaddcf1dbb0abad40d98e1bd9d0ad364f75c763eea"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f70284e73f36558ef51602254451e50dd6cc479f8b6f8413a95fcb5db4a55fc"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:a78ad870ae3c460394fc95437d43deb5c04b5c29297815a2a1de028903f19692"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ccd078c66a8e419475174bfe60a69adb36ce04f8d4e91b006f1329d5cd44bcf"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e31c9bad4ebf8fdb77b59cae75814440731060a09a0e0077d559a556453acbb"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a09ea2c3f7778cc635e7f6edf57d566a8ee8f485f3c4454db7771efb692c499"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a36d8dcf55b3a3bc353270d544fb170d75d2dff41eba5df57b4e0b67a95bb64e"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9369c218f789eefbd1b8d82a8cf25017b523ac47d96b2f531eba73770971c9e5"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b88ca3db7eb377eb24fb7c82840546fb7acef75af4a74bd36e9ceb37a890257e"},
+ {file = "ruff-0.5.7-py3-none-win32.whl", hash = "sha256:33d61fc0e902198a3e55719f4be6b375b28f860b09c281e4bdbf783c0566576a"},
+ {file = "ruff-0.5.7-py3-none-win_amd64.whl", hash = "sha256:083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3"},
+ {file = "ruff-0.5.7-py3-none-win_arm64.whl", hash = "sha256:2dca26154ff9571995107221d0aeaad0e75a77b5a682d6236cf89a58c70b76f4"},
+ {file = "ruff-0.5.7.tar.gz", hash = "sha256:8dfc0a458797f5d9fb622dd0efc52d796f23f0a1493a9527f4e49a550ae9a7e5"},
]
[[package]]
@@ -7106,6 +7189,21 @@ tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
torch = ["safetensors[numpy]", "torch (>=1.10)"]
+[[package]]
+name = "scantree"
+version = "0.0.4"
+description = "Flexible recursive directory iterator: scandir meets glob(\"**\", recursive=True)"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "scantree-0.0.4-py3-none-any.whl", hash = "sha256:7616ab65aa6b7f16fcf8e6fa1d9afaa99a27ab72bba05c61b691853b96763174"},
+ {file = "scantree-0.0.4.tar.gz", hash = "sha256:15bd5cb24483b04db2c70653604e8ea3522e98087db7e38ab8482f053984c0ac"},
+]
+
+[package.dependencies]
+attrs = ">=18.0.0"
+pathspec = ">=0.10.1"
+
[[package]]
name = "scikit-learn"
version = "1.5.0"
@@ -7507,13 +7605,13 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7
[[package]]
name = "streamlit"
-version = "1.37.0"
+version = "1.37.1"
description = "A faster way to build and share data apps"
optional = false
python-versions = "!=3.9.7,>=3.8"
files = [
- {file = "streamlit-1.37.0-py2.py3-none-any.whl", hash = "sha256:d17e2d32b075a270a97f134ab5d22bbb98b4e474fa261ff49dc4a2b380386c84"},
- {file = "streamlit-1.37.0.tar.gz", hash = "sha256:463ef728ba21e74e05122e3704e8af644a7bdbb5822e281b8daf4a0a48761879"},
+ {file = "streamlit-1.37.1-py2.py3-none-any.whl", hash = "sha256:0651240fccc569900cc9450390b0a67473fda55be65f317e46285f99e2bddf04"},
+ {file = "streamlit-1.37.1.tar.gz", hash = "sha256:bc7e3813d94a39dda56f15678437eb37830973c601e8e574f2225a7bf188ea5a"},
]
[package.dependencies]
@@ -7553,7 +7651,7 @@ files = [
[[package]]
name = "swebench"
-version = "2.0.2"
+version = "2.0.12"
description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false
python-versions = ">=3.8"
@@ -7572,26 +7670,33 @@ python-dotenv = "*"
requests = "*"
rich = "*"
tqdm = "*"
+unidiff = "*"
+
+[package.extras]
+inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"]
[package.source]
type = "git"
url = "https://github.com/OpenDevin/SWE-bench.git"
reference = "HEAD"
-resolved_reference = "4498af933679c518df6d1f8485864f72afc21500"
+resolved_reference = "c2b3cefd4a5af0b248966a773650a39046072975"
[[package]]
name = "sympy"
-version = "1.12.1"
+version = "1.13.1"
description = "Computer algebra system (CAS) in Python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"},
- {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"},
+ {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"},
+ {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"},
]
[package.dependencies]
-mpmath = ">=1.1.0,<1.4.0"
+mpmath = ">=1.1.0,<1.4"
+
+[package.extras]
+dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
[[package]]
name = "tenacity"
@@ -7881,36 +7986,36 @@ files = [
[[package]]
name = "torch"
-version = "2.2.0"
+version = "2.2.2"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
optional = false
python-versions = ">=3.8.0"
files = [
- {file = "torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d366158d6503a3447e67f8c0ad1328d54e6c181d88572d688a625fac61b13a97"},
- {file = "torch-2.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:707f2f80402981e9f90d0038d7d481678586251e6642a7a6ef67fc93511cb446"},
- {file = "torch-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:15c8f0a105c66b28496092fca1520346082e734095f8eaf47b5786bac24b8a31"},
- {file = "torch-2.2.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:0ca4df4b728515ad009b79f5107b00bcb2c63dc202d991412b9eb3b6a4f24349"},
- {file = "torch-2.2.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:3d3eea2d5969b9a1c9401429ca79efc668120314d443d3463edc3289d7f003c7"},
- {file = "torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0d1c580e379c0d48f0f0a08ea28d8e373295aa254de4f9ad0631f9ed8bc04c24"},
- {file = "torch-2.2.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9328e3c1ce628a281d2707526b4d1080eae7c4afab4f81cea75bde1f9441dc78"},
- {file = "torch-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:03c8e660907ac1b8ee07f6d929c4e15cd95be2fb764368799cca02c725a212b8"},
- {file = "torch-2.2.0-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:da0cefe7f84ece3e3b56c11c773b59d1cb2c0fd83ddf6b5f7f1fd1a987b15c3e"},
- {file = "torch-2.2.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f81d23227034221a4a4ff8ef24cc6cec7901edd98d9e64e32822778ff01be85e"},
- {file = "torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:dcbfb2192ac41ca93c756ebe9e2af29df0a4c14ee0e7a0dd78f82c67a63d91d4"},
- {file = "torch-2.2.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:9eeb42971619e24392c9088b5b6d387d896e267889d41d267b1fec334f5227c5"},
- {file = "torch-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:c718b2ca69a6cac28baa36d86d8c0ec708b102cebd1ceb1b6488e404cd9be1d1"},
- {file = "torch-2.2.0-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f11d18fceb4f9ecb1ac680dde7c463c120ed29056225d75469c19637e9f98d12"},
- {file = "torch-2.2.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:ee1da852bfd4a7e674135a446d6074c2da7194c1b08549e31eae0b3138c6b4d2"},
- {file = "torch-2.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0d819399819d0862268ac531cf12a501c253007df4f9e6709ede8a0148f1a7b8"},
- {file = "torch-2.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:08f53ccc38c49d839bc703ea1b20769cc8a429e0c4b20b56921a9f64949bf325"},
- {file = "torch-2.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:93bffe3779965a71dab25fc29787538c37c5d54298fd2f2369e372b6fb137d41"},
- {file = "torch-2.2.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c17ec323da778efe8dad49d8fb534381479ca37af1bfc58efdbb8607a9d263a3"},
- {file = "torch-2.2.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c02685118008834e878f676f81eab3a952b7936fa31f474ef8a5ff4b5c78b36d"},
- {file = "torch-2.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:d9f39d6f53cec240a0e3baa82cb697593340f9d4554cee6d3d6ca07925c2fac0"},
- {file = "torch-2.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:51770c065206250dc1222ea7c0eff3f88ab317d3e931cca2aee461b85fbc2472"},
- {file = "torch-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:008e4c6ad703de55af760c73bf937ecdd61a109f9b08f2bbb9c17e7c7017f194"},
- {file = "torch-2.2.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:de8680472dd14e316f42ceef2a18a301461a9058cd6e99a1f1b20f78f11412f1"},
- {file = "torch-2.2.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:99e1dcecb488e3fd25bcaac56e48cdb3539842904bdc8588b0b255fde03a254c"},
+ {file = "torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:bc889d311a855dd2dfd164daf8cc903a6b7273a747189cebafdd89106e4ad585"},
+ {file = "torch-2.2.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:15dffa4cc3261fa73d02f0ed25f5fa49ecc9e12bf1ae0a4c1e7a88bbfaad9030"},
+ {file = "torch-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:11e8fe261233aeabd67696d6b993eeb0896faa175c6b41b9a6c9f0334bdad1c5"},
+ {file = "torch-2.2.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:b2e2200b245bd9f263a0d41b6a2dab69c4aca635a01b30cca78064b0ef5b109e"},
+ {file = "torch-2.2.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:877b3e6593b5e00b35bbe111b7057464e76a7dd186a287280d941b564b0563c2"},
+ {file = "torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:ad4c03b786e074f46606f4151c0a1e3740268bcf29fbd2fdf6666d66341c1dcb"},
+ {file = "torch-2.2.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:32827fa1fbe5da8851686256b4cd94cc7b11be962862c2293811c94eea9457bf"},
+ {file = "torch-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:f9ef0a648310435511e76905f9b89612e45ef2c8b023bee294f5e6f7e73a3e7c"},
+ {file = "torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:95b9b44f3bcebd8b6cd8d37ec802048c872d9c567ba52c894bba90863a439059"},
+ {file = "torch-2.2.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:49aa4126ede714c5aeef7ae92969b4b0bbe67f19665106463c39f22e0a1860d1"},
+ {file = "torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:cf12cdb66c9c940227ad647bc9cf5dba7e8640772ae10dfe7569a0c1e2a28aca"},
+ {file = "torch-2.2.2-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:89ddac2a8c1fb6569b90890955de0c34e1724f87431cacff4c1979b5f769203c"},
+ {file = "torch-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:451331406b760f4b1ab298ddd536486ab3cfb1312614cfe0532133535be60bea"},
+ {file = "torch-2.2.2-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:eb4d6e9d3663e26cd27dc3ad266b34445a16b54908e74725adb241aa56987533"},
+ {file = "torch-2.2.2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:bf9558da7d2bf7463390b3b2a61a6a3dbb0b45b161ee1dd5ec640bf579d479fc"},
+ {file = "torch-2.2.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd2bf7697c9e95fb5d97cc1d525486d8cf11a084c6af1345c2c2c22a6b0029d0"},
+ {file = "torch-2.2.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b421448d194496e1114d87a8b8d6506bce949544e513742b097e2ab8f7efef32"},
+ {file = "torch-2.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:3dbcd563a9b792161640c0cffe17e3270d85e8f4243b1f1ed19cca43d28d235b"},
+ {file = "torch-2.2.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:31f4310210e7dda49f1fb52b0ec9e59382cfcb938693f6d5378f25b43d7c1d29"},
+ {file = "torch-2.2.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c795feb7e8ce2e0ef63f75f8e1ab52e7fd5e1a4d7d0c31367ade1e3de35c9e95"},
+ {file = "torch-2.2.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:a6e5770d68158d07456bfcb5318b173886f579fdfbf747543901ce718ea94782"},
+ {file = "torch-2.2.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:67dcd726edff108e2cd6c51ff0e416fd260c869904de95750e80051358680d24"},
+ {file = "torch-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:539d5ef6c4ce15bd3bd47a7b4a6e7c10d49d4d21c0baaa87c7d2ef8698632dfb"},
+ {file = "torch-2.2.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:dff696de90d6f6d1e8200e9892861fd4677306d0ef604cb18f2134186f719f82"},
+ {file = "torch-2.2.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:3a4dd910663fd7a124c056c878a52c2b0be4a5a424188058fe97109d4436ee42"},
]
[package.dependencies]
@@ -7930,7 +8035,7 @@ nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"
nvidia-nccl-cu12 = {version = "2.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
sympy = "*"
-triton = {version = "2.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+triton = {version = "2.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.12\""}
typing-extensions = ">=4.8.0"
[package.extras]
@@ -8286,6 +8391,17 @@ files = [
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
]
+[[package]]
+name = "unidiff"
+version = "0.7.5"
+description = "Unified diff parsing/metadata extraction library."
+optional = false
+python-versions = "*"
+files = [
+ {file = "unidiff-0.7.5-py2.py3-none-any.whl", hash = "sha256:c93bf2265cc1ba2a520e415ab05da587370bc2a3ae9e0414329f54f0c2fc09e8"},
+ {file = "unidiff-0.7.5.tar.gz", hash = "sha256:2e5f0162052248946b9f0970a40e9e124236bf86c82b70821143a6fc1dea2574"},
+]
+
[[package]]
name = "uri-template"
version = "1.3.0"
@@ -8330,13 +8446,13 @@ zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "uvicorn"
-version = "0.30.3"
+version = "0.30.5"
description = "The lightning-fast ASGI server."
optional = false
python-versions = ">=3.8"
files = [
- {file = "uvicorn-0.30.3-py3-none-any.whl", hash = "sha256:94a3608da0e530cea8f69683aa4126364ac18e3826b6630d1a65f4638aade503"},
- {file = "uvicorn-0.30.3.tar.gz", hash = "sha256:0d114d0831ff1adbf231d358cbf42f17333413042552a624ea6a9b4c33dcfd81"},
+ {file = "uvicorn-0.30.5-py3-none-any.whl", hash = "sha256:b2d86de274726e9878188fa07576c9ceeff90a839e2b6e25c917fe05f5a6c835"},
+ {file = "uvicorn-0.30.5.tar.gz", hash = "sha256:ac6fdbd4425c5fd17a9fe39daf4d4d075da6fdc80f653e5894cdc2fd98752bee"},
]
[package.dependencies]
@@ -9055,47 +9171,45 @@ test = ["zope.testrunner"]
[[package]]
name = "zope-interface"
-version = "6.4.post2"
+version = "7.0.1"
description = "Interfaces for Python"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "zope.interface-6.4.post2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2eccd5bef45883802848f821d940367c1d0ad588de71e5cabe3813175444202c"},
- {file = "zope.interface-6.4.post2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:762e616199f6319bb98e7f4f27d254c84c5fb1c25c908c2a9d0f92b92fb27530"},
- {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ef8356f16b1a83609f7a992a6e33d792bb5eff2370712c9eaae0d02e1924341"},
- {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e4fa5d34d7973e6b0efa46fe4405090f3b406f64b6290facbb19dcbf642ad6b"},
- {file = "zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d22fce0b0f5715cdac082e35a9e735a1752dc8585f005d045abb1a7c20e197f9"},
- {file = "zope.interface-6.4.post2-cp310-cp310-win_amd64.whl", hash = "sha256:97e615eab34bd8477c3f34197a17ce08c648d38467489359cb9eb7394f1083f7"},
- {file = "zope.interface-6.4.post2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:599f3b07bde2627e163ce484d5497a54a0a8437779362395c6b25e68c6590ede"},
- {file = "zope.interface-6.4.post2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:136cacdde1a2c5e5bc3d0b2a1beed733f97e2dad8c2ad3c2e17116f6590a3827"},
- {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47937cf2e7ed4e0e37f7851c76edeb8543ec9b0eae149b36ecd26176ff1ca874"},
- {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f0a6be264afb094975b5ef55c911379d6989caa87c4e558814ec4f5125cfa2e"},
- {file = "zope.interface-6.4.post2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47654177e675bafdf4e4738ce58cdc5c6d6ee2157ac0a78a3fa460942b9d64a8"},
- {file = "zope.interface-6.4.post2-cp311-cp311-win_amd64.whl", hash = "sha256:e2fb8e8158306567a3a9a41670c1ff99d0567d7fc96fa93b7abf8b519a46b250"},
- {file = "zope.interface-6.4.post2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b912750b13d76af8aac45ddf4679535def304b2a48a07989ec736508d0bbfbde"},
- {file = "zope.interface-6.4.post2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ac46298e0143d91e4644a27a769d1388d5d89e82ee0cf37bf2b0b001b9712a4"},
- {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86a94af4a88110ed4bb8961f5ac72edf782958e665d5bfceaab6bf388420a78b"},
- {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:73f9752cf3596771c7726f7eea5b9e634ad47c6d863043589a1c3bb31325c7eb"},
- {file = "zope.interface-6.4.post2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b5c3e9744dcdc9e84c24ed6646d5cf0cf66551347b310b3ffd70f056535854"},
- {file = "zope.interface-6.4.post2-cp312-cp312-win_amd64.whl", hash = "sha256:551db2fe892fcbefb38f6f81ffa62de11090c8119fd4e66a60f3adff70751ec7"},
- {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96ac6b3169940a8cd57b4f2b8edcad8f5213b60efcd197d59fbe52f0accd66e"},
- {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cebff2fe5dc82cb22122e4e1225e00a4a506b1a16fafa911142ee124febf2c9e"},
- {file = "zope.interface-6.4.post2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33ee982237cffaf946db365c3a6ebaa37855d8e3ca5800f6f48890209c1cfefc"},
- {file = "zope.interface-6.4.post2-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:fbf649bc77510ef2521cf797700b96167bb77838c40780da7ea3edd8b78044d1"},
- {file = "zope.interface-6.4.post2-cp37-cp37m-win_amd64.whl", hash = "sha256:4c0b208a5d6c81434bdfa0f06d9b667e5de15af84d8cae5723c3a33ba6611b82"},
- {file = "zope.interface-6.4.post2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d3fe667935e9562407c2511570dca14604a654988a13d8725667e95161d92e9b"},
- {file = "zope.interface-6.4.post2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a96e6d4074db29b152222c34d7eec2e2db2f92638d2b2b2c704f9e8db3ae0edc"},
- {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:866a0f583be79f0def667a5d2c60b7b4cc68f0c0a470f227e1122691b443c934"},
- {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fe919027f29b12f7a2562ba0daf3e045cb388f844e022552a5674fcdf5d21f1"},
- {file = "zope.interface-6.4.post2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e0343a6e06d94f6b6ac52fbc75269b41dd3c57066541a6c76517f69fe67cb43"},
- {file = "zope.interface-6.4.post2-cp38-cp38-win_amd64.whl", hash = "sha256:dabb70a6e3d9c22df50e08dc55b14ca2a99da95a2d941954255ac76fd6982bc5"},
- {file = "zope.interface-6.4.post2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:706efc19f9679a1b425d6fa2b4bc770d976d0984335eaea0869bd32f627591d2"},
- {file = "zope.interface-6.4.post2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d136e5b8821073e1a09dde3eb076ea9988e7010c54ffe4d39701adf0c303438"},
- {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1730c93a38b5a18d24549bc81613223962a19d457cfda9bdc66e542f475a36f4"},
- {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bc2676312cc3468a25aac001ec727168994ea3b69b48914944a44c6a0b251e79"},
- {file = "zope.interface-6.4.post2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a62fd6cd518693568e23e02f41816adedfca637f26716837681c90b36af3671"},
- {file = "zope.interface-6.4.post2-cp39-cp39-win_amd64.whl", hash = "sha256:d3f7e001328bd6466b3414215f66dde3c7c13d8025a9c160a75d7b2687090d15"},
- {file = "zope.interface-6.4.post2.tar.gz", hash = "sha256:1c207e6f6dfd5749a26f5a5fd966602d6b824ec00d2df84a7e9a924e8933654e"},
+ {file = "zope.interface-7.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ec4e87e6fdc511a535254daa122c20e11959ce043b4e3425494b237692a34f1c"},
+ {file = "zope.interface-7.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:51d5713e8e38f2d3ec26e0dfdca398ed0c20abda2eb49ffc15a15a23eb8e5f6d"},
+ {file = "zope.interface-7.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea8d51e5eb29e57d34744369cd08267637aa5a0fefc9b5d33775ab7ff2ebf2e3"},
+ {file = "zope.interface-7.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55bbcc74dc0c7ab489c315c28b61d7a1d03cf938cc99cc58092eb065f120c3a5"},
+ {file = "zope.interface-7.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10ebac566dd0cec66f942dc759d46a994a2b3ba7179420f0e2130f88f8a5f400"},
+ {file = "zope.interface-7.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7039e624bcb820f77cc2ff3d1adcce531932990eee16121077eb51d9c76b6c14"},
+ {file = "zope.interface-7.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03bd5c0db82237bbc47833a8b25f1cc090646e212f86b601903d79d7e6b37031"},
+ {file = "zope.interface-7.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f52050c6a10d4a039ec6f2c58e5b3ade5cc570d16cf9d102711e6b8413c90e6"},
+ {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af0b33f04677b57843d529b9257a475d2865403300b48c67654c40abac2f9f24"},
+ {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:696c2a381fc7876b3056711717dba5eddd07c2c9e5ccd50da54029a1293b6e43"},
+ {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f89a420cf5a6f2aa7849dd59e1ff0e477f562d97cf8d6a1ee03461e1eec39887"},
+ {file = "zope.interface-7.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:b59deb0ddc7b431e41d720c00f99d68b52cb9bd1d5605a085dc18f502fe9c47f"},
+ {file = "zope.interface-7.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:52f5253cca1b35eaeefa51abd366b87f48f8714097c99b131ba61f3fdbbb58e7"},
+ {file = "zope.interface-7.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:88d108d004e0df25224de77ce349a7e73494ea2cb194031f7c9687e68a88ec9b"},
+ {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c203d82069ba31e1f3bc7ba530b2461ec86366cd4bfc9b95ec6ce58b1b559c34"},
+ {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f3495462bc0438b76536a0e10d765b168ae636092082531b88340dc40dcd118"},
+ {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:192b7a792e3145ed880ff6b1a206fdb783697cfdb4915083bfca7065ec845e60"},
+ {file = "zope.interface-7.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:400d06c9ec8dbcc96f56e79376297e7be07a315605c9a2208720da263d44d76f"},
+ {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c1dff87b30fd150c61367d0e2cdc49bb55f8b9fd2a303560bbc24b951573ae1"},
+ {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f749ca804648d00eda62fe1098f229b082dfca930d8bad8386e572a6eafa7525"},
+ {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ec212037becf6d2f705b7ed4538d56980b1e7bba237df0d8995cbbed29961dc"},
+ {file = "zope.interface-7.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d33cb526efdc235a2531433fc1287fcb80d807d5b401f9b801b78bf22df560dd"},
+ {file = "zope.interface-7.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b419f2144e1762ab845f20316f1df36b15431f2622ebae8a6d5f7e8e712b413c"},
+ {file = "zope.interface-7.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03f1452d5d1f279184d5bdb663a3dc39902d9320eceb63276240791e849054b6"},
+ {file = "zope.interface-7.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ba4b3638d014918b918aa90a9c8370bd74a03abf8fcf9deb353b3a461a59a84"},
+ {file = "zope.interface-7.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc0615351221926a36a0fbcb2520fb52e0b23e8c22a43754d9cb8f21358c33c0"},
+ {file = "zope.interface-7.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:ce6cbb852fb8f2f9bb7b9cdca44e2e37bce783b5f4c167ff82cb5f5128163c8f"},
+ {file = "zope.interface-7.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5566fd9271c89ad03d81b0831c37d46ae5e2ed211122c998637130159a120cf1"},
+ {file = "zope.interface-7.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da0cef4d7e3f19c3bd1d71658d6900321af0492fee36ec01b550a10924cffb9c"},
+ {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f32ca483e6ade23c7caaee9d5ee5d550cf4146e9b68d2fb6c68bac183aa41c37"},
+ {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da21e7eec49252df34d426c2ee9cf0361c923026d37c24728b0fa4cc0599fd03"},
+ {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a8195b99e650e6f329ce4e5eb22d448bdfef0406404080812bc96e2a05674cb"},
+ {file = "zope.interface-7.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:19c829d52e921b9fe0b2c0c6a8f9a2508c49678ee1be598f87d143335b6a35dc"},
+ {file = "zope.interface-7.0.1.tar.gz", hash = "sha256:f0f5fda7cbf890371a59ab1d06512da4f2c89a6ea194e595808123c863c38eff"},
]
[package.dependencies]
@@ -9109,4 +9223,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "e1520f1342ab527bc3bb2619f8909cbdddeb227c14614eb3d82e133961f1f4d2"
+content-hash = "613a56e7dc5551be660388fb8603f6139dbc5d440ea39f3ba931870dc3234bb4"
diff --git a/pyproject.toml b/pyproject.toml
index 5496eddec6de..7e7d7276b6d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "opendevin"
-version = "0.8.1"
+version = "0.8.3"
description = "OpenDevin: Code Less, Make More"
authors = ["OpenDevin"]
license = "MIT"
@@ -34,25 +34,29 @@ minio = "^7.2.7"
gevent = "^24.2.1"
pyarrow = "17.0.0" # transitive dependency, pinned here to avoid conflicts
tenacity = "^8.5.0"
-zope-interface = "6.4.post2"
+zope-interface = "7.0.1"
pathspec = "^0.12.1"
google-cloud-aiplatform = "*"
-grep-ast = "0.3.2"
+grep-ast = "0.3.3"
tree-sitter = "0.21.3"
+bashlex = "^0.18"
+pyjwt = "^2.9.0"
+dirhash = "*"
[tool.poetry.group.llama-index.dependencies]
llama-index = "*"
llama-index-vector-stores-chroma = "*"
chromadb = "*"
llama-index-embeddings-huggingface = "*"
-torch = "2.2.0"
+torch = "2.2.2"
llama-index-embeddings-azure-openai = "*"
llama-index-embeddings-ollama = "*"
[tool.poetry.group.dev.dependencies]
-ruff = "0.5.5"
-mypy = "1.11.0"
-pre-commit = "3.7.1"
+ruff = "0.5.7"
+mypy = "1.11.1"
+pre-commit = "3.8.0"
+build = "*"
[tool.poetry.group.test.dependencies]
pytest = "*"
@@ -72,6 +76,7 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]
+
[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -83,6 +88,7 @@ python-pptx = "*"
pylatexenc = "*"
opencv-python = "*"
+
[build-system]
build-backend = "poetry.core.masonry.api"
requires = [
@@ -105,9 +111,13 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"
+
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"
swebench = { git = "https://github.com/OpenDevin/SWE-bench.git" }
+func_timeout = "*"
+sympy = "*"
+gdown = "*"
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 292153611a88..390192e9738e 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -36,10 +36,11 @@ The folder is organised as follows:
├── README.md
├── conftest.py
├── mock
-│   └── [AgentName]
-│       └── [TestName]
-│           ├── prompt_*.log
-│           └── response_*.log
+│   └── [RuntimeType]
+│       └── [AgentName]
+│           └── [TestName]
+│               ├── prompt_*.log
+│               └── response_*.log
└── [TestFiles].py
```
@@ -47,15 +48,19 @@ where `conftest.py` defines the infrastructure needed to load real-world LLM pro
and responses for mocking purpose. Prompts and responses generated during real runs
of agents with real LLMs are stored under `mock/AgentName/TestName` folders.
-**Note:** Set PERSIST_SANDBOX=false to use a clean sandbox for each test.
## Run Integration Tests
Take a look at `ghcr.yml` (in the `.github/workflow` folder) to learn
-how integration tests are launched in a CI environment. You can also simply run:
+how integration tests are launched in a CI environment.
+
+You can run:
```bash
-TEST_ONLY=true ./tests/integration/regenerate.sh
+# for server runtime
+TEST_RUNTIME=server TEST_ONLY=true ./tests/integration/regenerate.sh
+# for event stream
+TEST_RUNTIME=eventstream TEST_ONLY=true ./tests/integration/regenerate.sh
```
to run all integration tests until the first failure occurs.
@@ -75,7 +80,8 @@ When you make changes to an agent's prompt, the integration tests will fail. You
by running the following command from OpenDevin's project root directory:
```bash
-./tests/integration/regenerate.sh
+TEST_RUNTIME=server ./tests/integration/regenerate.sh
+TEST_RUNTIME=eventstream ./tests/integration/regenerate.sh
```
Please note that this will:
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 0d9d84a38b57..1624e7a4f6ef 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -4,6 +4,7 @@
import shutil
import subprocess
import tempfile
+import time
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler
from threading import Thread
@@ -11,15 +12,24 @@
import pytest
from litellm import completion
+from opendevin.core.message import Message
from opendevin.llm.llm import message_separator
script_dir = os.environ.get('SCRIPT_DIR')
project_root = os.environ.get('PROJECT_ROOT')
workspace_path = os.environ.get('WORKSPACE_BASE')
+test_runtime = os.environ.get('TEST_RUNTIME')
+MOCK_ROOT_DIR = os.path.join(
+ script_dir,
+ 'mock',
+ f'{test_runtime}_runtime',
+ os.environ.get('DEFAULT_AGENT'),
+)
assert script_dir is not None, 'SCRIPT_DIR environment variable is not set'
assert project_root is not None, 'PROJECT_ROOT environment variable is not set'
assert workspace_path is not None, 'WORKSPACE_BASE environment variable is not set'
+assert test_runtime is not None, 'TEST_RUNTIME environment variable is not set'
class SecretExit(Exception):
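
With this change, mock prompts and responses are resolved per runtime as well as per agent. A minimal sketch of how `MOCK_ROOT_DIR` ends up being computed, assuming `TEST_RUNTIME=eventstream` and `DEFAULT_AGENT=CodeActAgent` (these concrete values are illustrative, not part of the patch):

```python
import os

# Illustrative values only; in conftest.py these come from environment variables.
script_dir = 'tests/integration'   # assumed SCRIPT_DIR
test_runtime = 'eventstream'       # assumed TEST_RUNTIME
default_agent = 'CodeActAgent'     # assumed DEFAULT_AGENT

mock_root_dir = os.path.join(
    script_dir, 'mock', f'{test_runtime}_runtime', default_agent
)
print(mock_root_dir)  # tests/integration/mock/eventstream_runtime/CodeActAgent
```

This matches the renamed mock folders later in this patch, e.g. `tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/`.
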
@@ -37,7 +47,19 @@ def pytest_exception_interact(node, call, report):
def filter_out_symbols(input):
+ # remove shell hostname patterns (e.g., will change between each run)
+ # opendevin@379c7fce40b4:/workspace $
+ input = re.sub(r'(opendevin|root)@.*(:/.*)', r'\1[DUMMY_HOSTNAME]\2', input)
+
+ # handle sha256 hashes
+ # sha256=4ecf8be428f55981e2a188f510ba5f9022bed88f5fb404d7d949f44382201e3d
+ input = re.sub(r'sha256=[a-z0-9]+', 'sha256=[DUMMY_HASH]', input)
+
+ # remove newlines and whitespace
input = re.sub(r'\\n|\\r\\n|\\r|\s+', '', input)
+
+ # remove all non-alphanumeric characters
+ input = re.sub(r'[^a-zA-Z0-9]', '', input)
return input
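
As a rough usage sketch (not part of the patch), the normalization above should make two prompts that differ only in hostname, wheel hash, or whitespace compare equal; the function below mirrors the `filter_out_symbols` defined in `conftest.py`, and the sample strings are made up for illustration:

```python
import re


def filter_out_symbols(text: str) -> str:
    # Mirrors conftest.py: mask hostnames and sha256 hashes, then strip
    # whitespace and all non-alphanumeric characters before comparison.
    text = re.sub(r'(opendevin|root)@.*(:/.*)', r'\1[DUMMY_HOSTNAME]\2', text)
    text = re.sub(r'sha256=[a-z0-9]+', 'sha256=[DUMMY_HASH]', text)
    text = re.sub(r'\\n|\\r\\n|\\r|\s+', '', text)
    return re.sub(r'[^a-zA-Z0-9]', '', text)


a = 'opendevin@379c7fce40b4:/workspace $ pip wheel\nsha256=4ecf8be428f55981'
b = 'opendevin@0123456789ab:/workspace $ pip wheel\nsha256=deadbeefdeadbeef'
assert filter_out_symbols(a) == filter_out_symbols(b)
```
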
@@ -54,9 +76,7 @@ def apply_prompt_and_get_mock_response(test_name: str, messages: str, id: int) -
Note: this function blindly replaces existing prompt file with the given
input without checking the contents.
"""
- mock_dir = os.path.join(
- script_dir, 'mock', os.environ.get('DEFAULT_AGENT'), test_name
- )
+ mock_dir = os.path.join(MOCK_ROOT_DIR, test_name)
prompt_file_path = os.path.join(mock_dir, f'prompt_{"{0:03}".format(id)}.log')
resp_file_path = os.path.join(mock_dir, f'response_{"{0:03}".format(id)}.log')
try:
@@ -88,47 +108,48 @@ def get_mock_response(test_name: str, messages: str, id: int) -> str:
we start from the end of the file, but again, that is unnecessary and only
makes test code harder to understand.
"""
+ mock_dir = os.path.join(MOCK_ROOT_DIR, test_name)
prompt = filter_out_symbols(messages)
- mock_dir = os.path.join(
- script_dir, 'mock', os.environ.get('DEFAULT_AGENT'), test_name
- )
prompt_file_path = os.path.join(mock_dir, f'prompt_{"{0:03}".format(id)}.log')
resp_file_path = os.path.join(mock_dir, f'response_{"{0:03}".format(id)}.log')
# Open the prompt file and compare its contents
- with open(prompt_file_path, 'r') as f:
- file_content = filter_out_symbols(f.read())
- if file_content == prompt:
- # Read the response file and return its content
- with open(resp_file_path, 'r') as resp_file:
- return resp_file.read()
- else:
- # print the mismatched lines
- print('Mismatched Prompt File path', prompt_file_path)
- print('---' * 10)
- # Create a temporary file to store messages
- with tempfile.NamedTemporaryFile(
- delete=False, mode='w', encoding='utf-8'
- ) as tmp_file:
- tmp_file_path = tmp_file.name
- tmp_file.write(messages)
-
- try:
- # Use diff command to compare files and capture the output
- result = subprocess.run(
- ['diff', '-u', prompt_file_path, tmp_file_path],
- capture_output=True,
- text=True,
- )
- if result.returncode != 0:
- print('Diff:')
- print(result.stdout)
- else:
- print('No differences found.')
- finally:
- # Clean up the temporary file
- os.remove(tmp_file_path)
-
- print('---' * 10)
+ from test_patch import test_patces
+
+ for key, value in test_patces.items():
+ with open(prompt_file_path, 'r') as f:
+ file_content = filter_out_symbols(f.read().replace(key, value))
+ if file_content.strip() == prompt.strip():
+ # Read the response file and return its content
+ with open(resp_file_path, 'r') as resp_file:
+ return resp_file.read()
+ else:
+ # print the mismatched lines
+ print('Mismatched Prompt File path', prompt_file_path)
+ print('---' * 10)
+ # Create a temporary file to store messages
+ with tempfile.NamedTemporaryFile(
+ delete=False, mode='w', encoding='utf-8'
+ ) as tmp_file:
+ tmp_file_path = tmp_file.name
+ tmp_file.write(messages)
+
+ try:
+ # Use diff command to compare files and capture the output
+ result = subprocess.run(
+ ['diff', '-u', prompt_file_path, tmp_file_path],
+ capture_output=True,
+ text=True,
+ )
+ if result.returncode != 0:
+ print('Diff:')
+ print(result.stdout)
+ else:
+ print('No differences found.')
+ finally:
+ # Clean up the temporary file
+ os.remove(tmp_file_path)
+
+ print('---' * 10)
def mock_user_response(*args, test_name, **kwargs):
@@ -157,8 +178,12 @@ def mock_completion(*args, test_name, **kwargs):
global cur_id
messages = kwargs['messages']
message_str = ''
+ if isinstance(messages[0], Message):
+ messages = [message.model_dump() for message in messages]
for message in messages:
- message_str += message_separator + message['content']
+ for m in message['content']:
+ if m['type'] == 'text':
+ message_str += message_separator + m['text']
# this assumes all response_(*).log filenames are in numerical order, starting from one
cur_id += 1
if os.environ.get('FORCE_APPLY_PROMPTS') == 'true':
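
For reference, here is a minimal sketch (not part of the patch) of the flattening that `mock_completion` now performs when messages carry structured content. In the real code `message_separator` is imported from `opendevin.llm.llm`; it is stubbed here with an assumed value:

```python
# Hypothetical separator value, stubbed for illustration only.
message_separator = '\n\n----------\n\n'

# A structured message as produced after Message.model_dump(): the content is
# a list of typed parts rather than a plain string.
messages = [
    {'role': 'user', 'content': [{'type': 'text', 'text': 'Browse localhost:8000.'}]},
]

message_str = ''
for message in messages:
    for part in message['content']:
        if part['type'] == 'text':
            message_str += message_separator + part['text']

print(message_str)
```

Only the `text` parts contribute to the prompt string that is matched against the recorded `prompt_*.log` files.
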
@@ -211,7 +236,9 @@ def http_server():
thread = Thread(target=server.serve_forever)
thread.setDaemon(True)
thread.start()
+ time.sleep(1)
+ print('HTTP server started...')
yield server
# Stop the server
diff --git a/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_001.log b/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_001.log
deleted file mode 100644
index 541e772d8bee..000000000000
--- a/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_001.log
+++ /dev/null
@@ -1,4 +0,0 @@
-Let's start by installing the `pymsgbox` package.
-
-
-%pip install pymsgbox==1.0.9
diff --git a/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_002.log b/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_002.log
deleted file mode 100644
index 57d9d054a25e..000000000000
--- a/tests/integration/mock/CodeActSWEAgent/test_ipython_module/response_002.log
+++ /dev/null
@@ -1,15 +0,0 @@
-Next, let's import `pymsgbox` and print its version to `/workspace/test.txt`.
-
-
-import pymsgbox
-
-version = pymsgbox.__version__
-
-with open("/workspace/test.txt", "w") as file:
- file.write(f"pymsgbox version: {version}\n")
-
-# Verify the content of the file
-with open("/workspace/test.txt", "r") as file:
- content = file.read()
-
-content
diff --git a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_001.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_001.log
similarity index 100%
rename from tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_001.log
rename to tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_001.log
diff --git a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_002.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_002.log
similarity index 100%
rename from tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_002.log
rename to tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_002.log
diff --git a/tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_003.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_003.log
similarity index 100%
rename from tests/integration/mock/BrowsingAgent/test_browse_internet/prompt_003.log
rename to tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/prompt_003.log
diff --git a/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_001.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_001.log
new file mode 100644
index 000000000000..8bdf5efd6c72
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_001.log
@@ -0,0 +1,2 @@
+In order to accomplish my goal, I need to navigate to the localhost page.
+```goto('http://localhost:8000'
diff --git a/tests/integration/mock/CodeActAgent/test_browse_internet/response_003.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_002.log
similarity index 100%
rename from tests/integration/mock/CodeActAgent/test_browse_internet/response_003.log
rename to tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_002.log
diff --git a/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_003.log b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_003.log
new file mode 100644
index 000000000000..2eeae38382de
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/BrowsingAgent/test_browse_internet/response_003.log
@@ -0,0 +1,3 @@
+In order to accomplish my goal, I need to read the static text that reveals the answer to life, the universe, and everything.
+
+```send_msg_to_user('The answer is OpenDevin is all you need!'
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_001.log
new file mode 100644
index 000000000000..62ee7a52b602
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_001.log
@@ -0,0 +1,401 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7|    return '' + ''.join([f'{i}' for i in numbers]) + ''
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.
+
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with .
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_002.log
new file mode 100644
index 000000000000..292bb8cfdc66
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_002.log
@@ -0,0 +1,126 @@
+
+
+----------
+
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+# Goal:
+Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life for you.. I should start with: Get the content on "http://localhost:8000"
+
+# Action Space
+
+16 different types of actions are available.
+
+noop(wait_ms: float = 1000)
+ Examples:
+ noop()
+
+ noop(500)
+
+send_msg_to_user(text: str)
+ Examples:
+ send_msg_to_user('Based on the results of my search, the city was built in 1751.')
+
+scroll(delta_x: float, delta_y: float)
+ Examples:
+ scroll(0, 200)
+
+ scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+ Examples:
+ fill('237', 'example value')
+
+ fill('45', 'multi-line\nexample')
+
+ fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+ Examples:
+ select_option('48', 'blue')
+
+ select_option('48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ click('51')
+
+ click('b22', button='right')
+
+ click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ dblclick('12')
+
+ dblclick('ca42', button='right')
+
+ dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+ Examples:
+ hover('b8')
+
+press(bid: str, key_comb: str)
+ Examples:
+ press('88', 'Backspace')
+
+ press('a26', 'Control+a')
+
+ press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+ Examples:
+ focus('b455')
+
+clear(bid: str)
+ Examples:
+ clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+ Examples:
+ drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+ Examples:
+ upload_file('572', 'my_receipt.pdf')
+
+ upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+
+go_back()
+ Examples:
+ go_back()
+
+go_forward()
+ Examples:
+ go_forward()
+
+goto(url: str)
+ Examples:
+ goto('http://www.example.com')
+
+Multiple actions can be provided at once. Example:
+fill('a12', 'example with "quotes"')
+click('51')
+click('48', button='middle', modifiers=['Shift'])
+Multiple actions are meant to be executed sequentially without any feedback from the page.
+Don't execute multiple actions at once if you need feedback from the page.
+
+
+
+----------
+
+# Current Accessibility Tree:
+
+
+# Previous Actions
+
+
+Here is an example with chain of thought of a valid action when clicking on a button:
+"
+In order to accomplish my goal I need to click on the button with bid 12
+```click("12")```
+"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_003.log
new file mode 100644
index 000000000000..991b96738722
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_003.log
@@ -0,0 +1,130 @@
+
+
+----------
+
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+# Goal:
+Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life for you.. I should start with: Get the content on "http://localhost:8000"
+
+# Action Space
+
+16 different types of actions are available.
+
+noop(wait_ms: float = 1000)
+ Examples:
+ noop()
+
+ noop(500)
+
+send_msg_to_user(text: str)
+ Examples:
+ send_msg_to_user('Based on the results of my search, the city was built in 1751.')
+
+scroll(delta_x: float, delta_y: float)
+ Examples:
+ scroll(0, 200)
+
+ scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+ Examples:
+ fill('237', 'example value')
+
+ fill('45', 'multi-line\nexample')
+
+ fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+ Examples:
+ select_option('48', 'blue')
+
+ select_option('48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ click('51')
+
+ click('b22', button='right')
+
+ click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ dblclick('12')
+
+ dblclick('ca42', button='right')
+
+ dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+ Examples:
+ hover('b8')
+
+press(bid: str, key_comb: str)
+ Examples:
+ press('88', 'Backspace')
+
+ press('a26', 'Control+a')
+
+ press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+ Examples:
+ focus('b455')
+
+clear(bid: str)
+ Examples:
+ clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+ Examples:
+ drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+ Examples:
+ upload_file('572', 'my_receipt.pdf')
+
+ upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+
+go_back()
+ Examples:
+ go_back()
+
+go_forward()
+ Examples:
+ go_forward()
+
+goto(url: str)
+ Examples:
+ goto('http://www.example.com')
+
+Multiple actions can be provided at once. Example:
+fill('a12', 'example with "quotes"')
+click('51')
+click('48', button='middle', modifiers=['Shift'])
+Multiple actions are meant to be executed sequentially without any feedback from the page.
+Don't execute multiple actions at once if you need feedback from the page.
+
+
+
+----------
+
+# Current Accessibility Tree:
+RootWebArea 'The Ultimate Answer', focused
+ [8] heading 'The Ultimate Answer'
+ [9] paragraph ''
+ StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
+ [10] button 'Click me', clickable
+
+# Previous Actions
+goto("http://localhost:8000")
+
+Here is an example with chain of thought of a valid action when clicking on a button:
+"
+In order to accomplish my goal I need to click on the button with bid 12
+```click("12")```
+"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_004.log
new file mode 100644
index 000000000000..566df42a71a6
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_004.log
@@ -0,0 +1,132 @@
+
+
+----------
+
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+# Goal:
+Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life for you.. I should start with: Get the content on "http://localhost:8000"
+
+# Action Space
+
+16 different types of actions are available.
+
+noop(wait_ms: float = 1000)
+ Examples:
+ noop()
+
+ noop(500)
+
+send_msg_to_user(text: str)
+ Examples:
+ send_msg_to_user('Based on the results of my search, the city was built in 1751.')
+
+scroll(delta_x: float, delta_y: float)
+ Examples:
+ scroll(0, 200)
+
+ scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+ Examples:
+ fill('237', 'example value')
+
+ fill('45', 'multi-line\nexample')
+
+ fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+ Examples:
+ select_option('48', 'blue')
+
+ select_option('48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ click('51')
+
+ click('b22', button='right')
+
+ click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'Meta', 'Shift']] = [])
+ Examples:
+ dblclick('12')
+
+ dblclick('ca42', button='right')
+
+ dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+ Examples:
+ hover('b8')
+
+press(bid: str, key_comb: str)
+ Examples:
+ press('88', 'Backspace')
+
+ press('a26', 'Control+a')
+
+ press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+ Examples:
+ focus('b455')
+
+clear(bid: str)
+ Examples:
+ clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+ Examples:
+ drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+ Examples:
+ upload_file('572', 'my_receipt.pdf')
+
+ upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+
+go_back()
+ Examples:
+ go_back()
+
+go_forward()
+ Examples:
+ go_forward()
+
+goto(url: str)
+ Examples:
+ goto('http://www.example.com')
+
+Multiple actions can be provided at once. Example:
+fill('a12', 'example with "quotes"')
+click('51')
+click('48', button='middle', modifiers=['Shift'])
+Multiple actions are meant to be executed sequentially without any feedback from the page.
+Don't execute multiple actions at once if you need feedback from the page.
+
+
+
+----------
+
+# Current Accessibility Tree:
+RootWebArea 'The Ultimate Answer', focused
+ [8] heading 'The Ultimate Answer'
+ [9] paragraph ''
+ StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
+ [10] button 'Click me', clickable, focused
+ StaticText 'The answer is OpenDevin is all you need!'
+
+# Previous Actions
+goto("http://localhost:8000")
+click("10")
+
+Here is an example with chain of thought of a valid action when clicking on a button:
+"
+In order to accomplish my goal I need to click on the button with bid 12
+```click("12")```
+"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_005.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_005.log
new file mode 100644
index 000000000000..48ebcb32b81e
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/prompt_005.log
@@ -0,0 +1,415 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.
+
+----------
+
+
+
+Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life for you. I should start with: Get the content on "http://localhost:8000"
+
+
+----------
+
+OBSERVATION:
+{'content': 'The ultimate answer to life, the universe, and everything is: OpenDevin is all you need!'}
+
+ENVIRONMENT REMINDER: You have 8 turns left to complete the task. When finished reply with
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_001.log
new file mode 100644
index 000000000000..25975d7d1366
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_001.log
@@ -0,0 +1,4 @@
+Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life for you.
+
+
+Get the content on "http://localhost:8000"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_002.log
new file mode 100644
index 000000000000..05dffd72f414
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_002.log
@@ -0,0 +1,5 @@
+To accomplish our goal of retrieving the ultimate answer to life from localhost:8000, I need to start by accessing the content on that page. Here's my thought process and action:
+
+In order to accomplish my goal, I need to navigate to the specified URL using the goto action.
+
+```goto("http://localhost:8000"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_003.log
new file mode 100644
index 000000000000..66e87a3550dc
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_003.log
@@ -0,0 +1,6 @@
+To accomplish our goal of retrieving the ultimate answer to life, we need to click on the "Click me" button. The button has a bid of 10, so we should click on it.
+
+Here's the action I'll take:
+
+In order to accomplish my goal, I need to click on the button with bid 10
+```click("10"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_004.log
new file mode 100644
index 000000000000..ae88605a499b
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_004.log
@@ -0,0 +1,6 @@
+Based on the current Accessibility Tree and the previous actions taken, it appears that we have already successfully navigated to the webpage and clicked the button to reveal the answer. The answer is now visible on the page.
+
+To accomplish our goal of retrieving the ultimate answer to life, I will now send this information to the user:
+
+```
+send_msg_to_user("The ultimate answer to life, the universe, and everything is: OpenDevin is all you need!"
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_005.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_005.log
new file mode 100644
index 000000000000..13e8a93af051
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet/response_005.log
@@ -0,0 +1,11 @@
+Thank you for providing the content from localhost:8000. Based on the information retrieved, I can now answer your question about the ultimate answer to life.
+
+The ultimate answer to life, according to the content on localhost:8000, is:
+
+"OpenDevin is all you need!"
+
+This appears to be a playful reference to the popular science fiction series "The Hitchhiker's Guide to the Galaxy" by Douglas Adams, where the answer to the ultimate question of life, the universe, and everything is famously "42". In this case, it's been adapted to promote OpenDevin as the ultimate solution.
+
+Since I've successfully retrieved and reported the answer from the specified localhost, I believe I've completed the task you've given me.
+
+
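The response logs above pair one-to-one with the numbered prompt logs in the same fixture directory: for each LLM call the integration test triggers, the mock backend is meant to return the recorded completion instead of hitting a real model. The test harness itself is not part of this diff, so the following is only a minimal, hypothetical Python sketch of that replay idea; the class name, file layout, and method signature are assumptions for illustration, not the actual OpenDevin test code.

```python
from pathlib import Path


class ReplayMock:
    """Toy stand-in for an LLM: returns pre-recorded completions in call order.

    Assumes a directory containing response_001.log, response_002.log, ...
    matching the prompts the agent will send, mirroring the layout of the
    mock logs added in this diff (not the real harness).
    """

    def __init__(self, log_dir: str):
        # Zero-padded names sort lexicographically in call order.
        self.responses = sorted(Path(log_dir).glob("response_*.log"))
        self.calls = 0

    def completion(self, prompt: str) -> str:
        # The prompt is ignored here; a real harness might also assert that it
        # matches the corresponding prompt_NNN.log fixture.
        text = self.responses[self.calls].read_text()
        self.calls += 1
        return text


# Hypothetical usage:
# mock = ReplayMock("tests/integration/mock/eventstream_runtime/CodeActAgent/test_browse_internet")
# print(mock.completion("<first prompt>")[:80])
```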
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_001.log
new file mode 100644
index 000000000000..8f33d2e2f2bc
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_001.log
@@ -0,0 +1,401 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result says something like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry by running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Fix typos in bad.txt. Do not ask me for confirmation at any point.
+
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with .
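The system prompt captured above spends a lot of space on `edit_file_by_replace`, in particular the rule that `to_replace` must match the existing text exactly and should be made unique. As a rough illustration of that documented rule only, and not OpenDevin's actual agentskills implementation, a minimal Python sketch might look like this:

```python
def edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
    """Replace the first exact occurrence of `to_replace` with `new_content`.

    Illustrative sketch of the behaviour described in the prompt: the needle
    must match character-for-character and should be unique within the file.
    """
    if not to_replace:
        raise ValueError("`to_replace` should NOT be empty")

    with open(file_name, encoding="utf-8") as f:
        text = f.read()

    occurrences = text.count(to_replace)
    if occurrences == 0:
        raise ValueError(f"`to_replace` not found verbatim in {file_name}")
    if occurrences > 1:
        # The prompt asks the agent to include enough surrounding lines to make
        # the needle unique; fail loudly rather than guess which match was meant.
        raise ValueError("`to_replace` is not unique; include more surrounding lines")

    with open(file_name, "w", encoding="utf-8") as f:
        f.write(text.replace(to_replace, new_content, 1))
```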
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_002.log
new file mode 100644
index 000000000000..f33b77e59535
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_002.log
@@ -0,0 +1,422 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result says something like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry by running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Fix typos in bad.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by opening the `bad.txt` file to identify the typos.
+
+open_file('bad.txt')
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total)]
+(this is the beginning of the file)
+1|This is a stupid typoo.
+2|Really?
+3|No mor typos!
+4|Enjoy!
+(this is the end of the file)
+
+
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with
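Both `insert_content_at_line` and `append_file` are documented in the prompt above mostly by example, and the corrected `append_file` docstring takes only a file name and the content to append. For readers skimming these fixtures, here is a minimal, hypothetical sketch of the described behaviour (again not the real agentskills code): insertion happens at the given 1-based line number, pushing existing lines down, while append simply adds to the end of the file.

```python
def insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
    """Insert `content` so that it starts at 1-based `line_number` (sketch)."""
    with open(file_name, encoding="utf-8") as f:
        lines = f.readlines()
    new_lines = [line + "\n" for line in content.splitlines()]
    index = max(line_number - 1, 0)   # clamp and convert 1-based to 0-based
    lines[index:index] = new_lines    # existing lines shift down, none are modified
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(lines)


def append_file(file_name: str, content: str) -> None:
    """Append `content` to the end of the file (sketch)."""
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(content if content.endswith("\n") else content + "\n")
```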
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_003.log
new file mode 100644
index 000000000000..14ed92579529
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_003.log
@@ -0,0 +1,451 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Fix typos in bad.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by opening the `bad.txt` file to identify the typos.
+
+open_file('bad.txt')
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total)]
+(this is the beginning of the file)
+1|This is a stupid typoo.
+2|Really?
+3|No mor typos!
+4|Enjoy!
+(this is the end of the file)
+
+
+----------
+
+I will correct the typos in the `bad.txt` file.
+
+1. "stupid typoo" -> "stupid typo"
+2. "No mor typos!" -> "No more typos!"
+
+Let's make these changes.
+
+edit_file_by_replace(
+ 'bad.txt',
+ to_replace="This is a stupid typoo.",
+ new_content="This is a stupid typo."
+)
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total after edit)]
+(this is the beginning of the file)
+1|This is a stupid typo.
+2|Really?
+3|No mor typos!
+4|Enjoy!
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with
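The prompt's warning about indentation (if the agent wants to add the line `    print(x)`, it must write out the leading spaces) is easy to miss inside these long fixtures. The following small demo is purely illustrative and uses plain `compile` rather than the editing helpers; it only shows why content spliced in verbatim must carry its own indentation.

```python
# Hypothetical demo of the indentation rule described in the prompt: content
# passed to the editing helpers is spliced into the file verbatim, so the
# agent has to write out leading spaces itself.
well_indented = "def show(x):\n    print(x)\n"   # '    print(x)' keeps its four spaces
badly_indented = "def show(x):\nprint(x)\n"      # indentation dropped

compile(well_indented, "<demo>", "exec")          # compiles fine

try:
    compile(badly_indented, "<demo>", "exec")
except IndentationError as err:
    print("badly indented edit would fail:", err)
```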
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_004.log
new file mode 100644
index 000000000000..80b43b7dc699
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/prompt_004.log
@@ -0,0 +1,479 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Fix typos in bad.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by opening the `bad.txt` file to identify the typos.
+
+open_file('bad.txt')
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total)]
+(this is the beginning of the file)
+1|This is a stupid typoo.
+2|Really?
+3|No mor typos!
+4|Enjoy!
+(this is the end of the file)
+
+
+----------
+
+I will correct the typos in the `bad.txt` file.
+
+1. "stupid typoo" -> "stupid typo"
+2. "No mor typos!" -> "No more typos!"
+
+Let's make these changes.
+
+edit_file_by_replace(
+ 'bad.txt',
+ to_replace="This is a stupid typoo.",
+ new_content="This is a stupid typo."
+)
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total after edit)]
+(this is the beginning of the file)
+1|This is a stupid typo.
+2|Really?
+3|No mor typos!
+4|Enjoy!
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+
+----------
+
+Let's correct the remaining typo in the file.
+
+1. "No mor typos!" -> "No more typos!"
+
+I will make this change now.
+
+edit_file_by_replace(
+ 'bad.txt',
+ to_replace="No mor typos!",
+ new_content="No more typos!"
+)
+
+
+----------
+
+OBSERVATION:
+[File: /workspace/bad.txt (4 lines total after edit)]
+(this is the beginning of the file)
+1|This is a stupid typo.
+2|Really?
+3|No more typos!
+4|Enjoy!
+(this is the end of the file)
+[File updated (edited at line 3). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+
+ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with
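The transcript above drives `edit_file_by_replace`, which requires the search text to match the file exactly and to identify a single location. As a rough illustration of that contract (a minimal sketch with illustrative names, not the function used by the agent runtime), an exact-match, single-occurrence replacement can be written as:

```
from pathlib import Path

def replace_exact(path: str, to_replace: str, new_content: str) -> None:
    """Replace exactly one occurrence of `to_replace` in the file at `path`."""
    if not to_replace:
        raise ValueError("to_replace must not be empty")
    text = Path(path).read_text()
    if text.count(to_replace) != 1:
        # Zero or multiple matches: widen the snippet (e.g. include the next
        # line, as the system prompt suggests) until it is unique.
        raise ValueError("to_replace must match exactly one location")
    Path(path).write_text(text.replace(to_replace, new_content, 1))

# e.g. replace_exact('/workspace/bad.txt', 'This is a stupid typoo.', 'This is a stupid typo.')
```

Requiring a unique match is what keeps the edit unambiguous when the same line appears more than once in a file.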
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_001.log
new file mode 100644
index 000000000000..58fb6048ba95
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_001.log
@@ -0,0 +1,4 @@
+Sure! Let's start by opening the `bad.txt` file to identify the typos.
+
+
+open_file('bad.txt')
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_002.log
new file mode 100644
index 000000000000..b83d070ecd9f
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_002.log
@@ -0,0 +1,13 @@
+I will correct the typos in the `bad.txt` file.
+
+1. "stupid typoo" -> "stupid typo"
+2. "No mor typos!" -> "No more typos!"
+
+Let's make these changes.
+
+
+edit_file_by_replace(
+ 'bad.txt',
+ to_replace="This is a stupid typoo.",
+ new_content="This is a stupid typo."
+)
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_003.log
new file mode 100644
index 000000000000..c5096a8785dd
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_003.log
@@ -0,0 +1,11 @@
+Let's correct the remaining typo in the file.
+
+1. "No mor typos!" -> "No more typos!"
+
+I will make this change now.
+
+edit_file_by_replace(
+ 'bad.txt',
+ to_replace="No mor typos!",
+ new_content="No more typos!"
+)
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_004.log
new file mode 100644
index 000000000000..4fee4faa5e15
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_edits/response_004.log
@@ -0,0 +1,11 @@
+The typos in `bad.txt` have been corrected. Here is the final content of the file:
+
+```
+1|This is a stupid typo.
+2|Really?
+3|No more typos!
+4|Enjoy!
+```
+
+If you have any further requests, feel free to ask!
+
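The response above records the corrected contents of `bad.txt`. To spot-check that outcome outside the agent loop, the file can be read back and compared line by line (a sketch only; the path and expected lines are taken from the transcript, and this check is not part of the fixtures):

```
from pathlib import Path

expected = ["This is a stupid typo.", "Really?", "No more typos!", "Enjoy!"]
lines = Path("/workspace/bad.txt").read_text().splitlines()
assert lines == expected, f"unexpected content: {lines!r}"
```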
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_001.log
new file mode 100644
index 000000000000..c620c8918435
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_001.log
@@ -0,0 +1,401 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point.
+
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with .
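The system prompt above tells the agent to redirect output to a log and background any command that may not return (`python3 app.py > server.log 2>&1 &`). The same pattern can be expressed without a shell job control operator, for instance via the standard library (an illustrative sketch only, not something the fixtures or the runtime use):

```
import subprocess

def start_in_background(cmd: list[str], log_path: str) -> subprocess.Popen:
    """Launch `cmd` detached from stdin, appending stdout/stderr to `log_path`."""
    log = open(log_path, "ab")  # kept open for the lifetime of the child process
    return subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT,
                            stdin=subprocess.DEVNULL)

# e.g. proc = start_in_background(["python3", "app.py"], "server.log")
```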
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_002.log
new file mode 100644
index 000000000000..013180481a26
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/prompt_002.log
@@ -0,0 +1,416 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! I will create a text file named `test.txt` in the `/workspace` directory and write 'hello world' to it.
+
+with open('/workspace/test.txt', 'w') as file:
+ file.write('hello world')
+
+
+----------
+
+OBSERVATION:
+[Code executed successfully with no output]
+
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with
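A quick way to confirm the result recorded above is to read the file back (a sketch of the kind of check the surrounding integration test presumably makes; the actual assertion is not part of this diff):

```
from pathlib import Path

content = Path("/workspace/test.txt").read_text()
assert content == "hello world", f"unexpected content: {content!r}"
```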
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_001.log
new file mode 100644
index 000000000000..ea1ad9705196
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_001.log
@@ -0,0 +1,5 @@
+Sure! I will create a text file named `test.txt` in the `/workspace` directory and write 'hello world' to it.
+
+
+with open('/workspace/test.txt', 'w') as file:
+ file.write('hello world')
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_002.log
new file mode 100644
index 000000000000..42790c2d4a56
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython/response_002.log
@@ -0,0 +1,3 @@
+The file `test.txt` has been successfully created in the `/workspace` directory with the content 'hello world'. If you have any further requests, please let me know!
+
+
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_001.log
new file mode 100644
index 000000000000..3ecc273f588d
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_001.log
@@ -0,0 +1,401 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ content: str: The content to append.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTeX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '
' + ''.join([f'
{i}
' for i in numbers]) + '
'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+Install and import pymsgbox==1.0.9 and print its version in /workspace/test.txt. Do not ask me for confirmation at any point.
+
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with .
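(For readers tracing this fixture: the recorded turns that follow boil down to the sketch below. It assumes pymsgbox exposes __version__, which the observations later in this directory confirm; it is illustrative only and not part of the recorded prompt.)

import subprocess
import sys

# Install the pinned version non-interactively, as the task requires.
subprocess.check_call([sys.executable, "-m", "pip", "install", "pymsgbox==1.0.9"])

import pymsgbox

# Write the installed version to the file named in the task.
with open("/workspace/test.txt", "w") as f:
    f.write(f"pymsgbox version: {pymsgbox.__version__}\n")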
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_002.log
new file mode 100644
index 000000000000..54f39c635ddb
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_002.log
@@ -0,0 +1,431 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Install and import pymsgbox==1.0.9 and print its version in /workspace/test.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by installing the `pymsgbox` package.
+
+%pip install pymsgbox==1.0.9
+
+
+----------
+
+OBSERVATION:
+Collecting pymsgbox==1.0.9
+ Downloading PyMsgBox-1.0.9.tar.gz (18 kB)
+ Installing build dependencies ... [?25l- \ | / done
+[?25h Getting requirements to build wheel ... [?25l- done
+[?25h Preparing metadata (pyproject.toml) ... [?25l- done
+[?25hBuilding wheels for collected packages: pymsgbox
+ Building wheel for pymsgbox (pyproject.toml) ... [?25l- done
+[?25h Created wheel for pymsgbox: filename=PyMsgBox-1.0.9-py3-none-any.whl size=7406 sha256=7cc4ff18827e3c2e2bd0780a92ca70fd625dc7645b994b6f3191dfb0660e606e
+ Stored in directory: /home/opendevin/.cache/pip/wheels/85/92/63/e126ee5f33d8f2ed04f96e43ef5df7270a2f331848752e8662
+Successfully built pymsgbox
+Installing collected packages: pymsgbox
+Successfully installed pymsgbox-1.0.9
+
+[notice] A new release of pip is available: 24.1 -> 24.2
+[notice] To update, run: pip install --upgrade pip
+Note: you may need to restart the kernel to use updated packages.
+
+
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with
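(Aside, for the worked example repeated in these prompts: after the edit_file_by_replace call shown there, the full app.py reads roughly as below. This is reassembled from the example for reference and is not additional fixture content.)

from flask import Flask

app = Flask(__name__)

@app.route('/')
def index():
    numbers = list(range(1, 11))
    # Each number becomes its own table row, matching the example edit.
    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'

if __name__ == '__main__':
    app.run(port=5000)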
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_003.log
new file mode 100644
index 000000000000..2d2e45921813
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_003.log
@@ -0,0 +1,448 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Install and import pymsgbox==1.0.9 and print its version in /workspace/test.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by installing the `pymsgbox` package.
+
+%pip install pymsgbox==1.0.9
+
+
+----------
+
+OBSERVATION:
+Collecting pymsgbox==1.0.9
+ Downloading PyMsgBox-1.0.9.tar.gz (18 kB)
+ Installing build dependencies ... [?25l- \ | / done
+[?25h Getting requirements to build wheel ... [?25l- done
+[?25h Preparing metadata (pyproject.toml) ... [?25l- done
+[?25hBuilding wheels for collected packages: pymsgbox
+ Building wheel for pymsgbox (pyproject.toml) ... [?25l- done
+[?25h Created wheel for pymsgbox: filename=PyMsgBox-1.0.9-py3-none-any.whl size=7406 sha256=7cc4ff18827e3c2e2bd0780a92ca70fd625dc7645b994b6f3191dfb0660e606e
+ Stored in directory: /home/opendevin/.cache/pip/wheels/85/92/63/e126ee5f33d8f2ed04f96e43ef5df7270a2f331848752e8662
+Successfully built pymsgbox
+Installing collected packages: pymsgbox
+Successfully installed pymsgbox-1.0.9
+
+[notice] A new release of pip is available: 24.1 -> 24.2
+[notice] To update, run: pip install --upgrade pip
+Note: you may need to restart the kernel to use updated packages.
+
+
+----------
+
+Now, let's import `pymsgbox`, get its version, and write it to `/workspace/test.txt`.
+
+import pymsgbox
+
+version = pymsgbox.__version__
+
+with open('/workspace/test.txt', 'w') as file:
+ file.write(f"pymsgbox version: {version}\n")
+
+
+----------
+
+OBSERVATION:
+[Code executed successfully with no output]
+
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with
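(The next recorded turn verifies the write with cat. An equivalent Python read-back check, shown only as a sketch, would be the following; the expected content comes from the observation recorded in prompt_004.)

# Read the file back and confirm the version string was written as expected.
with open("/workspace/test.txt") as f:
    content = f.read()

# "pymsgbox version: 1.0.9" is what the recorded run reports.
assert content.strip() == "pymsgbox version: 1.0.9", content
print(content, end="")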
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_004.log
new file mode 100644
index 000000000000..043c096f6117
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/prompt_004.log
@@ -0,0 +1,463 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE , , or per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include in your response.
+IMPORTANT: Execute code using , , or whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+
+create_file('app.py')
+
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+
+pip install flask
+
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+
+cat server.log
+
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+
+Get the content on "http://127.0.0.1:5000"
+
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+
+kill 124
+
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+
+open_file('app.py')
+
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+
+
+USER:
+Observation:
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+
+python3 app.py > server.log 2>&1 &
+
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+
+
+--- END OF EXAMPLE ---
+
+
+NOW, LET'S START!
+
+----------
+
+Install and import pymsgbox==1.0.9 and print its version in /workspace/test.txt. Do not ask me for confirmation at any point.
+
+----------
+
+Sure! Let's start by installing the `pymsgbox` package.
+
+%pip install pymsgbox==1.0.9
+
+
+----------
+
+OBSERVATION:
+Collecting pymsgbox==1.0.9
+ Downloading PyMsgBox-1.0.9.tar.gz (18 kB)
+ Installing build dependencies ... [?25l- \ | / done
+[?25h Getting requirements to build wheel ... [?25l- done
+[?25h Preparing metadata (pyproject.toml) ... [?25l- done
+[?25hBuilding wheels for collected packages: pymsgbox
+ Building wheel for pymsgbox (pyproject.toml) ... [?25l- done
+[?25h Created wheel for pymsgbox: filename=PyMsgBox-1.0.9-py3-none-any.whl size=7406 sha256=7cc4ff18827e3c2e2bd0780a92ca70fd625dc7645b994b6f3191dfb0660e606e
+ Stored in directory: /home/opendevin/.cache/pip/wheels/85/92/63/e126ee5f33d8f2ed04f96e43ef5df7270a2f331848752e8662
+Successfully built pymsgbox
+Installing collected packages: pymsgbox
+Successfully installed pymsgbox-1.0.9
+
+[notice] A new release of pip is available: 24.1 -> 24.2
+[notice] To update, run: pip install --upgrade pip
+Note: you may need to restart the kernel to use updated packages.
+
+
+----------
+
+Now, let's import `pymsgbox`, get its version, and write it to `/workspace/test.txt`.
+
+import pymsgbox
+
+version = pymsgbox.__version__
+
+with open('/workspace/test.txt', 'w') as file:
+ file.write(f"pymsgbox version: {version}\n")
+
+
+----------
+
+OBSERVATION:
+[Code executed successfully with no output]
+
+----------
+
+Let's verify the content of `/workspace/test.txt` to ensure the version information was written correctly.
+
+cat /workspace/test.txt
+
+
+----------
+
+OBSERVATION:
+pymsgbox version: 1.0.9
+
+opendevin@docker-desktop:/workspace $
+[Command -1 finished with exit code 0]
+
+ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_001.log
new file mode 100644
index 000000000000..3d96c1fdb215
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_001.log
@@ -0,0 +1,4 @@
+Sure! Let's start by installing the `pymsgbox` package.
+
+
+%pip install pymsgbox==1.0.9
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_002.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_002.log
new file mode 100644
index 000000000000..1886ba712d23
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_002.log
@@ -0,0 +1,9 @@
+Now, let's import `pymsgbox`, get its version, and write it to `/workspace/test.txt`.
+
+
+import pymsgbox
+
+version = pymsgbox.__version__
+
+with open('/workspace/test.txt', 'w') as file:
+ file.write(f"pymsgbox version: {version}\n")
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_003.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_003.log
new file mode 100644
index 000000000000..26fa94b072ac
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_003.log
@@ -0,0 +1,4 @@
+Let's verify the content of `/workspace/test.txt` to ensure the version information was written correctly.
+
+
+cat /workspace/test.txt
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_004.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_004.log
new file mode 100644
index 000000000000..bbebebe77ef9
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_ipython_module/response_004.log
@@ -0,0 +1,3 @@
+The version of `pymsgbox` has been successfully written to `/workspace/test.txt`. If you have any further tasks or questions, feel free to ask!
+
+
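(The prompt_NNN.log / response_NNN.log pairs above are replayed by the integration tests in place of a live LLM. As a rough illustration of that pattern only, not OpenDevin's actual mock harness, a replay stub could look like this.)

from pathlib import Path

class ReplayLLM:
    """Toy stand-in that returns recorded completions in order.

    Illustration of the fixture pattern only; the real harness lives
    under tests/integration and matches prompts more carefully.
    """

    def __init__(self, fixture_dir: str):
        # response_001.log, response_002.log, ... sorted into replay order.
        self.responses = [
            p.read_text() for p in sorted(Path(fixture_dir).glob("response_*.log"))
        ]
        self.turn = 0

    def complete(self, prompt: str) -> str:
        # Each agent turn consumes the next recorded completion.
        response = self.responses[self.turn]
        self.turn += 1
        return response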
diff --git a/tests/integration/mock/eventstream_runtime/CodeActAgent/test_write_simple_script/prompt_001.log b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_write_simple_script/prompt_001.log
new file mode 100644
index 000000000000..ddb157a1b007
--- /dev/null
+++ b/tests/integration/mock/eventstream_runtime/CodeActAgent/test_write_simple_script/prompt_001.log
@@ -0,0 +1,401 @@
+
+
+----------
+
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+The assistant can use an interactive Python (Jupyter Notebook) environment, executing code with .
+
+print("Hello World!")
+
+The assistant can execute bash commands on behalf of the user by wrapping them with and .
+
+For example, you can list the files in the current directory by ls .
+Important, however: do not run interactive commands. You do not have access to stdin.
+Also, you need to handle commands that may run indefinitely and not return a result. For such cases, you should redirect the output to a file and run the command in the background to avoid blocking the execution.
+For example, to run a Python script that might run indefinitely without returning immediately, you can use the following format: python3 app.py > server.log 2>&1 &
+Also, if a command execution result saying like: Command: "npm start" timed out. Sending SIGINT to the process, you should also retry with running the command in the background.
+The assistant can browse the Internet with and .
+For example, Tell me the usa's president using google search .
+Or Tell me what is in http://example.com .
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them.
+To interact with GitHub, use the $GITHUB_TOKEN environment variable.
+For example, to push a branch `my_branch` to the GitHub repo `owner/repo`:
+ git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch
+If $GITHUB_TOKEN is not set, ask the user to set it.
+
+
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment:
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+ Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
+ It only shows the first 100 lines by default! Max `context_lines` supported is 2000, use `scroll up/down`
+ to view the file if you want to see more.
+ Args:
+ path: str: The path to the file to open, preferred absolute path.
+ line_number: int | None = 1: The line number to move to. Defaults to 1.
+ context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+ Moves the window to show the specified line number.
+ Args:
+ line_number: int: The line number to move to.
+
+scroll_down() -> None:
+ Moves the window down by 100 lines.
+ Args:
+ None
+
+scroll_up() -> None:
+ Moves the window up by 100 lines.
+ Args:
+ None
+
+create_file(filename: str) -> None:
+ Creates and opens a new file with the given name.
+ Args:
+ filename: str: The name of the file to create.
+
+edit_file_by_replace(file_name: str, to_replace: str, new_content: str) -> None:
+ Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
+ Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
+ Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty.
+ For example, given a file "/workspace/example.txt" with the following content:
+ ```
+ line 1
+ line 2
+ line 2
+ line 3
+ ```
+ EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='new line
+ line 3',
+ )
+ This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged.
+ The resulting file will be:
+ ```
+ line 1
+ line 2
+ new line
+ line 3
+ ```
+ REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string:
+ edit_file_by_replace(
+ '/workspace/example.txt',
+ to_replace='line 2
+ line 3',
+ new_content='',
+ )
+ Args:
+ file_name: str: The name of the file to edit.
+ to_replace: str: The content to search for and replace.
+ new_content: str: The new content to replace the old content with.
+
+insert_content_at_line(file_name: str, line_number: int, content: str) -> None:
+ Insert content at the given line number in a file.
+ This will NOT modify the content of the lines before OR after the given line number.
+ For example, if the file has the following content:
+ ```
+ line 1
+ line 2
+ line 3
+ ```
+ and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to:
+ ```
+ line 1
+ new line
+ line 2
+ line 3
+ ```
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+append_file(file_name: str, content: str) -> None:
+ Append content to the given file.
+ It appends text `content` to the end of the specified file.
+ Args:
+ file_name: str: The name of the file to edit.
+ line_number: int: The line number (starting from 1) to insert the content after.
+ content: str: The content to insert.
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+ Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+ Args:
+ search_term: str: The term to search for.
+ dir_path: Optional[str]: The path to the directory to search.
+
+search_file(search_term: str, file_path: Optional[str] = None) -> None:
+ Searches for search_term in file. If file is not provided, searches in the current open file.
+ Args:
+ search_term: str: The term to search for.
+ file_path: Optional[str]: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+ Finds all files with the given name in the specified directory.
+ Args:
+ file_name: str: The name of the file to find.
+ dir_path: Optional[str]: The path to the directory to search.
+
+parse_pdf(file_path: str) -> None:
+ Parses the content of a PDF file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+ Parses the content of a DOCX file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+ Parses the content of a LaTex file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+ Parses the content of a pptx file and prints it.
+ Args:
+ file_path: str: The path to the file to open.
+
+Please note that THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRE PROPER INDENTATION. If the assistant would like to add the line ' print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include <finish></finish> in your response.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
+When handling files, try to use full paths and pwd to avoid errors.
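+For example (a sketch; the path is a hypothetical example):
+pwd  # run in a bash block; suppose it prints /workspace
+open_file('/workspace/app.py')  # then refer to the file by its full path rather than 'app.py'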
+
+
+----------
+
+Here is an example of how you can interact with the environment for task solving:
+
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+ numbers = list(range(1, 11))
+ return str(numbers)
+
+if __name__ == '__main__':
+ app.run(port=5000)"""
+
+insert_content_at_line(
+ 'app.py',
+ 1,
+ EDITED_CODE,
+)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+ Exit 1 python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 and then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+ File "/workspace/app.py", line 2, in <module>
+ from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+ Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+ Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+ Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+ Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+ Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/opendevin/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/opendevin/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+ Terminated python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6| numbers = list(range(1, 11))
+7| return str(numbers)
+8|
+9|if __name__ == '__main__':
+10| app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+<execute_ipython>
+edit_file_by_replace(
+ 'app.py',
+ to_replace=" return str(numbers)",
+ new_content=" return '