Skip to content

Commit

Permalink
truncate to 128k + small fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
nkkko committed Aug 29, 2024
1 parent 7ab318d commit cb6a0f5
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 21 deletions.
12 changes: 7 additions & 5 deletions content.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,15 @@ def cta_section():
Div(
H2("Ready to Simplify Your Development Environment?", cls="text-2xl font-bold text-center mb-4"),
P("Experience the power of Daytona's flexible and secure development environment platform.", cls="text-lg text-center mb-6"),
Button(
A(
Div(
Img(src="assets/icons/github-mark-white.svg", cls="svg-icon"),
cls="icon-container"
),
Span("Get Daytona Now", cls="button-text"),
cls="button",
role="button",
href="https://github.com/daytonaio/daytona",
target="_blank",
rel="noopener noreferrer"
title="Get Daytona",
),
cls="container mx-auto px-4 py-16 text-center"
),
Expand All @@ -169,7 +168,10 @@ def cta_section():
def footer_section():
return Footer(
Div(
P("© 2024 Daytona Platforms Inc. All rights reserved.", cls="text-center text-gray-600"),
P("© 2024 ",
A("Daytona Platforms Inc.", href="https://daytona.io", **_blank,
cls="border-b-2 border-b-black/30 hover:border-b-black/80"),
"All rights reserved.", cls="text-center text-gray-600"),
cls="container mx-auto px-4 py-8"
),
cls="bg-gray-100",
Expand Down
71 changes: 67 additions & 4 deletions helpers/devcontainer_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,82 @@
import logging
import os
import jsonschema
import tiktoken
from helpers.jinja_helper import process_template
from schemas import DevContainerModel

import logging
import tiktoken

def truncate_context(context, max_tokens=120000, *, encoding=None):
    """Trim ``context`` so that it fits in roughly ``max_tokens`` tokens.

    Everything up to and including the
    ``<<END_SECTION: Repository Languages >>`` marker is treated as
    high-priority and kept first; the token budget left over is then filled
    from the text that follows the marker.  When the marker is absent the
    context is simply cut from the front.

    Args:
        context: The repository context string to shrink.
        max_tokens: Upper bound on the number of tokens to keep.
        encoding: Optional tokenizer exposing ``encode(str)`` and
            ``decode(tokens)``.  Defaults to tiktoken's encoding for
            ``"gpt-4o-mini"``; pass another one when a different model
            is in use.

    Returns:
        ``context`` unchanged when it already fits, otherwise the truncated
        string.
    """
    languages_marker = "<<END_SECTION: Repository Languages >>"
    structure_marker = "<<END_SECTION: Repository Structure >>"
    separator = "\n\n"

    logging.info(f"Starting truncate_context with max_tokens={max_tokens}")
    logging.debug(f"Initial context length: {len(context)} characters")

    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-4o-mini")
    tokens = encoding.encode(context)

    logging.info(f"Initial token count: {len(tokens)}")

    if len(tokens) <= max_tokens:
        logging.info("Context is already within token limit. No truncation needed.")
        return context

    logging.info(f"Context size is {len(tokens)} tokens. Truncation needed.")

    # Prioritize keeping the repository structure and languages
    structure_end = context.find(structure_marker)
    languages_end = context.find(languages_marker)

    logging.debug(f"Structure end position: {structure_end}")
    logging.debug(f"Languages end position: {languages_end}")

    if languages_end == -1:
        # str.find() returns -1 when the marker is missing.  Slicing with
        # that value would drop the last character, invent a marker and
        # duplicate content, so fall back to truncating from the start.
        logging.warning("Repository Languages marker not found. Truncating from the start of the context.")
        important_content = ""
        remaining_content = context
    else:
        marker_end = languages_end + len(languages_marker)
        important_content = context[:marker_end] + separator
        remaining_content = context[marker_end:]
        # Only skip the separator if it is really there, so no real
        # content is lost when the marker is followed by something else.
        if remaining_content.startswith(separator):
            remaining_content = remaining_content[len(separator):]

    important_tokens = encoding.encode(important_content)
    logging.debug(f"Important content token count: {len(important_tokens)}")

    if len(important_tokens) > max_tokens:
        logging.warning("Important content alone exceeds max_tokens. Truncating important content.")
        return encoding.decode(important_tokens[:max_tokens])

    remaining_tokens = max_tokens - len(important_tokens)
    logging.info(f"Tokens available for remaining content: {remaining_tokens}")

    truncated_remaining = encoding.decode(encoding.encode(remaining_content)[:remaining_tokens])

    # NOTE(review): both halves are tokenized separately, so re-encoding the
    # joined text may differ slightly at the seam; the count logged below is
    # the one measured on the final string.
    final_context = important_content + truncated_remaining
    final_tokens = encoding.encode(final_context)

    logging.info(f"Final token count: {len(final_tokens)}")
    logging.debug(f"Final context length: {len(final_context)} characters")

    return final_context

def generate_devcontainer_json(instructor_client, repo_url, repo_context, devcontainer_url=None, max_retries=2, regenerate=False):
existing_devcontainer = None
if "<<EXISTING_DEVCONTAINER>>" in repo_context:
logging.info("Existing devcontainer.json found in the repository.")
existing_devcontainer = (
repo_context.split("<<EXISTING_DEVCONTAINER>>")[1]
.split("<<END_EXISTING_DEVCONTAINER>>")[0]
.strip()
)
return existing_devcontainer
if not regenerate and devcontainer_url:
logging.info(f"Using existing devcontainer.json from URL: {devcontainer_url}")
return existing_devcontainer, devcontainer_url

logging.info("Generating devcontainer.json...")

template_data = {"repo_url": repo_url, "repo_context": repo_context}
# Truncate the context to fit within token limits
truncated_context = truncate_context(repo_context, max_tokens=126000)

template_data = {
"repo_url": repo_url,
"repo_context": truncated_context,
"existing_devcontainer": existing_devcontainer
}

prompt = process_template("prompts/devcontainer.jinja", template_data)

Expand All @@ -38,7 +97,11 @@ def generate_devcontainer_json(instructor_client, repo_url, repo_context, max_re
devcontainer_json = json.dumps(response.dict(exclude_none=True), indent=2)

if validate_devcontainer_json(devcontainer_json):
return devcontainer_json
logging.info("Successfully generated and validated devcontainer.json")
if existing_devcontainer and not regenerate:
return existing_devcontainer, devcontainer_url
else:
return devcontainer_json, None # Return None as URL for generated content
else:
logging.warning(f"Generated JSON failed validation on attempt {attempt + 1}")
if attempt == max_retries:
Expand Down
5 changes: 4 additions & 1 deletion helpers/github_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ def fetch_repo_context(repo_url, max_depth=1):
}

existing_devcontainer = None
devcontainer_url = None

root_devcontainer_url = f"{contents_api_url}/.devcontainer.json"
response = requests.get(root_devcontainer_url, headers=headers)
if response.status_code == 200:
existing_devcontainer = requests.get(response.json()["download_url"]).text
devcontainer_url = response.json()["download_url"]

if not existing_devcontainer:
devcontainer_dir_url = f"{contents_api_url}/.devcontainer"
Expand All @@ -44,6 +46,7 @@ def fetch_repo_context(repo_url, max_depth=1):
for item in response.json():
if item["name"] == "devcontainer.json":
existing_devcontainer = requests.get(item["download_url"]).text
devcontainer_url = item["download_url"]
break

context = []
Expand Down Expand Up @@ -138,7 +141,7 @@ def traverse_dir(api_url, depth=0, prefix=""):
context.append(devcontainer_context)
total_tokens += count_tokens(devcontainer_context)

return "\n\n".join(context), existing_devcontainer
return "\n\n".join(context), existing_devcontainer, devcontainer_url

def check_url_exists(url, Session):
session = Session()
Expand Down
17 changes: 8 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,20 @@ async def post(session, repo_url: str, regenerate: bool = False):
exists, existing_record = check_url_exists(repo_url, Session)
logging.info(f"URL check result: exists={exists}, existing_record={existing_record}")

repo_context, existing_devcontainer = fetch_repo_context(repo_url)
repo_context, existing_devcontainer, devcontainer_url = fetch_repo_context(repo_url)
logging.info(f"Fetched repo context. Existing devcontainer: {'Yes' if existing_devcontainer else 'No'}")
logging.info(f"Devcontainer URL: {devcontainer_url}")

if exists and not regenerate:
logging.info(f"URL already exists in database. Returning existing devcontainer_json for: {repo_url}")
devcontainer_json = existing_record.devcontainer_json
generated = existing_record.generated
source = "database"
elif existing_devcontainer:
logging.info("Existing devcontainer.json found in the repository.")
devcontainer_json = existing_devcontainer
generated = False
source = "repository"
url = existing_record.devcontainer_url
else:
devcontainer_json = generate_devcontainer_json(instructor_client, repo_url, repo_context)
devcontainer_json, url = generate_devcontainer_json(instructor_client, repo_url, repo_context, devcontainer_url, regenerate=regenerate)
generated = True
source = "generated"
source = "generated" if url is None else "repository"

if not exists or regenerate:
logging.info("Saving to database...")
Expand All @@ -146,6 +143,7 @@ async def post(session, repo_url: str, regenerate: bool = False):
new_devcontainer = DevContainer(
url=repo_url,
devcontainer_json=devcontainer_json,
devcontainer_url=devcontainer_url, # Save the URL here
repo_context=repo_context,
tokens=count_tokens(repo_context),
model=os.getenv("MODEL"),
Expand All @@ -155,7 +153,7 @@ async def post(session, repo_url: str, regenerate: bool = False):
session.add(new_devcontainer)
session.commit()
session.close()
logging.info("Successfully saved to database")
logging.info(f"Successfully saved to database with devcontainer_url: {devcontainer_url}")
except Exception as e:
logging.error(f"Error while saving to database: {str(e)}")
raise
Expand Down Expand Up @@ -188,6 +186,7 @@ async def post(session, repo_url: str, regenerate: bool = False):
logging.error(f"An error occurred: {str(e)}", exc_info=True)
return Div(H2("Error"), P(f"An error occurred: {str(e)}"))


# Serve static files
@rt("/{fname:path}.{ext:static}")
async def get(fname:str, ext:str):
Expand Down
1 change: 1 addition & 0 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class DevContainer(SQLAlchemyBase):
id = Column(Integer, primary_key=True)
url = Column(String, index=True)
devcontainer_json = Column(Text)
devcontainer_url = Column(String) # New column for devcontainer.json URL
repo_context = Column(Text)
tokens = Column(Integer)
model = Column(Text)
Expand Down
10 changes: 9 additions & 1 deletion prompts/devcontainer.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@ Given the following context from a GitHub repository:

{{ repo_context }}

{% if existing_devcontainer %}
An existing devcontainer.json file was found in the repository:

{{ existing_devcontainer }}

Please use this as a reference and improve upon it, incorporating any new requirements or best practices.
{% endif %}

Begin by applying Chain of Thought (CoT) reasoning to decompose the context and task into logical, manageable components. Think slowly and pay attention to all important facts in the context such as the ports used by the application and the ports used for testing.

Generate a devcontainer.json file for this project. The file should include appropriate settings for the development environment based on the project's requirements and structure. The 'features' field is essential and should include a dictionary of features to enable within the container.

Always add comments to explain what each line or block of code does. This will help you and others who come after you understand what each line of code is doing, why it's there and how it works.
Always add comments (like in the provided example) to explain what each line or block of code does. This will help you and others who come after you understand what each line of code is doing, why it's there and how it works.

Here's an example of a devcontainer.json:
```json
Expand Down
12 changes: 11 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ aiohttp==3.10.2
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
appnope==0.1.4
asttokens==2.4.1
async-timeout==4.0.3
attrs==24.2.0
beautifulsoup4==4.12.3
black==24.8.0
certifi==2024.7.4
charset-normalizer==2.1.1
click==8.1.7
Expand All @@ -32,6 +34,7 @@ idna==3.7
instructor==1.3.7
ipykernel==6.29.5
ipython==8.26.0
isort==5.13.2
itsdangerous==2.2.0
jedi==0.19.1
Jinja2==3.1.4
Expand All @@ -40,19 +43,24 @@ jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter_client==8.6.2
jupyter_core==5.7.2
kaleido==0.2.1
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
numpy==2.0.1
oauthlib==3.2.2
openai==1.40.2
packaging==24.1
pandas==2.2.2
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pipdeptree==2.23.1
platformdirs==4.2.2
plotly==5.23.0
prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
Expand All @@ -65,6 +73,7 @@ python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-fasthtml==0.2.4
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.2
pyzmq==26.1.0
referencing==0.35.1
Expand Down Expand Up @@ -92,6 +101,7 @@ tqdm==4.66.5
traitlets==5.14.3
typer==0.12.3
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.5
uvloop==0.19.0
Expand Down

0 comments on commit cb6a0f5

Please sign in to comment.