Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GitHub Actions workflow to pull data to another repository #1080

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/workflows/pull_data.yml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @reaganfue, can you describe your motivations for these changes?

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Scheduled workflow that copies indexed files from khoj-ai/khoj into a
# target repository by calling GithubToEntries.pull_data_to_repo.
name: Pull Data

on:
  schedule:
    - cron: '0 0 * * *' # Runs daily at midnight (cron schedules are evaluated in UTC)
  workflow_dispatch:

jobs:
  pull_data:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout khoj-ai/khoj repository
        uses: actions/checkout@v4
        with:
          repository: khoj-ai/khoj
          path: khoj

      - name: Checkout target repository
        uses: actions/checkout@v4
        with:
          repository: target-owner/target-repo
          path: target
          # The default workflow token is scoped to the repository running the
          # workflow; pushing to another repository needs its own token.
          token: ${{ secrets.TARGET_REPO_PAT }}

      - name: Pull data from khoj-ai/khoj to target repository
        # The script imports `src.khoj...`, which lives inside the `khoj` checkout.
        working-directory: khoj
        env:
          # The script reads both variables with os.getenv; without this
          # mapping they would be unset and the API calls unauthenticated.
          GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
          TARGET_REPO_PAT: ${{ secrets.TARGET_REPO_PAT }}
        # NOTE(review): this step presumably also needs the khoj package and its
        # dependencies installed before the import can succeed - confirm.
        # NOTE(review): pull_data_to_repo reads `self.config.repos` and
        # `repo.owner` via attribute access, while a plain dict is passed
        # here - confirm GithubToEntries accepts this shape.
        # NOTE(review): the pull request targets base branch `master`; confirm
        # that 'main' is the intended source branch.
        run: |
          python3 -c "
          import os
          from src.khoj.processor.content.github.github_to_entries import GithubToEntries
          config = {
              'pat_token': os.getenv('GITHUB_PAT'),
              'repos': [
                  {'owner': 'khoj-ai', 'name': 'khoj', 'branch': 'main'}
              ]
          }
          github_to_entries = GithubToEntries(config)
          github_to_entries.pull_data_to_repo(
              target_repo_url='https://api.github.com/repos/target-owner/target-repo',
              target_repo_branch='main',
              target_repo_token=os.getenv('TARGET_REPO_PAT')
          )
          "

      - name: Commit and push changes to target repository
        run: |
          cd target
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add .
          # The previous step writes through the REST API rather than into this
          # checkout, so there may be nothing staged; `git commit` exits
          # non-zero in that case and would fail the job.
          if ! git diff --cached --quiet; then
            git commit -m 'Pull data from khoj-ai/khoj'
            git push
          fi
45 changes: 45 additions & 0 deletions src/khoj/processor/content/github/github_to_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,48 @@ def extract_plaintext_entries(plaintext_files):
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)

def pull_data_to_repo(self, target_repo_url: str, target_repo_branch: str, target_repo_token: str):
    """Upload the files of every configured repository to a target repository.

    Each file returned by ``self.get_files`` is written to the target branch
    through the GitHub "create or update file contents" endpoint. That
    endpoint creates one commit per file directly on the branch, so no
    separate commit or ref update is performed afterwards.

    Args:
        target_repo_url: REST API base URL of the target repository, e.g.
            ``https://api.github.com/repos/<owner>/<repo>``.
        target_repo_branch: Branch of the target repository to write to.
        target_repo_token: Token sent in the ``Authorization`` header of
            every request to the target repository.

    Raises:
        Exception: If the target branch tree cannot be read, or if any file
            upload is rejected by the API.
    """
    # Imported locally so the method does not depend on module-level imports
    # that may not exist in the file yet.
    import base64
    from urllib.parse import quote

    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection
    headers = {"Authorization": f"token {target_repo_token}"}

    # Read the target branch tree: this verifies access and tells us which
    # files already exist, since updating an existing file requires its blob SHA.
    response = requests.get(
        f"{target_repo_url}/git/trees/{target_repo_branch}",
        headers=headers,
        params={"recursive": "1"},
        timeout=request_timeout,
    )
    if response.status_code != 200:
        raise Exception(f"Failed to clone target repository: {response.text}")
    # NOTE(review): very large trees may come back truncated; a file missing
    # from this map is then sent without a SHA and the API rejects the update.
    existing_blob_shas = {
        entry["path"]: entry["sha"]
        for entry in response.json().get("tree", [])
        if entry.get("type") == "blob"
    }

    # Copy necessary data from this repository to the target repository
    for repo in self.config.repos:
        repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
        blob_url_prefix = f"https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/"
        markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
        for doc in markdown_files + org_files + plaintext_files:
            file_path = doc["path"].replace(blob_url_prefix, "")
            payload = {
                "message": f"Add {file_path}",
                # The API expects base64 text; bytes.decode("base64") is not
                # a valid codec on Python 3 and raises LookupError.
                "content": base64.b64encode(doc["content"].encode("utf-8")).decode("ascii"),
                "branch": target_repo_branch,
            }
            existing_sha = existing_blob_shas.get(file_path)
            if existing_sha is not None:
                payload["sha"] = existing_sha

            response = requests.put(
                # Quote the path so spaces, '#', '?' etc. cannot corrupt the URL.
                f"{target_repo_url}/contents/{quote(file_path)}",
                headers=headers,
                json=payload,
                timeout=request_timeout,
            )
            # 201 = file created, 200 = existing file updated.
            if response.status_code not in (200, 201):
                raise Exception(f"Failed to add file {file_path} to target repository: {response.text}")

            # Remember the new blob SHA in case a later repository writes the same path.
            new_sha = (response.json().get("content") or {}).get("sha")
            if new_sha is not None:
                existing_blob_shas[file_path] = new_sha