Skip to content

Commit

Permalink
enable translator and fix getting null articles
Browse files: browse the repository at this point in the history
  • Loading branch information
wenhwang97 committed Feb 12, 2025
1 parent 7f45fd2 commit 0c65592
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 52 deletions.
74 changes: 37 additions & 37 deletions .github/workflows/server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,47 +51,47 @@ jobs:
method: POST
url: ${{ secrets.PORTAINER_WEBHOOK_STAGING }}
preventFailureOnResponse: true
# translator:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v4
# with:
# python-version: 3.11
# - uses: docker/setup-qemu-action@v3
# - uses: docker/setup-buildx-action@v3
translator:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: docker/setup-qemu-action@v3
- uses: docker/setup-buildx-action@v3

# - name: Install Dependencies
# run: |
# python -m pip install --upgrade pip
# pip install -r translator/requirements.txt
- name: Install Dependencies
run: |
python -m pip install --upgrade pip
pip install -r translator/requirements.txt
# - name: Login to Docker Hub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

# - name: Build & Push Docker Staging Build
# uses: docker/build-push-action@v5
# if: github.ref == 'refs/heads/main'
# with:
# context: ./translator
# push: true
# tags: hicsail/gdp-flask-translator:staging
# build-args: |
# NOCO_DB_URL=${{ secrets.NOCO_DB_URL }}
# NOCO_XC_TOKEN=${{ secrets.NOCO_XC_TOKEN }}
# DEEPL_API_KEY=${{ secrets.DEEPL_API_KEY }}
# GOOGLE_API_KEY=${{ secrets.GOOGLE_API_KEY }}
- name: Build & Push Docker Staging Build
uses: docker/build-push-action@v5
if: github.ref == 'refs/heads/main'
with:
context: ./translator
push: true
tags: hicsail/gdp-flask-translator:staging
build-args: |
NOCO_DB_URL=${{ secrets.NOCO_DB_URL }}
NOCO_XC_TOKEN=${{ secrets.NOCO_XC_TOKEN }}
DEEPL_API_KEY=${{ secrets.DEEPL_API_KEY }}
GOOGLE_API_KEY=${{ secrets.GOOGLE_API_KEY }}
# - name: Push to Staging
# uses: fjogeleit/http-request-action@v1
# if: github.ref == 'refs/heads/main'
# with:
# method: POST
# url: ${{ secrets.PORTAINER_WEBHOOK_STAGING }}
# preventFailureOnResponse: true
- name: Push to Staging
uses: fjogeleit/http-request-action@v1
if: github.ref == 'refs/heads/main'
with:
method: POST
url: ${{ secrets.PORTAINER_WEBHOOK_STAGING }}
preventFailureOnResponse: true
# classifier:
# runs-on: ubuntu-latest
# steps:
Expand Down
18 changes: 9 additions & 9 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ services:
- 5001:80
env_file:
- .env
# translator:
# container_name: gdp-flask-translator
# build:
# context: ./translator
# restart: always
# ports:
# - 5002:80
# env_file:
# - .env
translator:
container_name: gdp-flask-translator
build:
context: ./translator
restart: always
ports:
- 5002:80
env_file:
- .env
# classifier:
# container_name: gdp-flask-classifier
# build:
Expand Down
13 changes: 10 additions & 3 deletions scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def scrape_country(country, latest_date, keywords):
article = BeautifulSoup(article_page.content, "html.parser")

# article does not exist
if article.find(id="zoom") is None or article.find(id="artitle") is None:
if article.find(id="zoom") is None and article.find("div", class_="art-con") is None:
print(f"[MOF Scraper] Article does not exist for {link}")
if "政策" in i.find("em", class_="tag").text:
continue
Expand Down Expand Up @@ -175,11 +175,18 @@ def scrape_country(country, latest_date, keywords):
if contype == "政策":
continue

title = article.find(id="artitle").text.strip()
for script in article.find(id="zoom").find_all("script"):
script.decompose()

content = article.find(id="zoom").text.strip()
if article.find(id="artitle") is not None:
title = article.find(id="artitle").text.strip()
else:
title = article.find("div", class_="art-title").text.strip()

if article.find(id="zoom") is not None:
content = article.find(id="zoom").text.strip()
else:
content = article.find("div", class_="art-con").text.strip()

# ignore duplicate articles
if title in result_set:
Expand Down
7 changes: 4 additions & 3 deletions translator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def translate():
url = os.getenv("NOCO_DB_URL")
headers = {"xc-token": os.getenv("NOCO_XC_TOKEN")}
params = {
"where": "(isEnglish,eq,false)",
"where": "(isEnglish,eq,false)~and(originalTitle,isnot,null)",
"limit": 10, # translate 10 records at a time
}
res = requests.get(url, headers=headers, params=params)
Expand Down Expand Up @@ -58,7 +58,8 @@ def translate():

if __name__ == "__main__":
load_dotenv()
scheduler.add_job(translate, "cron", hour="*", minute="*/5")
scheduler.start()
# scheduler.add_job(translate, "cron", hour="*", minute="*/5")
# scheduler.start()
translate()
print("[MOF Translator] Start translating")
app.run(port=5002)

0 comments on commit 0c65592

Please sign in to comment.