Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/hkbus/hk-bus-crawling
Browse files Browse the repository at this point in the history
  • Loading branch information
chunlaw committed Jan 20, 2025
2 parents ec01b88 + 2ee307d commit 23bc06d
Show file tree
Hide file tree
Showing 27 changed files with 1,427 additions and 1,107 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/format.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: Format

on: [push]

jobs:
  format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Format code
        id: autopep8
        uses: peter-evans/autopep8@v2
        with:
          # --exit-code makes autopep8 return 2 when it changed files,
          # which gates the commit step below.
          args: --exit-code --recursive --in-place --aggressive --aggressive --indent-size=2 .
      - name: Commit autopep8 changes
        if: steps.autopep8.outputs.exit-code == 2
        run: |
          # copied from https://github.com/creyD/prettier_action/blob/9561a3f1e164fa28b6f4da59c1807e1cd1af7cf5/entrypoint.sh#L131
          git config user.name "GitHub Action"
          git config user.email "[email protected]"
          git commit -am "Formatted Code!" --author="$GITHUB_ACTOR <[email protected]>" || echo "No files added to commit"
          git push
54 changes: 33 additions & 21 deletions crawling/cleansing.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,50 @@
import json


def isNameMatch(name_a, name_b):
  """Return True when either name contains the other, case-insensitively."""
  lower_a = name_a.lower()
  lower_b = name_b.lower()
  return lower_b in lower_a or lower_a in lower_b


def countBus(freq):
  """Estimate the number of departures implied by a frequency table.

  ``freq`` maps a service-day key to ``{startTime: None | (endTime, waitTime)}``
  where times are "HHMM" strings and waitTime is in seconds.  A ``None`` value
  counts as a single departure; otherwise the span between start and end is
  divided by the headway to estimate the departure count.

  Returns 0 for ``freq is None``; otherwise a numeric total (may be float).
  """
  # NOTE(review): the scraped diff showed old and new lines interleaved here;
  # this is the reconstructed post-commit logic with `sum` renamed so it no
  # longer shadows the builtin.
  if freq is None:
    return 0
  total = 0
  for entries in freq.values():
    for startTime, v in entries.items():
      if v is None:
        # No headway info: treat as one scheduled departure.
        total += 1
        continue
      endTime, waitTime = v
      # Span in minutes between start and end, divided by headway in minutes.
      span = int((int(endTime[0:2]) - int(startTime[0:2])) * 60 +
                 int(endTime[2:4]) - int(startTime[2:4]))
      total += span / (int(waitTime) / 60)
  return total


def cleansing(co):
with open('routeFareList.%s.json' % co, 'r', encoding='UTF-8') as f:
routeList = json.load(f)

for i in range(len(routeList)):
route = routeList[i]
route["co"] = [co for co in route["co"] if co != "ferry"]
if 'skip' in route or 'freq' in route:
continue
bestIdx, maxBus = -1, 0
for j in range(len(routeList)):
if i == j: continue
if i == j:
continue
_route = routeList[j]
if route['route'] == _route['route'] and sorted(route['co']) == sorted(_route['co']) and \
isNameMatch(route['orig_en'], _route['orig_en']) and isNameMatch(route['dest_en'], _route['dest_en']):
if 'freq' not in _route: continue
if route['route'] == _route['route'] and sorted(
route['co']) == sorted(
_route['co']) and isNameMatch(
route['orig_en'],
_route['orig_en']) and isNameMatch(
route['dest_en'],
_route['dest_en']):
if 'freq' not in _route:
continue
bus = countBus(_route['freq'])
if bus > maxBus:
bestIdx = j
Expand All @@ -42,19 +54,19 @@ def cleansing(co):
routeList[i]['skip'] = True

_routeList = [route for route in routeList if 'skip' not in route]
print (co, len(routeList), len(_routeList))
print(co, len(routeList), len(_routeList))

with open('routeFareList.%s.cleansed.json' % co, 'w', encoding='UTF-8') as f:
f.write(json.dumps(_routeList, ensure_ascii=False))

# Run the cleansing pass once per supported transport operator.
# (Reconstructed post-commit call list; the scraped diff showed the old
# `cleansing ('kmb')` spelling and the new `cleansing('kmb')` spelling
# concatenated.)
cleansing('kmb')
cleansing('ctb')
cleansing('nlb')
cleansing('lrtfeeder')
cleansing('gmb')
cleansing('lightRail')
cleansing('mtr')
cleansing('sunferry')
cleansing('fortuneferry')
cleansing('hkkf')
24 changes: 14 additions & 10 deletions crawling/crawl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,47 @@
import logging
import os

# Module-level logger (the scraped diff duplicated this assignment).
logger = logging.getLogger(__name__)

async def emitRequest(url: str, client: httpx.AsyncClient, headers={}):
  """GET ``url`` with ``client``, retrying transient failures.

  Retries on HTTP 429/502/504 and on httpx pool/read timeouts and read
  errors, sleeping with exponential backoff capped at 60 seconds.  Returns
  the successful :class:`httpx.Response`; raises for any other non-200
  status (``raise_for_status`` first, then a generic ``Exception`` with the
  status code and URL as a fallback).

  ``headers`` is only read, never mutated, so the mutable default is benign.
  """
  RETRY_TIMEOUT_MAX = 60  # backoff ceiling, seconds
  retry_timeout = 1
  # retry if "Too many request (429)"
  while True:
    try:
      r = await client.get(url, headers=headers)
      if r.status_code == 200:
        return r
      elif r.status_code in (429, 502, 504):
        logger.warning(
            f"status_code={r.status_code}, wait {retry_timeout} and retry. URL={url}")
        await asyncio.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX)
      else:
        r.raise_for_status()
        raise Exception(r.status_code, url)
    except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ReadError) as e:
      logger.warning(
          f"Exception {repr(e)} occurred, wait {retry_timeout} and retry. URL={url}")
      await asyncio.sleep(retry_timeout)
      retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX)


def get_request_limit():
  """Return the request-concurrency limit from REQUEST_LIMIT (default 10)."""
  return int(os.environ.get('REQUEST_LIMIT', '10'))


def store_version(key: str, version: str):
  """Record ``version`` under ``key`` in 0versions.json, keys kept sorted.

  Best-effort read: a missing or unparsable file starts a fresh dict rather
  than failing the crawl.
  """
  logger.info(f"{key} version: {version}")
  # "0" is prepended in filename so that this file appears first in Github
  # directory listing
  try:
    with open('0versions.json', 'r') as f:
      version_dict = json.load(f)
  except (OSError, json.JSONDecodeError):
    # Narrowed from `except BaseException`, which would also have swallowed
    # KeyboardInterrupt/SystemExit; only file/parse problems should reset.
    version_dict = {}
  version_dict[key] = version
  version_dict = dict(sorted(version_dict.items()))
  with open('0versions.json', 'w', encoding='UTF-8') as f:
    json.dump(version_dict, f, indent=4)
182 changes: 92 additions & 90 deletions crawling/ctb.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,94 +9,96 @@

logger = logging.getLogger(__name__)


async def getRouteStop(co):
  """Crawl Citybus route and stop data for operator ``co``.

  Writes ``routeList.<co>.json`` and ``stopList.<co>.json`` in the current
  directory.  Skips all work if the route-list file already exists.

  NOTE(review): the scraped diff showed the pre- and post-commit bodies of
  this function concatenated; this is the reconstructed post-commit version.
  """
  a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
  # define output name
  ROUTE_LIST = 'routeList.' + co + '.json'
  STOP_LIST = 'stopList.' + co + '.json'

  # load route list and stop list if exist
  routeList = {}
  if path.isfile(ROUTE_LIST):
    # Route list already crawled: nothing to do.
    return
  else:
    # load routes
    r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/' + co, a_client)
    routeList = r.json()['data']

  _stops = []
  stopList = {}
  if path.isfile(STOP_LIST):
    with open(STOP_LIST, 'r', encoding='UTF-8') as f:
      stopList = json.load(f)

  # function to load single stop info; semaphore caps concurrent requests
  req_stop_list_limit = asyncio.Semaphore(get_request_limit())

  async def getStop(stopId):
    async with req_stop_list_limit:
      r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/' + stopId, a_client)
      return r.json()['data']

  # function to async load multiple stops info
  async def getStopList(stops):
    ret = await asyncio.gather(*[getStop(stop) for stop in stops])
    return ret

  req_route_stop_limit = asyncio.Semaphore(get_request_limit())

  # Fetch inbound/outbound stop IDs for one route (skips routes already
  # carrying a bound or stops, e.g. reloaded from disk).
  async def getRouteStop(param):
    co, route = param
    if route.get('bound', 0) != 0 or route.get('stops', {}):
      return route
    route['stops'] = {}
    for direction in ['inbound', 'outbound']:
      r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/' + co.upper() + '/' + route['route'] + "/" + direction, a_client)
      route['stops'][direction] = [stop['stop'] for stop in r.json()['data']]
    return route

  async def getRouteStopList():
    ret = await asyncio.gather(*[getRouteStop((co, route))
                                 for route in routeList])
    return ret

  routeList = await getRouteStopList()
  for route in routeList:
    for direction, stops in route['stops'].items():
      for stopId in stops:
        _stops.append(stopId)

  # load stops for this route async, deduplicated and in stable order
  _stops = sorted(set(_stops))

  stopInfos = list(zip(_stops, await getStopList(_stops)))
  for stopId, stopInfo in stopInfos:
    stopList[stopId] = stopInfo

  # Flatten each crawled route into one record per non-empty direction.
  _routeList = []
  for route in routeList:
    if route.get('bound', 0) != 0:
      _routeList.append(route)
      continue
    for bound in ['inbound', 'outbound']:
      if len(route['stops'][bound]) > 0:
        _routeList.append({
            'co': co,
            'route': route['route'],
            'bound': 'O' if bound == 'outbound' else 'I',
            # Inbound runs use the outbound terminus names swapped.
            'orig_en': route['orig_en'] if bound == 'outbound' else route['dest_en'],
            'orig_tc': route['orig_tc'] if bound == 'outbound' else route['dest_tc'],
            'dest_en': route['dest_en'] if bound == 'outbound' else route['orig_en'],
            'dest_tc': route['dest_tc'] if bound == 'outbound' else route['orig_tc'],
            'stops': list(filter(lambda stopId: bool(stopList[stopId]), route['stops'][bound])),
            'serviceType': 0
        })

  with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
    f.write(json.dumps(_routeList, ensure_ascii=False))
  with open(STOP_LIST, 'w', encoding='UTF-8') as f:
    f.write(json.dumps(stopList, ensure_ascii=False))

if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  logging.getLogger('httpx').setLevel(logging.WARNING)
  asyncio.run(getRouteStop('ctb'))
Loading

0 comments on commit 23bc06d

Please sign in to comment.