diff --git a/crawling/cleansing.py b/crawling/cleansing.py
index 4d2aad40..099b7e2a 100644
--- a/crawling/cleansing.py
+++ b/crawling/cleansing.py
@@ -1,26 +1,31 @@
import json
+
def isNameMatch(name_a, name_b):
tmp_a = name_a.lower()
tmp_b = name_b.lower()
return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
+
def countBus(freq):
- if freq is None: return 0
+ if freq is None:
+ return 0
sum = 0
for entries in freq.values():
for startTime, v in entries.items():
- if v is None:
+ if v is None:
sum += 1
continue
endTime, waitTime = v
- sum += int ( ( int(endTime[0:2]) - int(startTime[0:2]) ) * 60 + int(endTime[2:4]) - int(startTime[2:4]) ) / ( int(waitTime) / 60 )
+        sum += int((int(endTime[0:2]) - int(startTime[0:2])) * 60 +
+ int(endTime[2:4]) - int(startTime[2:4])) / (int(waitTime) / 60)
return sum
+
def cleansing(co):
with open('routeFareList.%s.json' % co, 'r', encoding='UTF-8') as f:
routeList = json.load(f)
-
+
for i in range(len(routeList)):
route = routeList[i]
route["co"] = [co for co in route["co"] if co != "ferry"]
@@ -28,11 +33,18 @@ def cleansing(co):
continue
bestIdx, maxBus = -1, 0
for j in range(len(routeList)):
- if i == j: continue
+ if i == j:
+ continue
_route = routeList[j]
- if route['route'] == _route['route'] and sorted(route['co']) == sorted(_route['co']) and \
- isNameMatch(route['orig_en'], _route['orig_en']) and isNameMatch(route['dest_en'], _route['dest_en']):
- if 'freq' not in _route: continue
+ if route['route'] == _route['route'] and sorted(
+ route['co']) == sorted(
+ _route['co']) and isNameMatch(
+ route['orig_en'],
+ _route['orig_en']) and isNameMatch(
+ route['dest_en'],
+ _route['dest_en']):
+ if 'freq' not in _route:
+ continue
bus = countBus(_route['freq'])
if bus > maxBus:
bestIdx = j
@@ -42,19 +54,19 @@ def cleansing(co):
routeList[i]['skip'] = True
_routeList = [route for route in routeList if 'skip' not in route]
- print (co, len(routeList), len(_routeList))
-
+ print(co, len(routeList), len(_routeList))
+
with open('routeFareList.%s.cleansed.json' % co, 'w', encoding='UTF-8') as f:
f.write(json.dumps(_routeList, ensure_ascii=False))
-
-
-cleansing ('kmb')
-cleansing ('ctb')
-cleansing ('nlb')
-cleansing ('lrtfeeder')
-cleansing ('gmb')
-cleansing ('lightRail')
-cleansing ('mtr')
-cleansing ('sunferry')
+
+
+cleansing('kmb')
+cleansing('ctb')
+cleansing('nlb')
+cleansing('lrtfeeder')
+cleansing('gmb')
+cleansing('lightRail')
+cleansing('mtr')
+cleansing('sunferry')
cleansing('fortuneferry')
-cleansing ('hkkf')
\ No newline at end of file
+cleansing('hkkf')
diff --git a/crawling/crawl_utils.py b/crawling/crawl_utils.py
index caca1e13..cfff0ff8 100644
--- a/crawling/crawl_utils.py
+++ b/crawling/crawl_utils.py
@@ -4,11 +4,12 @@
import logging
import os
-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
-async def emitRequest(url:str,client: httpx.AsyncClient, headers={}):
- RETRY_TIMEOUT_MAX=60
- retry_timeout=1
+
+async def emitRequest(url: str, client: httpx.AsyncClient, headers={}):
+ RETRY_TIMEOUT_MAX = 60
+ retry_timeout = 1
# retry if "Too many request (429)"
while True:
try:
@@ -16,31 +17,34 @@ async def emitRequest(url:str,client: httpx.AsyncClient, headers={}):
if r.status_code == 200:
return r
elif r.status_code in (429, 502, 504):
- logger.warning(f"status_code={r.status_code}, wait {retry_timeout} and retry. URL={url}")
+ logger.warning(
+ f"status_code={r.status_code}, wait {retry_timeout} and retry. URL={url}")
await asyncio.sleep(retry_timeout)
- retry_timeout = min (retry_timeout * 2, RETRY_TIMEOUT_MAX)
+ retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX)
else:
r.raise_for_status()
raise Exception(r.status_code, url)
except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ReadError) as e:
- logger.warning(f"Exception {repr(e)} occurred, wait {retry_timeout} and retry. URL={url}")
+ logger.warning(
+ f"Exception {repr(e)} occurred, wait {retry_timeout} and retry. URL={url}")
await asyncio.sleep(retry_timeout)
- retry_timeout = min (retry_timeout * 2, RETRY_TIMEOUT_MAX)
+ retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX)
def get_request_limit():
default_limit = "10"
return int(os.environ.get('REQUEST_LIMIT', default_limit))
+
def store_version(key: str, version: str):
logger.info(f"{key} version: {version}")
# "0" is prepended in filename so that this file appears first in Github directory listing
try:
with open('0versions.json', 'r') as f:
version_dict = json.load(f)
- except:
+    except Exception:  # missing or unparsable 0versions.json -> start fresh
version_dict = {}
version_dict[key] = version
version_dict = dict(sorted(version_dict.items()))
with open('0versions.json', 'w', encoding='UTF-8') as f:
- json.dump(version_dict, f, indent=4)
\ No newline at end of file
+ json.dump(version_dict, f, indent=4)
diff --git a/crawling/ctb.py b/crawling/ctb.py
index ce194647..a76d4978 100644
--- a/crawling/ctb.py
+++ b/crawling/ctb.py
@@ -9,94 +9,96 @@
logger = logging.getLogger(__name__)
+
async def getRouteStop(co):
- a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
- # define output name
- ROUTE_LIST = 'routeList.'+co+'.json'
- STOP_LIST = 'stopList.'+co+'.json'
-
- # load route list and stop list if exist
- routeList = {}
- if path.isfile(ROUTE_LIST):
- return
- else:
- # load routes
- r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/'+co, a_client)
- routeList = r.json()['data']
-
- _stops = []
- stopList = {}
- if path.isfile(STOP_LIST):
- with open(STOP_LIST, 'r', encoding='UTF-8') as f:
- stopList = json.load(f)
-
- # function to load single stop info
- req_stop_list_limit = asyncio.Semaphore(get_request_limit())
- async def getStop ( stopId ):
- async with req_stop_list_limit:
- r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/'+stopId, a_client)
- return r.json()['data']
-
- # function to async load multiple stops info
- async def getStopList ( stops ):
- ret = await asyncio.gather(*[getStop(stop) for stop in stops])
- return ret
-
- req_route_stop_limit = asyncio.Semaphore(get_request_limit())
- async def getRouteStop(param):
- co, route = param
- if route.get('bound', 0) != 0 or route.get('stops', {}):
- return route
- route['stops'] = {}
- for direction in ['inbound', 'outbound']:
- r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/'+co.upper()+'/'+route['route']+"/"+direction, a_client)
- route['stops'][direction] = [stop['stop'] for stop in r.json()['data']]
- return route
-
- async def getRouteStopList ():
- ret = await asyncio.gather(*[getRouteStop((co, route))
- for route in routeList])
- return ret
-
- routeList = await getRouteStopList()
- for route in routeList:
- for direction, stops in route['stops'].items():
- for stopId in stops:
- _stops.append(stopId)
-
- # load stops for this route aync
- _stops = list(set(_stops))
- _stops.sort()
-
- stopInfos = list( zip ( _stops, await getStopList(_stops)) )
- for stopId, stopInfo in stopInfos:
- stopList[stopId] = stopInfo
-
- _routeList = []
- for route in routeList:
- if route.get('bound', 0) != 0:
- _routeList.append(route)
- continue
- for bound in ['inbound', 'outbound']:
- if len(route['stops'][bound]) > 0:
- _routeList.append({
- 'co': co,
- 'route': route['route'],
- 'bound': 'O' if bound == 'outbound' else 'I',
- 'orig_en': route['orig_en'] if bound == 'outbound' else route['dest_en'],
- 'orig_tc': route['orig_tc'] if bound == 'outbound' else route['dest_tc'],
- 'dest_en': route['dest_en'] if bound == 'outbound' else route['orig_en'],
- 'dest_tc': route['dest_tc'] if bound == 'outbound' else route['orig_tc'],
- 'stops': list(filter(lambda stopId: bool(stopList[stopId]), route['stops'][bound])),
- 'serviceType': 0
- })
-
- with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
- f.write(json.dumps(_routeList, ensure_ascii=False))
- with open(STOP_LIST, 'w', encoding='UTF-8') as f:
- f.write(json.dumps(stopList, ensure_ascii=False))
-
-if __name__=='__main__':
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(getRouteStop('ctb'))
+ a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
+ # define output name
+ ROUTE_LIST = 'routeList.' + co + '.json'
+ STOP_LIST = 'stopList.' + co + '.json'
+
+ # load route list and stop list if exist
+ routeList = {}
+ if path.isfile(ROUTE_LIST):
+ return
+ else:
+ # load routes
+ r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/' + co, a_client)
+ routeList = r.json()['data']
+
+ _stops = []
+ stopList = {}
+ if path.isfile(STOP_LIST):
+ with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+ stopList = json.load(f)
+
+ # function to load single stop info
+ req_stop_list_limit = asyncio.Semaphore(get_request_limit())
+
+ async def getStop(stopId):
+ async with req_stop_list_limit:
+ r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/' + stopId, a_client)
+ return r.json()['data']
+
+ # function to async load multiple stops info
+ async def getStopList(stops):
+ ret = await asyncio.gather(*[getStop(stop) for stop in stops])
+ return ret
+
+ req_route_stop_limit = asyncio.Semaphore(get_request_limit())
+
+ async def getRouteStop(param):
+ co, route = param
+ if route.get('bound', 0) != 0 or route.get('stops', {}):
+ return route
+ route['stops'] = {}
+ for direction in ['inbound', 'outbound']:
+            async with req_route_stop_limit: r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/' + co.upper() + '/' + route['route'] + "/" + direction, a_client)
+ route['stops'][direction] = [stop['stop'] for stop in r.json()['data']]
+ return route
+
+ async def getRouteStopList():
+ ret = await asyncio.gather(*[getRouteStop((co, route))
+ for route in routeList])
+ return ret
+
+ routeList = await getRouteStopList()
+ for route in routeList:
+ for direction, stops in route['stops'].items():
+ for stopId in stops:
+ _stops.append(stopId)
+
+ # load stops for this route aync
+ _stops = sorted(set(_stops))
+
+ stopInfos = list(zip(_stops, await getStopList(_stops)))
+ for stopId, stopInfo in stopInfos:
+ stopList[stopId] = stopInfo
+
+ _routeList = []
+ for route in routeList:
+ if route.get('bound', 0) != 0:
+ _routeList.append(route)
+ continue
+ for bound in ['inbound', 'outbound']:
+ if len(route['stops'][bound]) > 0:
+ _routeList.append({
+ 'co': co,
+ 'route': route['route'],
+ 'bound': 'O' if bound == 'outbound' else 'I',
+ 'orig_en': route['orig_en'] if bound == 'outbound' else route['dest_en'],
+ 'orig_tc': route['orig_tc'] if bound == 'outbound' else route['dest_tc'],
+ 'dest_en': route['dest_en'] if bound == 'outbound' else route['orig_en'],
+ 'dest_tc': route['dest_tc'] if bound == 'outbound' else route['orig_tc'],
+ 'stops': list(filter(lambda stopId: bool(stopList[stopId]), route['stops'][bound])),
+ 'serviceType': 0
+ })
+
+ with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
+ f.write(json.dumps(_routeList, ensure_ascii=False))
+ with open(STOP_LIST, 'w', encoding='UTF-8') as f:
+ f.write(json.dumps(stopList, ensure_ascii=False))
+
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+ asyncio.run(getRouteStop('ctb'))
diff --git a/crawling/fortuneferry.py b/crawling/fortuneferry.py
index 8a8d9822..188cd13b 100644
--- a/crawling/fortuneferry.py
+++ b/crawling/fortuneferry.py
@@ -11,11 +11,11 @@
gtfsEn = json.load(f)
routes = {
- "7059": ["中環", "紅磡"],
- "7021": ["北角", "啟德"],
- "7056": ["北角", "觀塘"],
- "7025": ["屯門", "大澳"],
- "7000004": ["東涌", "大澳"],
+ "7059": ["中環", "紅磡"],
+ "7021": ["北角", "啟德"],
+ "7056": ["北角", "觀塘"],
+ "7025": ["屯門", "大澳"],
+ "7000004": ["東涌", "大澳"],
}
routeList = []
@@ -24,42 +24,43 @@
for [route_code, [orig, dest]] in routes.items():
for route_id, gtfsRoute in gtfsRoutes.items():
if "ferry" in gtfsRoute["co"]:
- if orig.lower() == gtfsRoute["orig"]["zh"].lower() and dest.lower() == gtfsRoute["dest"]["zh"].lower():
+ if orig.lower() == gtfsRoute["orig"]["zh"].lower(
+ ) and dest.lower() == gtfsRoute["dest"]["zh"].lower():
routeList.append({
- "gtfsId": route_id,
- "route": route_code,
- "orig_tc": gtfsRoute["orig"]["zh"],
- "orig_en": gtfsEn["routeList"][route_id]["orig"]["en"],
- "dest_tc": gtfsRoute["dest"]["zh"],
- "dest_en": gtfsEn["routeList"][route_id]["dest"]["en"],
- "service_type": 1,
- "bound": "O",
- "stops": gtfsRoute["stops"]["1"],
- "freq": gtfsRoute["freq"]["1"],
- })
- if "2" in gtfsRoute["freq"]:
- routeList.append({
"gtfsId": route_id,
"route": route_code,
- "dest_tc": gtfsRoute["orig"]["zh"],
- "dest_en": gtfsEn["routeList"][route_id]["orig"]["en"],
- "orig_tc": gtfsRoute["dest"]["zh"],
- "orig_en": gtfsEn["routeList"][route_id]["dest"]["en"],
+ "orig_tc": gtfsRoute["orig"]["zh"],
+ "orig_en": gtfsEn["routeList"][route_id]["orig"]["en"],
+ "dest_tc": gtfsRoute["dest"]["zh"],
+ "dest_en": gtfsEn["routeList"][route_id]["dest"]["en"],
"service_type": 1,
- "bound": "I",
- "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
- "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
+ "bound": "O",
+ "stops": gtfsRoute["stops"]["1"],
+ "freq": gtfsRoute["freq"]["1"],
+ })
+ if "2" in gtfsRoute["freq"]:
+ routeList.append({
+ "gtfsId": route_id,
+ "route": route_code,
+ "dest_tc": gtfsRoute["orig"]["zh"],
+ "dest_en": gtfsEn["routeList"][route_id]["orig"]["en"],
+ "orig_tc": gtfsRoute["dest"]["zh"],
+ "orig_en": gtfsEn["routeList"][route_id]["dest"]["en"],
+ "service_type": 1,
+ "bound": "I",
+ "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
+ "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
})
for route in routeList:
for stopId in route["stops"]:
stopList[stopId] = {
- "stop": stopId,
- "name_en": gtfsEn["stopList"][stopId]["stopName"]["unknown"],
- "name_tc": gtfsStops[stopId]["stopName"]["unknown"],
- "lat": gtfsStops[stopId]["lat"],
- "long": gtfsStops[stopId]["lng"],
+ "stop": stopId,
+ "name_en": gtfsEn["stopList"][stopId]["stopName"]["unknown"],
+ "name_tc": gtfsStops[stopId]["stopName"]["unknown"],
+ "lat": gtfsStops[stopId]["lat"],
+ "long": gtfsStops[stopId]["lng"],
}
with open('routeList.fortuneferry.json', 'w', encoding='UTF-8') as f:
diff --git a/crawling/gmb.py b/crawling/gmb.py
index c5b79d14..bce4c084 100644
--- a/crawling/gmb.py
+++ b/crawling/gmb.py
@@ -10,6 +10,7 @@
logger = logging.getLogger(__name__)
+
async def getRouteStop(co):
a_client = httpx.AsyncClient()
# parse gtfs service_id
@@ -18,7 +19,14 @@ async def getRouteStop(co):
reader = csv.reader(csvfile)
headers = next(reader, None)
for [service_id, mon, tue, wed, thur, fri, sat, sun, *tmp] in reader:
- serviceIdMap[service_id] = [mon == "1", tue == "1", wed == "1", thur == "1", fri == "1", sat == "1", sun == "1"]
+ serviceIdMap[service_id] = [
+ mon == "1",
+ tue == "1",
+ wed == "1",
+ thur == "1",
+ fri == "1",
+ sat == "1",
+ sun == "1"]
serviceIdMap["111"] = [True, True, False, True, True, True, True]
def mapServiceId(weekdays, serviceIdMap_a):
@@ -31,12 +39,12 @@ def mapServiceId(weekdays, serviceIdMap_a):
def getFreq(headways, serviceIdMap_a):
freq = {}
for headway in headways:
- service_id = mapServiceId( headway['weekdays'] , serviceIdMap_a)
+ service_id = mapServiceId(headway['weekdays'], serviceIdMap_a)
if service_id not in freq:
freq[service_id] = {}
freq[service_id][headway['start_time'].replace(':', '')[:4]] = [
- headway['end_time'].replace(':', '')[:4],
- str(headway['frequency'] * 60)
+ headway['end_time'].replace(':', '')[:4],
+ str(headway['frequency'] * 60)
] if headway['frequency'] is not None else None
return freq
@@ -48,7 +56,7 @@ def getFreq(headways, serviceIdMap_a):
async def get_route_directions(route, route_no):
service_type = 2
for direction in route['directions']:
- rs = await emitRequest('https://data.etagmb.gov.hk/route-stop/'+str(route['route_id'])+'/'+str(direction['route_seq']), a_client)
+ rs = await emitRequest('https://data.etagmb.gov.hk/route-stop/' + str(route['route_id']) + '/' + str(direction['route_seq']), a_client)
for stop in rs.json()['data']['route_stops']:
stop_id = stop['stop_id']
@@ -118,41 +126,42 @@ async def get_route_directions(route, route_no):
"name_tc": useNameTc,
}
routeList.append({
- "gtfsId": str(route['route_id']),
- "route": route_no,
- "orig_tc": direction['orig_tc'],
- "orig_en": direction['orig_en'],
- "dest_tc": direction['dest_tc'],
- "dest_en": direction['dest_en'],
- "bound": 'O' if direction['route_seq'] == 1 else 'I',
- "service_type": 1 if route["description_tc"] == '正常班次' else service_type,
- "stops": [str(stop['stop_id']) for stop in rs.json()['data']['route_stops']],
- "freq": getFreq(direction['headways'], serviceIdMap)
+ "gtfsId": str(route['route_id']),
+ "route": route_no,
+ "orig_tc": direction['orig_tc'],
+ "orig_en": direction['orig_en'],
+ "dest_tc": direction['dest_tc'],
+ "dest_en": direction['dest_en'],
+ "bound": 'O' if direction['route_seq'] == 1 else 'I',
+ "service_type": 1 if route["description_tc"] == '正常班次' else service_type,
+ "stops": [str(stop['stop_id']) for stop in rs.json()['data']['route_stops']],
+ "freq": getFreq(direction['headways'], serviceIdMap)
})
- #print(routeList)
+ # print(routeList)
if route["description_tc"] != '正常班次':
service_type += 1
-
+
req_route_limit = asyncio.Semaphore(get_request_limit())
- async def get_route(region:str, route_no):
+
+ async def get_route(region: str, route_no):
async with req_route_limit:
- r = await emitRequest('https://data.etagmb.gov.hk/route/'+region+'/'+route_no, a_client)
+ r = await emitRequest('https://data.etagmb.gov.hk/route/' + region + '/' + route_no, a_client)
await asyncio.gather(*[get_route_directions(route, route_no) for route in r.json()['data']])
- routeList.sort(key = lambda a: a['gtfsId'])
+ routeList.sort(key=lambda a: a['gtfsId'])
req_route_region_limit = asyncio.Semaphore(get_request_limit())
+
async def get_routes_region(region: str):
async with req_route_region_limit:
- r = await emitRequest('https://data.etagmb.gov.hk/route/'+region, a_client)
+ r = await emitRequest('https://data.etagmb.gov.hk/route/' + region, a_client)
await asyncio.gather(*[get_route(region, route) for route in r.json()['data']['routes']])
-
+
await asyncio.gather(*[get_routes_region(r) for r in ['HKI', 'KLN', "NT"]])
with open(f'routeList.{co}.json', 'w', encoding='UTF-8') as f:
json.dump(routeList, f, ensure_ascii=False)
logger.info("Route done")
-
req_stops_limit = asyncio.Semaphore(get_request_limit())
with open("gtfs.json", "r", encoding='UTF-8') as f:
gtfs = json.load(f)
@@ -162,9 +171,11 @@ async def update_stop_loc(stop_id):
if stop_id not in gtfsStops:
logger.info(f"Getting stop {stop_id} from etagmb")
async with req_stops_limit:
- r = await emitRequest('https://data.etagmb.gov.hk/stop/'+str(stop_id), a_client)
- stops[stop_id]['lat'] = r.json()['data']['coordinates']['wgs84']['latitude']
- stops[stop_id]['long'] = r.json()['data']['coordinates']['wgs84']['longitude']
+ r = await emitRequest('https://data.etagmb.gov.hk/stop/' + str(stop_id), a_client)
+ stops[stop_id]['lat'] = r.json(
+ )['data']['coordinates']['wgs84']['latitude']
+ stops[stop_id]['long'] = r.json(
+ )['data']['coordinates']['wgs84']['longitude']
else:
logger.debug(f"Getting stop {stop_id} from gtfs")
stops[stop_id]['lat'] = gtfsStops[stop_id]['lat']
@@ -173,7 +184,7 @@ async def update_stop_loc(stop_id):
await asyncio.gather(*[update_stop_loc(stop_id) for stop_id in sorted(stops.keys())])
with open(f'stopList.{co}.json', 'w', encoding='UTF-8') as f:
- json.dump(stops,f, ensure_ascii=False)
+ json.dump(stops, f, ensure_ascii=False)
for stop in stopCandidates:
stopCandidates[stop]["tc_others"].discard(stopCandidates[stop]["tc_used"])
stopCandidates[stop]["tc_others"] = sorted(
@@ -184,11 +195,11 @@ async def update_stop_loc(stop_id):
with open(f'stopCandidates.{co}.json', 'w', encoding='UTF-8') as f:
def set_default(obj):
if isinstance(obj, set):
- return list(obj)
+ return list(obj)
raise TypeError
json.dump(stopCandidates, f, ensure_ascii=False, default=set_default)
-if __name__=='__main__':
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(getRouteStop('gmb'))
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+ asyncio.run(getRouteStop('gmb'))
diff --git a/crawling/hkkf.py b/crawling/hkkf.py
index 1ef45350..963e09bd 100644
--- a/crawling/hkkf.py
+++ b/crawling/hkkf.py
@@ -10,18 +10,20 @@
from crawl_utils import emitRequest
routes = {
- "1": ["Central Pier 4", "Sok Kwu Wan"],
- "2": ["Central Pier 4", "Yung Shue Wan"],
- "3": ["Central Pier 6", "Peng Chau"],
- "4": ["Peng Chau", "Hei Ling Chau"],
+ "1": ["Central Pier 4", "Sok Kwu Wan"],
+ "2": ["Central Pier 4", "Yung Shue Wan"],
+ "3": ["Central Pier 6", "Peng Chau"],
+ "4": ["Peng Chau", "Hei Ling Chau"],
}
+
def parseStop(name_en, apiStops):
for stop in apiStops:
if stop["name_en"].startswith(name_en):
return stop
raise Exception("Undefined stop")
+
async def getRouteStop(co):
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
routeList = []
@@ -30,11 +32,10 @@ async def getRouteStop(co):
r = await emitRequest('https://www.hkkfeta.com/opendata/route/', a_client)
apiRoutes = r.json()['data']
apiStops = []
- for stopId in [1,2,3,4,5,6]:
- stop = (await emitRequest('https://www.hkkfeta.com/opendata/pier/'+str(stopId), a_client)).json()["data"]
+ for stopId in [1, 2, 3, 4, 5, 6]:
+ stop = (await emitRequest('https://www.hkkfeta.com/opendata/pier/' + str(stopId), a_client)).json()["data"]
apiStops.append(stop)
-
with open("gtfs.json", 'r', encoding="utf-8") as f:
gtfsZh = json.load(f)
@@ -47,41 +48,41 @@ async def getRouteStop(co):
orig = parseStop(routes[str(apiRoute["route_id"])][0], apiStops)
dest = parseStop(routes[str(apiRoute["route_id"])][1], apiStops)
routeList.append({
- "route": "KF" + str(apiRoute["route_id"]),
- "orig_tc": orig["name_tc"],
- "orig_en": orig["name_en"],
- "dest_tc": dest["name_tc"],
- "dest_en": dest["name_en"],
- "service_type": 1,
- "bound": "O",
- "stops": [
- "KF" + str(orig["pier_id"]),
- "KF" + str(dest["pier_id"]),
- ],
- "co": "hkkf",
+ "route": "KF" + str(apiRoute["route_id"]),
+ "orig_tc": orig["name_tc"],
+ "orig_en": orig["name_en"],
+ "dest_tc": dest["name_tc"],
+ "dest_en": dest["name_en"],
+ "service_type": 1,
+ "bound": "O",
+ "stops": [
+ "KF" + str(orig["pier_id"]),
+ "KF" + str(dest["pier_id"]),
+ ],
+ "co": "hkkf",
})
routeList.append({
- "route": "KF" + str(apiRoute["route_id"]),
- "orig_tc": dest["name_tc"],
- "orig_en": dest["name_en"],
- "dest_tc": orig["name_tc"],
- "dest_en": orig["name_en"],
- "service_type": 1,
- "bound": "I",
- "stops": [
- "KF" + str(dest["pier_id"]),
- "KF" + str(orig["pier_id"]),
- ],
- "co": "hkkf",
+ "route": "KF" + str(apiRoute["route_id"]),
+ "orig_tc": dest["name_tc"],
+ "orig_en": dest["name_en"],
+ "dest_tc": orig["name_tc"],
+ "dest_en": orig["name_en"],
+ "service_type": 1,
+ "bound": "I",
+ "stops": [
+ "KF" + str(dest["pier_id"]),
+ "KF" + str(orig["pier_id"]),
+ ],
+ "co": "hkkf",
})
for apiStop in apiStops:
- stopList["KF"+str(apiStop["pier_id"])] = {
- "stop": "KF"+str(apiStop["pier_id"]),
- "name_en": apiStop["name_en"],
- "name_tc": apiStop["name_tc"],
- "lat": apiStop["lat"],
- "long": apiStop["long"]
+ stopList["KF" + str(apiStop["pier_id"])] = {
+ "stop": "KF" + str(apiStop["pier_id"]),
+ "name_en": apiStop["name_en"],
+ "name_tc": apiStop["name_tc"],
+ "lat": apiStop["lat"],
+ "long": apiStop["long"]
}
with open('routeList.hkkf.json', 'w', encoding="utf-8") as f:
@@ -90,7 +91,7 @@ async def getRouteStop(co):
with open('stopList.hkkf.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(stopList, ensure_ascii=False))
-if __name__=='__main__':
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(getRouteStop('hkkf'))
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+ asyncio.run(getRouteStop('hkkf'))
diff --git a/crawling/kmb.py b/crawling/kmb.py
index 89358137..d09265be 100644
--- a/crawling/kmb.py
+++ b/crawling/kmb.py
@@ -9,69 +9,74 @@
from crawl_utils import emitRequest
+
async def getRouteStop():
- a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
- # define output name
- ROUTE_LIST = 'routeList.kmb.json'
- STOP_LIST = 'stopList.kmb.json'
+ a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
+ # define output name
+ ROUTE_LIST = 'routeList.kmb.json'
+ STOP_LIST = 'stopList.kmb.json'
+
+ stopList = {}
+ if path.isfile(STOP_LIST):
+ with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+ stopList = json.load(f)
+ else:
+ # load stops
+ r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/stop', a_client)
+ _stopList = r.json()['data']
+ for stop in _stopList:
+ stopList[stop['stop']] = stop
- stopList = {}
- if path.isfile(STOP_LIST):
- with open(STOP_LIST, 'r', encoding='UTF-8') as f:
- stopList = json.load(f)
- else:
- # load stops
- r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/stop', a_client)
- _stopList = r.json()['data']
- for stop in _stopList:
- stopList[stop['stop']] = stop
+ def isStopExist(stopId):
+ if stopId not in stopList:
+ print("Not exist stop: ", stopId, file=sys.stderr)
+ return stopId in stopList
- def isStopExist( stopId ):
- if stopId not in stopList:
- print ("Not exist stop: ", stopId, file=sys.stderr)
- return stopId in stopList
+ # load route list and stop list if exist
+ routeList = {}
+ if path.isfile(ROUTE_LIST):
+ return
+ else:
+ # load routes
+ r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route/', a_client)
+ for route in r.json()['data']:
+ route['stops'] = {}
+ route['co'] = 'kmb'
+ routeList['+'.join([route['route'],
+ route['service_type'],
+ route['bound']])] = route
- # load route list and stop list if exist
- routeList = {}
- if path.isfile(ROUTE_LIST):
- return
- else:
- # load routes
- r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route/', a_client)
- for route in r.json()['data']:
- route['stops'] = {}
- route['co'] = 'kmb'
- routeList['+'.join([route['route'], route['service_type'], route['bound']])] = route
+ # load route stops
+ r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route-stop/', a_client)
+ for stop in r.json()['data']:
+ routeKey = '+'.join([stop['route'], stop['service_type'], stop['bound']])
+ if routeKey in routeList:
+ routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
+ else:
+ # if route not found, clone it from service type = 1
+ _routeKey = '+'.join([stop['route'], str('1'), stop['bound']])
+ routeList[routeKey] = copy.deepcopy(routeList[_routeKey])
+ routeList[routeKey]['stops'] = {}
+ routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
- # load route stops
- r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route-stop/', a_client)
- for stop in r.json()['data']:
- routeKey = '+'.join([stop['route'], stop['service_type'], stop['bound']])
- if routeKey in routeList:
- routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
- else:
- # if route not found, clone it from service type = 1
- _routeKey = '+'.join([stop['route'], str('1'), stop['bound']])
- routeList[routeKey] = copy.deepcopy(routeList[_routeKey])
- routeList[routeKey]['stops'] = {}
- routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
+ # flatten the route stops back to array
+ for routeKey in routeList.keys():
+ stops = [routeList[routeKey]['stops'][seq]
+ for seq in sorted(routeList[routeKey]['stops'].keys())]
+ # filter non-exist stops
+ stops = list(filter(isStopExist, stops))
+ routeList[routeKey]['stops'] = stops
- # flatten the route stops back to array
- for routeKey in routeList.keys():
- stops = [routeList[routeKey]['stops'][seq] for seq in sorted(routeList[routeKey]['stops'].keys())]
- # filter non-exist stops
- stops = list(filter(isStopExist, stops))
- routeList[routeKey]['stops'] = stops
+ # flatten the routeList back to array
+ routeList = [routeList[routeKey]
+ for routeKey in routeList.keys() if not routeKey.startswith('K')]
- # flatten the routeList back to array
- routeList = [routeList[routeKey] for routeKey in routeList.keys() if not routeKey.startswith('K')]
-
- with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
- f.write(json.dumps(routeList, ensure_ascii=False))
- with open(STOP_LIST, 'w', encoding='UTF-8') as f:
- f.write(json.dumps(stopList, ensure_ascii=False))
+ with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
+ f.write(json.dumps(routeList, ensure_ascii=False))
+ with open(STOP_LIST, 'w', encoding='UTF-8') as f:
+ f.write(json.dumps(stopList, ensure_ascii=False))
-if __name__=='__main__':
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(getRouteStop())
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+ asyncio.run(getRouteStop())
diff --git a/crawling/lightRail.py b/crawling/lightRail.py
index 14c5b331..4a7508ef 100644
--- a/crawling/lightRail.py
+++ b/crawling/lightRail.py
@@ -10,7 +10,8 @@
from crawl_utils import emitRequest
-async def getRouteStop(co = 'lightRail'):
+
+async def getRouteStop(co='lightRail'):
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')
@@ -19,51 +20,53 @@ async def getRouteStop(co = 'lightRail'):
stopList = {}
r = await emitRequest('https://opendata.mtr.com.hk/data/light_rail_routes_and_stops.csv', a_client)
- reader = csv.reader(r.text.split("\n") )
- headers = next(reader,None)
+ reader = csv.reader(r.text.split("\n"))
+ headers = next(reader, None)
routes = [route for route in reader if len(route) == 7]
for [route, bound, stopCode, stopId, chn, eng, seq] in routes:
- if route+"_"+bound not in routeList:
- routeList[route+"_"+bound] = {
- "gtfsId": None,
- "route": route,
- "bound": "O" if bound == "1" else "I",
- "service_type": "1",
- "orig_tc": None,
- "orig_en": None,
- "dest_tc": None,
- "dest_en": None,
- "stops": [],
- "fare": []
+ if route + "_" + bound not in routeList:
+ routeList[route + "_" + bound] = {
+ "gtfsId": None,
+ "route": route,
+ "bound": "O" if bound == "1" else "I",
+ "service_type": "1",
+ "orig_tc": None,
+ "orig_en": None,
+ "dest_tc": None,
+ "dest_en": None,
+ "stops": [],
+ "fare": []
}
if seq == "1.00":
- routeList[route+"_"+bound]["orig_tc"] = chn
- routeList[route+"_"+bound]["orig_en"] = eng
- routeList[route+"_"+bound]["dest_tc"] = chn
- routeList[route+"_"+bound]["dest_en"] = eng
- routeList[route+"_"+bound]["stops"].append("LR"+stopId)
- if "LR"+stopId not in stopList:
- url='https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=輕鐵-'+chn
+ routeList[route + "_" + bound]["orig_tc"] = chn
+ routeList[route + "_" + bound]["orig_en"] = eng
+ routeList[route + "_" + bound]["dest_tc"] = chn
+ routeList[route + "_" + bound]["dest_en"] = eng
+ routeList[route + "_" + bound]["stops"].append("LR" + stopId)
+ if "LR" + stopId not in stopList:
+ url = 'https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=輕鐵-' + chn
r = await emitRequest(url, a_client, headers={'Accept': 'application/json'})
try:
- lat, lng = epsgTransformer.transform( r.json()[0]['y'], r.json()[0]['x'] )
- stopList["LR"+stopId] = {
- "stop": "LR"+stopId,
- "name_en": eng,
- "name_tc": chn,
- "lat": lat,
- "long": lng
+ lat, lng = epsgTransformer.transform(
+ r.json()[0]['y'], r.json()[0]['x'])
+ stopList["LR" + stopId] = {
+ "stop": "LR" + stopId,
+ "name_en": eng,
+ "name_tc": chn,
+ "lat": lat,
+ "long": lng
}
- except:
+            except Exception:
logger.exception(f"Error parsing {url}: {r.text}")
raise
with open('routeList.lightRail.json', 'w', encoding='UTF-8') as f:
- f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
+ f.write(json.dumps([route for route in routeList.values()
+ if len(route['stops']) > 0], ensure_ascii=False))
with open('stopList.lightRail.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/lrtfeeder.py b/crawling/lrtfeeder.py
index 9d570a3c..3452f746 100644
--- a/crawling/lrtfeeder.py
+++ b/crawling/lrtfeeder.py
@@ -10,66 +10,68 @@
from crawl_utils import emitRequest
-async def getRouteStop(co = 'lrtfeeder'):
+
+async def getRouteStop(co='lrtfeeder'):
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
routeList = {}
stopList = {}
r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_routes.csv', a_client)
r.encoding = 'utf-8'
- reader = csv.reader(r.text.split("\n") )
- headers = next(reader,None)
+ reader = csv.reader(r.text.split("\n"))
+ headers = next(reader, None)
routes = [route for route in reader if len(route) == 4]
for [route, chn, eng, circular] in routes:
if route == '':
continue
start = {
- "zh": chn.split('至')[0],
- "en": eng.split(' to ')[0]
+ "zh": chn.split('至')[0],
+ "en": eng.split(' to ')[0]
}
end = {
- "zh": chn.split('至')[1],
- "en": eng.split(' to ')[1]
+ "zh": chn.split('至')[1],
+ "en": eng.split(' to ')[1]
}
for bound in ['I', 'O']:
- routeList[route+"_"+bound] = {
- "route": route,
- "bound": bound,
- "service_type": "1",
- "orig_tc": start['zh'] if bound == 'O' else end['zh'],
- "dest_tc": end["zh"] if bound == 'O' else start['zh'],
- "orig_en": start['en'] if bound == 'O' else end['en'],
- "dest_en": end["en"] if bound == 'O' else start['en'],
- "stops": [],
- "co": "lrtfeeder"
+ routeList[route + "_" + bound] = {
+ "route": route,
+ "bound": bound,
+ "service_type": "1",
+ "orig_tc": start['zh'] if bound == 'O' else end['zh'],
+ "dest_tc": end["zh"] if bound == 'O' else start['zh'],
+ "orig_en": start['en'] if bound == 'O' else end['en'],
+ "dest_en": end["en"] if bound == 'O' else start['en'],
+ "stops": [],
+ "co": "lrtfeeder"
}
# Parse stops
r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_stops.csv', a_client)
r.encoding = 'utf-8'
- reader = csv.reader(r.text.split("\n") )
- headers = next(reader,None)
+ reader = csv.reader(r.text.split("\n"))
+ headers = next(reader, None)
stops = [stop for stop in reader if len(stop) == 8]
for [route, bound, seq, stationId, lat, lng, name_zh, name_en] in stops:
- routeKey = route+"_"+bound
+ routeKey = route + "_" + bound
if routeKey in routeList:
routeList[routeKey]['stops'].append(stationId)
else:
- print ("error", routeKey)
+ print("error", routeKey)
stopList[stationId] = {
- "stop": stationId,
- "name_en": name_en,
- "name_tc": name_zh,
- "lat": lat,
- "long": lng
+ "stop": stationId,
+ "name_en": name_en,
+ "name_tc": name_zh,
+ "lat": lat,
+ "long": lng
}
with open('routeList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
- f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
+ f.write(json.dumps([route for route in routeList.values()
+ if len(route['stops']) > 0], ensure_ascii=False))
with open('stopList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/matchGtfs.py b/crawling/matchGtfs.py
index 12cf7046..8a76eef7 100644
--- a/crawling/matchGtfs.py
+++ b/crawling/matchGtfs.py
@@ -10,23 +10,28 @@
gtfsRoutes = gtfs['routeList']
gtfsStops = gtfs['stopList']
+
def isNameMatch(name_a, name_b):
- tmp_a = name_a.lower()
- tmp_b = name_b.lower()
- return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
+ tmp_a = name_a.lower()
+ tmp_b = name_b.lower()
+ return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
# ctb routes only give list of stops in topological order
# the actual servicing routes may skip some stop in the coStops
# this DP function is trying to map the coStops back to GTFS stops
-def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
- co = 'unknown' if co not in gtfsStops[0]['stopName'] else co # handle unknown stop
+
+
+def matchStopsByDp(coStops, gtfsStops, co, debug=False):
+ # handle unknown stop
+ co = 'unknown' if co not in gtfsStops[0]['stopName'] else co
if len(gtfsStops) > len(coStops) + 1:
return [], INFINITY_DIST
if len(gtfsStops) - len(coStops) == 1:
gtfsStops = gtfsStops[:-1]
-
+
# initialization
- distSum = [[INFINITY_DIST for x in range(len(coStops)+1) ] for y in range(len(gtfsStops)+1)]
+ distSum = [[INFINITY_DIST for x in range(
+ len(coStops) + 1)] for y in range(len(gtfsStops) + 1)]
for j in range(len(coStops) - len(gtfsStops) + 1):
distSum[0][j] = 0
@@ -35,17 +40,17 @@ def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
gtfsStop = gtfsStops[i]
for j in range(len(coStops)):
coStop = coStops[j]
- dist = ( 0
- if coStop['name_tc'] == gtfsStop['stopName'][co]
- else haversine(
- (float(coStop['lat']), float(coStop['long'])),
- (gtfsStop['lat'], gtfsStop['lng'])
- ) * 1000
- )
-
- distSum[i+1][j+1] = min(
- distSum[i][j] + dist, # from previous stops of both sides
- distSum[i+1][j] # skipping current coStops
+ dist = (0
+ if coStop['name_tc'] == gtfsStop['stopName'][co]
+ else haversine(
+ (float(coStop['lat']), float(coStop['long'])),
+ (gtfsStop['lat'], gtfsStop['lng'])
+ ) * 1000
+ )
+
+ distSum[i + 1][j + 1] = min(
+ distSum[i][j] + dist, # from previous stops of both sides
+ distSum[i + 1][j] # skipping current coStops
)
# fast return if no good result
@@ -57,34 +62,35 @@ def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
j = len(coStops)
ret = []
while i > 0 and j > 0:
- if distSum[i][j] == distSum[i][j-1]:
+ if distSum[i][j] == distSum[i][j - 1]:
j -= 1
else:
- ret.append( ( i-1, j-1 ) )
+ ret.append((i - 1, j - 1))
i -= 1
j -= 1
ret.reverse()
-
+
# penalty distance is given for not exact match route
- penalty = sum([abs(a-b) for a, b in ret]) * 0.01
-
+ penalty = sum([abs(a - b) for a, b in ret]) * 0.01
+
return ret, min(distSum[len(gtfsStops)]) / len(gtfsStops) + penalty
def mergeRouteAsCircularRoute(routeA, routeB):
return {
- "co": routeA['co'],
- "route": routeA["route"],
- "bound": routeA["bound"] + routeB["bound"],
- "orig_en": routeA["orig_en"],
- "orig_tc": routeA["orig_tc"],
- "dest_en": routeB["dest_en"],
- "dest_tc": routeB["dest_tc"],
- "serviceType": routeA["serviceType"],
- "stops": routeA['stops'] + routeB['stops'],
- "virtual": True,
+ "co": routeA['co'],
+ "route": routeA["route"],
+ "bound": routeA["bound"] + routeB["bound"],
+ "orig_en": routeA["orig_en"],
+ "orig_tc": routeA["orig_tc"],
+ "dest_en": routeB["dest_en"],
+ "dest_tc": routeB["dest_tc"],
+ "serviceType": routeA["serviceType"],
+ "stops": routeA['stops'] + routeB['stops'],
+ "virtual": True,
}
+
def getVirtualCircularRoutes(routeList, routeNo):
indices = []
for idx, route in enumerate(routeList):
@@ -92,7 +98,7 @@ def getVirtualCircularRoutes(routeList, routeNo):
indices.append(idx)
if len(indices) != 2:
return []
-
+
ret = []
routeA = routeList[indices[0]]
routeB = routeList[indices[1]]
@@ -100,94 +106,128 @@ def getVirtualCircularRoutes(routeList, routeNo):
return []
return [
- mergeRouteAsCircularRoute(routeA, routeB),
- mergeRouteAsCircularRoute(routeB, routeA)
+ mergeRouteAsCircularRoute(routeA, routeB),
+ mergeRouteAsCircularRoute(routeB, routeA)
]
+
def printStopMatches(bestMatch, gtfsStops, stopList, co):
- stopPair = [(bestMatch[4][gtfsStopIdx], bestMatch[5]["stops"][routeStopIdx]) for gtfsStopIdx, routeStopIdx in bestMatch[2]]
- print (bestMatch[3], bestMatch[0], bestMatch[1])
- print ("\t|\t".join(["運輸處", co]))
- print ("\n".join([
- str(idx + 1) + " " + "\t|\t".join(
- [gtfsStops[gtfsId]["stopName"][co], stopList[stopId]["name_tc"]]) for idx, (gtfsId, stopId) in enumerate(stopPair)]
- )
- )
- print ()
+ stopPair = [(bestMatch[4][gtfsStopIdx], bestMatch[5]["stops"][routeStopIdx])
+ for gtfsStopIdx, routeStopIdx in bestMatch[2]]
+ print(bestMatch[3], bestMatch[0], bestMatch[1])
+ print("\t|\t".join(["運輸處", co]))
+ print("\n".join([str(idx + 1) + " " + "\t|\t".join([gtfsStops[gtfsId]["stopName"][co],
+ stopList[stopId]["name_tc"]]) for idx, (gtfsId, stopId) in enumerate(stopPair)]))
+ print()
+
def matchRoutes(co):
- print (co)
- with open( 'routeList.%s.json' % co, 'r', encoding="utf-8" ) as f:
+ print(co)
+ with open('routeList.%s.json' % co, 'r', encoding="utf-8") as f:
routeList = json.load(f)
- with open( 'stopList.%s.json' % co, 'r', encoding="utf-8" ) as f:
+ with open('stopList.%s.json' % co, 'r', encoding="utf-8") as f:
stopList = json.load(f)
routeCandidates = []
# one pass to find matches of co vs gtfs by DP
for gtfsId, gtfsRoute in gtfsRoutes.items():
debug = False and gtfsId == '1047' and gtfsRoute['orig']['zh'] == '沙田站'
- if co == 'gmb' and co in gtfsRoute['co']: # handle for gmb
+ if co == 'gmb' and co in gtfsRoute['co']: # handle for gmb
for route in routeList:
if route['gtfsId'] == gtfsId:
- route['fares'] = [gtfsRoute['fares']['1'][0] for i in range(len(route['stops'])-1) ]
- elif ( co == "sunferry" or co == "fortuneferry" ) and "ferry" in gtfsRoute['co']:
+ route['fares'] = [gtfsRoute['fares']['1'][0]
+ for i in range(len(route['stops']) - 1)]
+ elif (co == "sunferry" or co == "fortuneferry") and "ferry" in gtfsRoute['co']:
for route in routeList:
if route['gtfsId'] == gtfsId:
- route['fares'] = [gtfsRoute['fares']['1'][0] for i in range(len(route['stops'])-1) ]
- elif co in gtfsRoute['co'] or ( co == "hkkf" and 'ferry' in gtfsRoute['co'] ): # handle for other companies
+ route['fares'] = [gtfsRoute['fares']['1'][0]
+ for i in range(len(route['stops']) - 1)]
+ # handle for other companies
+ elif co in gtfsRoute['co'] or (co == "hkkf" and 'ferry' in gtfsRoute['co']):
for bound, stops in gtfsRoute['stops'].items():
bestMatch = (-1, INFINITY_DIST)
- for route in routeList + getVirtualCircularRoutes(routeList, gtfsRoute['route']):
- if ( co in gtfsRoute['co'] and route['route'] == gtfsRoute['route'] ) or \
- ( co == 'hkkf' and ( ( route["orig_tc"].startswith(gtfsRoute['orig']['zh']) and route["dest_tc"].startswith(gtfsRoute['dest']['zh']) ) or
- ( route["orig_tc"].startswith(gtfsRoute['dest']['zh']) and route["dest_tc"].startswith(gtfsRoute['orig']['zh']) ) ) ):
- ret, avgDist = matchStopsByDp([stopList[stop] for stop in route['stops']], [gtfsStops[stop] for stop in stops], co, debug)
+ for route in routeList + \
+ getVirtualCircularRoutes(routeList, gtfsRoute['route']):
+ if (
+ co in gtfsRoute['co'] and route['route'] == gtfsRoute['route']) or (
+ co == 'hkkf' and (
+ (route["orig_tc"].startswith(
+ gtfsRoute['orig']['zh']) and route["dest_tc"].startswith(
+ gtfsRoute['dest']['zh'])) or (
+ route["orig_tc"].startswith(
+ gtfsRoute['dest']['zh']) and route["dest_tc"].startswith(
+ gtfsRoute['orig']['zh'])))):
+ ret, avgDist = matchStopsByDp([stopList[stop] for stop in route['stops']], [
+ gtfsStops[stop] for stop in stops], co, debug)
if avgDist < bestMatch[1]:
bestMatch = (gtfsId, avgDist, ret, bound, stops, route)
- if bestMatch[1] < DIST_DIFF: # assume matching to be avg stop distance diff is lower than 100
+ # assume matching to be avg stop distance diff is lower than 100
+ if bestMatch[1] < DIST_DIFF:
ret, bound, stops, route = bestMatch[2:]
-
+
routeCandidate = route.copy()
- if (len(ret) == len(route['stops']) or len(ret) + 1 == len(route['stops'])) and 'gtfs' not in route and "virtual" not in route:
- routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
+ if (
+ len(ret) == len(
+ route['stops']) or len(ret) +
+ 1 == len(
+ route['stops'])) and 'gtfs' not in route and "virtual" not in route:
+ routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]
+ ] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
routeCandidate['freq'] = gtfsRoute['freq'][bound]
routeCandidate['jt'] = gtfsRoute['jt']
- routeCandidate['co'] = gtfsRoute['co'] if co in gtfsRoute['co'] else ( gtfsRoute['co'] + [co] )
+ routeCandidate['co'] = gtfsRoute['co'] if co in gtfsRoute['co'] else (
+ gtfsRoute['co'] + [co])
routeCandidate['stops'] = [route['stops'][j] for i, j in ret]
routeCandidate['gtfs'] = [gtfsId]
route['found'] = True
else:
routeCandidate['stops'] = [route['stops'][j] for i, j in ret]
- routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
+ routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]
+ ] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
routeCandidate['freq'] = gtfsRoute['freq'][bound]
routeCandidate['jt'] = gtfsRoute['jt']
routeCandidate['co'] = gtfsRoute['co']
routeCandidate['orig_tc'] = stopList[routeCandidate['stops'][0]]['name_tc']
routeCandidate['orig_en'] = stopList[routeCandidate['stops'][0]]['name_en']
- routeCandidate['dest_tc'] = stopList[routeCandidate['stops'][-1]]['name_tc']
- routeCandidate['dest_en'] = stopList[routeCandidate['stops'][-1]]['name_en']
+ routeCandidate['dest_tc'] = stopList[routeCandidate['stops']
+ [-1]]['name_tc']
+ routeCandidate['dest_en'] = stopList[routeCandidate['stops']
+ [-1]]['name_en']
routeCandidate['service_type'] = "2" if 'found' in route else "1"
routeCandidate['gtfs'] = [gtfsId]
- route['found'] = True # mark the route has mapped to GTFS, mainly for ctb routes
+ # mark the route has mapped to GTFS, mainly for ctb routes
+ route['found'] = True
routeCandidates.append(routeCandidate)
if '_route' not in gtfsRoute:
gtfsRoute['_route'] = {}
gtfsRoute['_route'][co] = route.copy()
elif co in gtfsRoute['co']:
- print(co, gtfsRoute['route'], 'cannot match any in GTFS', file=sys.stderr)
-
+ print(
+ co,
+ gtfsRoute['route'],
+ 'cannot match any in GTFS',
+ file=sys.stderr)
+
for route in routeList:
if 'gtfs' not in route:
route['co'] = [co]
-
- print (co, len([route for route in routeList if 'gtfs' not in route]), 'out of',len(routeList), 'not match')
- if co != 'mtr': routeList.extend(routeCandidates)
- routeList = [route for route in routeList if 'found' not in route or 'fares' in route] # skipping routes that just partially mapped to GTFS
- with open( 'routeFareList.%s.json' % co, 'w', encoding='UTF-8' ) as f:
+ print(co,
+ len([route for route in routeList if 'gtfs' not in route]),
+ 'out of',
+ len(routeList),
+ 'not match')
+ if co != 'mtr':
+ routeList.extend(routeCandidates)
+ # skipping routes that just partially mapped to GTFS
+ routeList = [
+ route for route in routeList if 'found' not in route or 'fares' in route]
+
+ with open('routeFareList.%s.json' % co, 'w', encoding='UTF-8') as f:
f.write(json.dumps(routeList, ensure_ascii=False))
-
+
+
matchRoutes('kmb')
matchRoutes('ctb')
matchRoutes('nlb')
@@ -208,5 +248,5 @@ def matchRoutes(co):
routeFareList = {}
-with open( 'routeGtfs.all.json', 'w', encoding='UTF-8' ) as f:
- f.write(json.dumps(gtfsRoutes, ensure_ascii=False, indent=4))
\ No newline at end of file
+with open('routeGtfs.all.json', 'w', encoding='UTF-8') as f:
+ f.write(json.dumps(gtfsRoutes, ensure_ascii=False, indent=4))
diff --git a/crawling/mergeRoutes.py b/crawling/mergeRoutes.py
index 89a5e7f4..62e0c6eb 100644
--- a/crawling/mergeRoutes.py
+++ b/crawling/mergeRoutes.py
@@ -6,71 +6,106 @@
stopList = {}
stopMap = {}
-def getRouteObj ( route, co, stops, bound, orig, dest, seq, fares, faresHoliday, freq, jt, nlbId, gtfsId, serviceType = 1):
+
+def getRouteObj(
+ route,
+ co,
+ stops,
+ bound,
+ orig,
+ dest,
+ seq,
+ fares,
+ faresHoliday,
+ freq,
+ jt,
+ nlbId,
+ gtfsId,
+ serviceType=1):
return {
- 'route': route,
- 'co': co,
- 'stops': stops,
- 'serviceType': serviceType,
- 'bound': bound,
- 'orig': orig,
- 'dest': dest,
- 'fares': fares,
- 'faresHoliday': faresHoliday,
- 'freq': freq,
- 'jt': jt,
- 'nlbId': nlbId,
- 'gtfsId': gtfsId,
- 'seq': seq
+ 'route': route,
+ 'co': co,
+ 'stops': stops,
+ 'serviceType': serviceType,
+ 'bound': bound,
+ 'orig': orig,
+ 'dest': dest,
+ 'fares': fares,
+ 'faresHoliday': faresHoliday,
+ 'freq': freq,
+ 'jt': jt,
+ 'nlbId': nlbId,
+ 'gtfsId': gtfsId,
+ 'seq': seq
}
+
def isGtfsMatch(knownRoute, newRoute):
- if knownRoute['gtfsId'] is None: return True
- if 'gtfs' not in newRoute: return True
+ if knownRoute['gtfsId'] is None:
+ return True
+ if 'gtfs' not in newRoute:
+ return True
return knownRoute['gtfsId'] in newRoute['gtfs']
-
-def importRouteListJson( co ):
- _routeList = json.load(open('routeFareList.%s.cleansed.json'%co, 'r', encoding='UTF-8'))
- _stopList = json.load(open('stopList.%s.json'%co, 'r', encoding='UTF-8'))
+
+
+def importRouteListJson(co):
+ _routeList = json.load(
+ open(
+ 'routeFareList.%s.cleansed.json' %
+ co, 'r', encoding='UTF-8'))
+ _stopList = json.load(open('stopList.%s.json' % co, 'r', encoding='UTF-8'))
for stopId, stop in _stopList.items():
if stopId not in stopList:
try:
stopList[stopId] = {
- 'name': {
- 'en': stop['name_en'],
- 'zh': stop['name_tc']
- },
- 'location': {
- 'lat': float(stop['lat']),
- 'lng': float(stop['long'])
- }
+ 'name': {
+ 'en': stop['name_en'],
+ 'zh': stop['name_tc']
+ },
+ 'location': {
+ 'lat': float(stop['lat']),
+ 'lng': float(stop['long'])
+ }
}
- except:
+ except BaseException:
print("Problematic stop: ", stopId, file=stderr)
-
+
for _route in _routeList:
found = False
speicalType = 1
- orig = {'en': _route['orig_en'].replace('/', '/'), 'zh': _route['orig_tc'].replace('/', '/')}
- dest = {'en': _route['dest_en'].replace('/', '/'), 'zh': _route['dest_tc'].replace('/', '/')}
-
+ orig = {
+ 'en': _route['orig_en'].replace(
+ '/',
+ '/'),
+ 'zh': _route['orig_tc'].replace(
+ '/',
+ '/')}
+ dest = {
+ 'en': _route['dest_en'].replace(
+ '/',
+ '/'),
+ 'zh': _route['dest_tc'].replace(
+ '/',
+ '/')}
+
for route in routeList:
- if _route['route'] == route['route'] and co in route['co'] and isGtfsMatch(route, _route):
+ if _route['route'] == route['route'] and co in route['co'] and isGtfsMatch(
+ route, _route):
# skip checking if the bound is not the same
if co in route["bound"] and route['bound'][co] != _route['bound']:
continue
-
+
if len(_route['stops']) == route['seq']:
dist = 0
merge = True
- for stop_a, stop_b in zip( _route['stops'], route['stops'][0][1] ):
+ for stop_a, stop_b in zip(_route['stops'], route['stops'][0][1]):
stop_a = stopList[stop_a]
stop_b = stopList[stop_b]
- dist = haversine(
- (stop_a['location']['lat'], stop_a['location']['lng']),
- (stop_b['location']['lat'], stop_b['location']['lng']),
- unit=Unit.METERS # specify that we want distance in metres, default unit is km
+ dist = haversine(
+ (stop_a['location']['lat'], stop_a['location']['lng']),
+ (stop_b['location']['lat'], stop_b['location']['lng']),
+ unit=Unit.METERS # specify that we want distance in metres, default unit is km
)
merge = merge and dist < 300
if merge:
@@ -81,37 +116,41 @@ def importRouteListJson( co ):
speicalType = int(route['serviceType']) + 1
if _route["route"] == '606' and _route['dest_tc'].startswith("彩雲"):
print("Yes", speicalType)
-
+
if not found:
- routeList.append(
- getRouteObj(
- route = _route['route'],
- co = _route['co'],
- serviceType = _route.get('service_type', speicalType),
- stops = [(co, _route['stops'])],
- bound = {co: _route['bound']},
- orig = orig,
- dest = dest,
- fares = _route.get('fares', None),
- faresHoliday = _route.get('faresHoliday', None),
- freq = _route.get('freq', None),
- jt = _route.get('jt', None),
- nlbId = _route.get('id', None),
- gtfsId = _route.get('gtfsId', _route.get('gtfs', [None])[0]),
- seq = len(_route['stops'])
- )
+ routeList.append(
+ getRouteObj(
+ route=_route['route'],
+ co=_route['co'],
+ serviceType=_route.get('service_type', speicalType),
+ stops=[(co, _route['stops'])],
+ bound={co: _route['bound']},
+ orig=orig,
+ dest=dest,
+ fares=_route.get('fares', None),
+ faresHoliday=_route.get('faresHoliday', None),
+ freq=_route.get('freq', None),
+ jt=_route.get('jt', None),
+ nlbId=_route.get('id', None),
+ gtfsId=_route.get('gtfsId', _route.get('gtfs', [None])[0]),
+ seq=len(_route['stops'])
+ )
)
-def isMatchStops(stops_a, stops_b, debug = False):
+
+def isMatchStops(stops_a, stops_b, debug=False):
if len(stops_a) != len(stops_b):
return False
for v in stops_a:
- if stopMap.get(v, [[None,None]])[0][1] in stops_b:
+ if stopMap.get(v, [[None, None]])[0][1] in stops_b:
return True
return False
+
def getRouteId(v):
- return '%s+%s+%s+%s'%(v['route'], v['serviceType'], v['orig']['en'], v['dest']['en'])
+ return '%s+%s+%s+%s' % (v['route'], v['serviceType'],
+ v['orig']['en'], v['dest']['en'])
+
def smartUnique():
_routeList = []
@@ -120,29 +159,30 @@ def smartUnique():
continue
founds = []
# compare route one-by-one
- for j in range(i+1, len(routeList)):
+ for j in range(i + 1, len(routeList)):
if routeList[i]['route'] == routeList[j]['route'] \
- and len(routeList[i]['stops']) == len(routeList[j]['stops']) \
- and len([co for co in routeList[i]['co'] if co in routeList[j]['co']]) == 0 \
- and isMatchStops(routeList[i]['stops'][0][1], routeList[j]['stops'][0][1]):
- founds.append( j )
+ and len(routeList[i]['stops']) == len(routeList[j]['stops']) \
+ and len([co for co in routeList[i]['co'] if co in routeList[j]['co']]) == 0 \
+ and isMatchStops(routeList[i]['stops'][0][1], routeList[j]['stops'][0][1]):
+ founds.append(j)
elif routeList[i]['route'] == routeList[j]['route'] \
- and str(routeList[i]['serviceType']) == str(routeList[j]['serviceType']) \
- and routeList[i]['orig']['en'] == routeList[j]['orig']['en'] \
- and routeList[i]['dest']['en'] == routeList[j]['dest']['en']:
- routeList[j]['serviceType'] = str(int(routeList[j]['serviceType'])+1)
+ and str(routeList[i]['serviceType']) == str(routeList[j]['serviceType']) \
+ and routeList[i]['orig']['en'] == routeList[j]['orig']['en'] \
+ and routeList[i]['dest']['en'] == routeList[j]['dest']['en']:
+ routeList[j]['serviceType'] = str(int(routeList[j]['serviceType']) + 1)
# update obj
for found in founds:
routeList[i]['co'].extend(routeList[found]['co'])
- routeList[i]['stops'].extend( routeList[found]['stops'] )
+ routeList[i]['stops'].extend(routeList[found]['stops'])
routeList[found]['skip'] = True
# append return array
_routeList.append(routeList[i])
return _routeList
-
+
+
importRouteListJson('kmb')
importRouteListJson('ctb')
importRouteListJson('nlb')
@@ -158,21 +198,30 @@ def smartUnique():
route['stops'] = {co: stops for co, stops in route['stops']}
holidays = json.load(open('holiday.json', 'r', encoding='UTF-8'))
-serviceDayMap = json.load(open('gtfs.json', 'r', encoding='UTF-8'))['serviceDayMap']
+serviceDayMap = json.load(
+ open(
+ 'gtfs.json',
+ 'r',
+ encoding='UTF-8'))['serviceDayMap']
+
def standardizeDict(d):
- return {key: value if not isinstance(value, dict) else standardizeDict(value) for key, value in sorted(d.items())}
+ return {
+ key: value if not isinstance(
+ value, dict) else standardizeDict(value) for key, value in sorted(
+ d.items())}
+
db = standardizeDict({
- 'routeList': {getRouteId(v): v for v in routeList},
- 'stopList': stopList,
- 'stopMap': stopMap,
- 'holidays': holidays,
- 'serviceDayMap': serviceDayMap,
+ 'routeList': {getRouteId(v): v for v in routeList},
+ 'stopList': stopList,
+ 'stopMap': stopMap,
+ 'holidays': holidays,
+ 'serviceDayMap': serviceDayMap,
})
-with open( 'routeFareList.mergeRoutes.json', 'w', encoding='UTF-8' ) as f:
+with open('routeFareList.mergeRoutes.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(db, ensure_ascii=False, indent=4))
-with open( 'routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8' ) as f:
+with open('routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(db, ensure_ascii=False, separators=(',', ':')))
diff --git a/crawling/mergeStopList.py b/crawling/mergeStopList.py
index f00789ce..e6c73007 100644
--- a/crawling/mergeStopList.py
+++ b/crawling/mergeStopList.py
@@ -4,233 +4,274 @@
import time
from haversine import haversine, Unit
-def get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id):
- DISTANCE_THRESHOLD = 50 # in metres
- BEARING_THRESHOLD = 45 # in degrees
- STOP_LIST_LIMIT = 50 # max number of stops in a group
-
- def get_stops_haversine_distance(stop_a, stop_b):
- return haversine(
- (stop_a['location']['lat'], stop_a['location']['lng']),
- (stop_b['location']['lat'], stop_b['location']['lng']),
- unit=Unit.METERS # specify that we want distance in meter, default is km
- )
-
- bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
-
- def is_bearing_in_range(bearing):
- if BEARING_THRESHOLD >= 180 or not bearing_targets:
- return True
- for target in bearing_targets:
- bearing_min = target - BEARING_THRESHOLD
- bearing_max = target + BEARING_THRESHOLD
- if bearing_min < 0:
- bearing_min += 360
- if bearing_max > 360:
- bearing_max -= 360
- if (bearing_min <= bearing <= bearing_max or
- (bearing_min > bearing_max and (bearing <= bearing_max or bearing >= bearing_min))):
- return True
- return False
-
- def search_nearby_stops(target_stop_id, excluded_stop_id_list):
- target_stop = stop_list[target_stop_id]
- # take lat/lng up to 3 decimal places, that's about 100m x 100m square
- lat = int(target_stop['location']['lat'] * 1000)
- lng = int(target_stop['location']['lng'] * 1000)
-
- nearby_stops = []
- for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
- if (stop_id not in excluded_stop_id_list and get_stops_haversine_distance(target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
- bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
- if any(is_bearing_in_range(b) for b in bearings):
- nearby_stops.append({
- 'id': stop_id,
- 'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
- })
- return nearby_stops
-
- stop_group = []
- stop_list_entries = search_nearby_stops(stop_id, [])
-
- # recursively search for nearby stops within thresholds (distance and bearing)
- # stop searching when no new stops are found within range, or when stop list is getting too large
- i = 0
- while i < len(stop_list_entries):
- entry = stop_list_entries[i]
- stop_group.append([entry['co'], entry['id']])
- i += 1
- if len(stop_list_entries) < STOP_LIST_LIMIT:
- stop_list_entries.extend(search_nearby_stops(entry['id'], [e['id'] for e in stop_list_entries]))
-
- # to reduce size of routeFareList.min.json, excl current stop_id from final output stopMap
- return [stop for stop in stop_group if stop[1] != stop_id]
- # return stop_group
+
+def get_stop_group(
+ route_list,
+ stop_list,
+ stop_seq_mapping,
+ stop_list_grid,
+ stop_id):
+ DISTANCE_THRESHOLD = 50 # in metres
+ BEARING_THRESHOLD = 45 # in degrees
+ STOP_LIST_LIMIT = 50 # max number of stops in a group
+
+ def get_stops_haversine_distance(stop_a, stop_b):
+ return haversine(
+ (stop_a['location']['lat'], stop_a['location']['lng']),
+ (stop_b['location']['lat'], stop_b['location']['lng']),
+ unit=Unit.METERS # specify that we want distance in meter, default is km
+ )
+
+ bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
+
+ def is_bearing_in_range(bearing):
+ if BEARING_THRESHOLD >= 180 or not bearing_targets:
+ return True
+ for target in bearing_targets:
+ bearing_min = target - BEARING_THRESHOLD
+ bearing_max = target + BEARING_THRESHOLD
+ if bearing_min < 0:
+ bearing_min += 360
+ if bearing_max > 360:
+ bearing_max -= 360
+ if (
+ bearing_min <= bearing <= bearing_max or (
+ bearing_min > bearing_max and (
+ bearing <= bearing_max or bearing >= bearing_min))):
+ return True
+ return False
+
+ def search_nearby_stops(target_stop_id, excluded_stop_id_list):
+ target_stop = stop_list[target_stop_id]
+ # take lat/lng up to 3 decimal places, that's about 100m x 100m square
+ lat = int(target_stop['location']['lat'] * 1000)
+ lng = int(target_stop['location']['lng'] * 1000)
+
+ nearby_stops = []
+ for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
+ if (stop_id not in excluded_stop_id_list and get_stops_haversine_distance(
+ target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
+ bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
+ if any(is_bearing_in_range(b) for b in bearings):
+ nearby_stops.append({
+ 'id': stop_id,
+ 'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
+ })
+ return nearby_stops
+
+ stop_group = []
+ stop_list_entries = search_nearby_stops(stop_id, [])
+
+ # recursively search for nearby stops within thresholds (distance and bearing)
+ # stop searching when no new stops are found within range, or when stop
+ # list is getting too large
+ i = 0
+ while i < len(stop_list_entries):
+ entry = stop_list_entries[i]
+ stop_group.append([entry['co'], entry['id']])
+ i += 1
+ if len(stop_list_entries) < STOP_LIST_LIMIT:
+ stop_list_entries.extend(
+ search_nearby_stops(
+ entry['id'], [
+ e['id'] for e in stop_list_entries]))
+
+ # to reduce size of routeFareList.min.json, excl current stop_id from
+ # final output stopMap
+ return [stop for stop in stop_group if stop[1] != stop_id]
+ # return stop_group
+
def get_bearing(a, b):
- φ1 = math.radians(a['lat'])
- φ2 = math.radians(b['lat'])
- λ1 = math.radians(a['lng'])
- λ2 = math.radians(b['lng'])
-
- y = math.sin(λ2 - λ1) * math.cos(φ2)
- x = (math.cos(φ1) * math.sin(φ2) -
- math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
- θ = math.atan2(y, x)
- brng = (math.degrees(θ) + 360) % 360 # in degrees
- return brng
+ φ1 = math.radians(a['lat'])
+ φ2 = math.radians(b['lat'])
+ λ1 = math.radians(a['lng'])
+ λ2 = math.radians(b['lng'])
-def get_stop_bearings(route_stops):
- unique_routes = []
- bearings = []
- for route_stop in route_stops:
- if route_stop['bearing'] != -1:
- unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
- if unique_route not in unique_routes:
- unique_routes.append(unique_route)
- bearings.append(route_stop['bearing'])
-
- if not bearings:
- return []
-
- BEARING_THRESHOLD = 45 # in degrees
- BEARING_EPSILON = 10e-6 # very small number
- bearing_groups = []
-
- for bearing in bearings:
- if bearing == -1:
- continue
- if not bearing_groups:
- bearing_groups.append([bearing])
- continue
-
- for group in bearing_groups:
- if any(abs(b - bearing) < BEARING_EPSILON for b in group):
- break
- if any(abs(b - bearing) <= BEARING_THRESHOLD or abs(b - bearing) >= 360 - BEARING_THRESHOLD for b in group):
- group.append(bearing)
- break
- else:
- bearing_groups.append([bearing])
+ y = math.sin(λ2 - λ1) * math.cos(φ2)
+ x = (math.cos(φ1) * math.sin(φ2) -
+ math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
+ θ = math.atan2(y, x)
+ brng = (math.degrees(θ) + 360) % 360 # in degrees
+ return brng
- if len(bearing_groups) == 1:
- return bearing_groups[0]
- longest_length = max(len(group) for group in bearing_groups)
- return [b for group in bearing_groups if len(group) == longest_length for b in group]
+def get_stop_bearings(route_stops):
+ unique_routes = []
+ bearings = []
+ for route_stop in route_stops:
+ if route_stop['bearing'] != -1:
+ unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
+ if unique_route not in unique_routes:
+ unique_routes.append(unique_route)
+ bearings.append(route_stop['bearing'])
+
+ if not bearings:
+ return []
+
+ BEARING_THRESHOLD = 45 # in degrees
+ BEARING_EPSILON = 10e-6 # very small number
+ bearing_groups = []
+
+ for bearing in bearings:
+ if bearing == -1:
+ continue
+ if not bearing_groups:
+ bearing_groups.append([bearing])
+ continue
+
+ for group in bearing_groups:
+ if any(abs(b - bearing) < BEARING_EPSILON for b in group):
+ break
+ if any(
+ abs(
+ b -
+ bearing) <= BEARING_THRESHOLD or abs(
+ b -
+ bearing) >= 360 -
+ BEARING_THRESHOLD for b in group):
+ group.append(bearing)
+ break
+ else:
+ bearing_groups.append([bearing])
+
+ if len(bearing_groups) == 1:
+ return bearing_groups[0]
+
+ longest_length = max(len(group) for group in bearing_groups)
+ return [b for group in bearing_groups if len(
+ group) == longest_length for b in group]
# Main function to process stops
+
+
def merge_stop_list():
- # Read the result from previous pipeline
- with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f:
- db = json.load(f)
-
- route_list = db['routeList']
- stop_list = db['stopList']
- start_time = time.time()
- stop_seq_mapping = {}
-
- # Preprocess the list of bearings for each stop
- for route_key, route_list_entry in route_list.items():
- stops = route_list_entry.get('stops', {})
- for co, co_stops in stops.items():
- for stop_pos, stop_id in enumerate(co_stops):
- if stop_id not in stop_seq_mapping:
- stop_seq_mapping[stop_id] = {"routeStops": [], "co": co, "bearings": []}
- if stop_pos == len(co_stops) - 1:
- stop_seq_mapping[stop_id]['routeStops'].append({
- 'routeKey': route_key,
- 'co': co,
- 'seq': stop_pos,
- 'bearing': -1
- })
- else:
- bearing = get_bearing(stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location'])
- stop_seq_mapping[stop_id]['routeStops'].append({
- 'routeKey': route_key,
- 'co': co,
- 'seq': stop_pos,
- 'bearing': bearing
- })
-
- for stop_id in stop_seq_mapping.keys():
- stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings(stop_seq_mapping[stop_id]['routeStops'])
-
- # Just dump the json in case of a need for trouble-shooting, but otherwise we do not need this file
- with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f:
- json.dump(stop_seq_mapping, f)
-
- logger.info(f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms')
-
- # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce size of nested loop later
- stop_list_grid = {}
- for stop_id, stop in stop_list.items():
- # take lat/lng up to 3 decimal places, that's about 100m x 100m square
- lat = int(stop['location']['lat'] * 1000)
- lng = int(stop['location']['lng'] * 1000)
- # add stop into the 9 grid boxes surrounding this stop
- grid = [
- f"{lat - 1}_{lng - 1}",
- f"{lat }_{lng - 1}",
- f"{lat + 1}_{lng - 1}",
- f"{lat - 1}_{lng }",
- f"{lat }_{lng }",
- f"{lat + 1}_{lng }",
- f"{lat - 1}_{lng + 1}",
- f"{lat }_{lng + 1}",
- f"{lat + 1}_{lng + 1}",
- ]
- for grid_id in grid:
- if grid_id not in stop_list_grid:
- stop_list_grid[grid_id] = []
- stop_list_grid[grid_id].append(stop_id)
-
- target_stop_list = list(stop_list.items())
- stop_map = {}
- count = 0
- group_count = 0
-
- for stop_id, stop in target_stop_list:
- count += 1
- # if count % 1000 == 0:
- # logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
-
- stop_group = get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id)
- if len(stop_group) > 0:
- group_count += 1
- stop_map[stop_id] = stop_group
-
- logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
-
- with open('stopMap.json', 'w', encoding='UTF-8') as f:
- json.dump(stop_map, f, indent=4)
-
- db['stopMap'] = stop_map
-
- with open('routeFareList.json', 'w', encoding='UTF-8') as f:
- json.dump(db, f, indent=4)
-
- # reduce size of routeFareList.min.json by rounding lat/lng values to 5 decimal places
- # 5 d.p. is roughly one-metre accuracy, it is good enough for this project
- # saves around 50kb in size for 14,000 stops
- for stop_id, stop in target_stop_list:
- stop_list[stop_id]['location']['lat'] = float('%.5f' % (stop_list[stop_id]['location']['lat']))
- stop_list[stop_id]['location']['lng'] = float('%.5f' % (stop_list[stop_id]['location']['lng']))
-
- db['stopList'] = stop_list
-
- logger.info(f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms")
-
- with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f:
- json.dump(db, f, indent=4)
-
- with open('routeFareList.min.json', 'w', encoding='UTF-8') as f:
- json.dump(db, f)
-
- with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f:
- json.dump(db, f)
+ # Read the result from previous pipeline
+ with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f:
+ db = json.load(f)
+
+ route_list = db['routeList']
+ stop_list = db['stopList']
+ start_time = time.time()
+ stop_seq_mapping = {}
+
+ # Preprocess the list of bearings for each stop
+ for route_key, route_list_entry in route_list.items():
+ stops = route_list_entry.get('stops', {})
+ for co, co_stops in stops.items():
+ for stop_pos, stop_id in enumerate(co_stops):
+ if stop_id not in stop_seq_mapping:
+ stop_seq_mapping[stop_id] = {
+ "routeStops": [], "co": co, "bearings": []}
+ if stop_pos == len(co_stops) - 1:
+ stop_seq_mapping[stop_id]['routeStops'].append({
+ 'routeKey': route_key,
+ 'co': co,
+ 'seq': stop_pos,
+ 'bearing': -1
+ })
+ else:
+ bearing = get_bearing(
+ stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location'])
+ stop_seq_mapping[stop_id]['routeStops'].append({
+ 'routeKey': route_key,
+ 'co': co,
+ 'seq': stop_pos,
+ 'bearing': bearing
+ })
+
+ for stop_id in stop_seq_mapping.keys():
+ stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings(
+ stop_seq_mapping[stop_id]['routeStops'])
+
+ # Just dump the json in case of a need for trouble-shooting, but otherwise
+ # we do not need this file
+ with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f:
+ json.dump(stop_seq_mapping, f)
+
+ logger.info(
+ f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms')
+
+ # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce
+ # size of nested loop later
+ stop_list_grid = {}
+ for stop_id, stop in stop_list.items():
+ # take lat/lng up to 3 decimal places, that's about 100m x 100m square
+ lat = int(stop['location']['lat'] * 1000)
+ lng = int(stop['location']['lng'] * 1000)
+ # add stop into the 9 grid boxes surrounding this stop
+ grid = [
+ f"{lat - 1}_{lng - 1}",
+ f"{lat }_{lng - 1}",
+ f"{lat + 1}_{lng - 1}",
+ f"{lat - 1}_{lng }",
+ f"{lat }_{lng }",
+ f"{lat + 1}_{lng }",
+ f"{lat - 1}_{lng + 1}",
+ f"{lat }_{lng + 1}",
+ f"{lat + 1}_{lng + 1}",
+ ]
+ for grid_id in grid:
+ if grid_id not in stop_list_grid:
+ stop_list_grid[grid_id] = []
+ stop_list_grid[grid_id].append(stop_id)
+
+ target_stop_list = list(stop_list.items())
+ stop_map = {}
+ count = 0
+ group_count = 0
+
+ for stop_id, stop in target_stop_list:
+ count += 1
+ # if count % 1000 == 0:
+ # logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
+
+ stop_group = get_stop_group(
+ route_list,
+ stop_list,
+ stop_seq_mapping,
+ stop_list_grid,
+ stop_id)
+ if len(stop_group) > 0:
+ group_count += 1
+ stop_map[stop_id] = stop_group
+
+ logger.info(
+ f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
+
+ with open('stopMap.json', 'w', encoding='UTF-8') as f:
+ json.dump(stop_map, f, indent=4)
+
+ db['stopMap'] = stop_map
+
+ with open('routeFareList.json', 'w', encoding='UTF-8') as f:
+ json.dump(db, f, indent=4)
+
+ # reduce size of routeFareList.min.json by rounding lat/lng values to 5 decimal places
+ # 5 d.p. is roughly one-metre accuracy, it is good enough for this project
+ # saves around 50kb in size for 14,000 stops
+ for stop_id, stop in target_stop_list:
+ stop_list[stop_id]['location']['lat'] = float(
+ '%.5f' % (stop_list[stop_id]['location']['lat']))
+ stop_list[stop_id]['location']['lng'] = float(
+ '%.5f' % (stop_list[stop_id]['location']['lng']))
+
+ db['stopList'] = stop_list
+
+ logger.info(
+ f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms")
+
+ with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f:
+ json.dump(db, f, indent=4)
+
+ with open('routeFareList.min.json', 'w', encoding='UTF-8') as f:
+ json.dump(db, f)
+
+ with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f:
+ json.dump(db, f)
+
if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
- merge_stop_list()
\ No newline at end of file
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+ merge_stop_list()
diff --git a/crawling/mtr.py b/crawling/mtr.py
index 65ca6461..19dc363c 100644
--- a/crawling/mtr.py
+++ b/crawling/mtr.py
@@ -11,11 +11,13 @@
from crawl_utils import emitRequest
+
def filterStops(route):
route['stops'] = [stop for stop in route['stops'] if stop is not None]
return route
-async def getRouteStop(co = 'mtr'):
+
+async def getRouteStop(co='mtr'):
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')
@@ -24,48 +26,54 @@ async def getRouteStop(co = 'mtr'):
r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv', a_client)
r.encoding = 'utf-8'
- reader = csv.reader(r.text.split("\n") )
- headers = next(reader,None)
+ reader = csv.reader(r.text.split("\n"))
+ headers = next(reader, None)
routes = [route for route in reader if len(route) == 7]
for [route, bound, stopCode, stopId, chn, eng, seq] in routes:
if route == "":
continue
- if route+"_"+bound not in routeList:
- routeList[route+"_"+bound] = {
- "gtfsId": None,
- "route": route,
- "bound": bound,
- "service_type": "1",
- "orig_tc": None,
- "orig_en": None,
- "dest_tc": None,
- "dest_en": None,
- "stops": [None] * 100,
- "fare": []
+ if route + "_" + bound not in routeList:
+ routeList[route + "_" + bound] = {
+ "gtfsId": None,
+ "route": route,
+ "bound": bound,
+ "service_type": "1",
+ "orig_tc": None,
+ "orig_en": None,
+ "dest_tc": None,
+ "dest_en": None,
+ "stops": [None] * 100,
+ "fare": []
}
if int(float(seq)) == 1:
- routeList[route+"_"+bound]["orig_tc"] = chn
- routeList[route+"_"+bound]["orig_en"] = eng
- routeList[route+"_"+bound]["dest_tc"] = chn
- routeList[route+"_"+bound]["dest_en"] = eng
- routeList[route+"_"+bound]["stops"][int(float(seq))] = stopCode
+ routeList[route + "_" + bound]["orig_tc"] = chn
+ routeList[route + "_" + bound]["orig_en"] = eng
+ routeList[route + "_" + bound]["dest_tc"] = chn
+ routeList[route + "_" + bound]["dest_en"] = eng
+ routeList[route + "_" + bound]["stops"][int(float(seq))] = stopCode
if stopCode not in stopList:
- r = await emitRequest('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵'+chn+"站", a_client, headers={'Accept': 'application/json'})
- lat, lng = epsgTransformer.transform( r.json()[0]['y'], r.json()[0]['x'] )
+ r = await emitRequest('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵' + chn + "站", a_client, headers={'Accept': 'application/json'})
+ lat, lng = epsgTransformer.transform(r.json()[0]['y'], r.json()[0]['x'])
stopList[stopCode] = {
- "stop": stopCode,
- "name_en": eng,
- "name_tc": chn,
- "lat": lat,
- "long": lng
+ "stop": stopCode,
+ "name_en": eng,
+ "name_tc": chn,
+ "lat": lat,
+ "long": lng
}
with open('routeList.mtr.json', 'w', encoding='UTF-8') as f:
- f.write(json.dumps(list(map(filterStops, [route for route in routeList.values() if len(route['stops']) > 0])), ensure_ascii=False))
+ f.write(
+ json.dumps(
+ list(
+ map(
+ filterStops, [
+ route for route in routeList.values() if len(
+ route['stops']) > 0])), ensure_ascii=False))
with open('stopList.mtr.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/mtrExits.py b/crawling/mtrExits.py
index 6b3fe90e..a86fc857 100644
--- a/crawling/mtrExits.py
+++ b/crawling/mtrExits.py
@@ -1,4 +1,4 @@
- # -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
import asyncio
import logging
from crawl_utils import emitRequest
@@ -13,61 +13,69 @@
mtrStops = {}
epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')
+
def checkResult(results, q, stop, exit, barrierFree):
for result in results:
if result['nameZH'] == q:
- lat, lng = epsgTransformer.transform( result['y'], result['x'] )
+ lat, lng = epsgTransformer.transform(result['y'], result['x'])
res.append({
- "name_en": stop["name_en"],
- "name_zh": stop["name_tc"],
- "name": {
- "en": stop["name_en"],
- "zh": stop["name_tc"],
- },
- "exit": exit,
- "lat": lat,
- "lng": lng,
- "barrierFree": barrierFree,
+ "name_en": stop["name_en"],
+ "name_zh": stop["name_tc"],
+ "name": {
+ "en": stop["name_en"],
+ "zh": stop["name_tc"],
+ },
+ "exit": exit,
+ "lat": lat,
+ "lng": lng,
+ "barrierFree": barrierFree,
})
return True
- return False
+ return False
+
async def main():
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv', a_client)
r.encoding = 'utf-8'
- reader = csv.reader(r.text.strip().split("\n") )
- headers = next(reader,None)
+ reader = csv.reader(r.text.strip().split("\n"))
+ headers = next(reader, None)
for entry in reader:
mtrStops[entry[3]] = {
- "name_tc": entry[4],
- "name_en": entry[5],
+ "name_tc": entry[4],
+ "name_en": entry[5],
}
r = await emitRequest("https://opendata.mtr.com.hk/data/barrier_free_facilities.csv", a_client)
r.encoding = 'utf-8'
- reader = csv.reader(r.text.strip().split("\n") )
+ reader = csv.reader(r.text.strip().split("\n"))
for entry in reader:
if entry[2] == 'Y' and entry[3] != '':
- for exit in re.findall(" [A-Z][\d]*", entry[3]):
+ for exit in re.findall(" [A-Z][\\d]*", entry[3]):
if entry[0] in mtrStops:
mtrStops[entry[0]][exit.strip()] = True
-
+
# crawl exit geolocation
for key, stop in mtrStops.items():
- q = '港鐵'+stop['name_tc']+'站進出口'
- r = await emitRequest("https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q="+q, a_client)
+ q = '港鐵' + stop['name_tc'] + '站進出口'
+ r = await emitRequest("https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=" + q, a_client)
for char in string.ascii_uppercase:
- q = '港鐵'+stop['name_tc']+'站-'+str(char)+'進出口'
+ q = '港鐵' + stop['name_tc'] + '站-' + str(char) + '進出口'
checkResult(r.json(), q, stop, char, str(char) in stop)
- for i in range(1,10):
- q = '港鐵'+stop['name_tc']+'站-'+char+str(i)+'進出口'
- checkResult(r.json(), q, stop, char+str(i), (char+str(char)) in stop)
-
+ for i in range(1, 10):
+ q = '港鐵' + stop['name_tc'] + '站-' + char + str(i) + '進出口'
+ checkResult(
+ r.json(),
+ q,
+ stop,
+ char + str(i),
+ (char + str(char)) in stop)
+
with open('exits.mtr.json', 'w', encoding='UTF-8') as f:
- f.write(json.dumps(list({(v['name']['zh']+v['exit']): v for v in res}.values()), ensure_ascii=False))
+ f.write(json.dumps(list(
+ {(v['name']['zh'] + v['exit']): v for v in res}.values()), ensure_ascii=False))
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/crawling/nlb.py b/crawling/nlb.py
index 810b3053..547ebfaf 100644
--- a/crawling/nlb.py
+++ b/crawling/nlb.py
@@ -7,84 +7,85 @@
import httpx
-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
+
async def getRouteStop(co):
- # define output name
- ROUTE_LIST = 'routeList.'+co+'.json'
- STOP_LIST = 'stopList.'+co+'.json'
+ # define output name
+ ROUTE_LIST = 'routeList.' + co + '.json'
+ STOP_LIST = 'stopList.' + co + '.json'
+
+ a_client = httpx.AsyncClient()
+ # load route list and stop list if exist
+ routeList = []
+ if path.isfile(ROUTE_LIST):
+ logger.warning(f"{ROUTE_LIST} already exist, skipping...")
+ return
+ else:
+ # load routes
+ r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/route.php?action=list', a_client)
+ for route in r.json()['routes']:
+ routeList.append({
+ "id": route['routeId'],
+ "route": route['routeNo'],
+ "bound": "O",
+ "orig_en": route['routeName_e'].split(' > ')[0],
+ "orig_tc": route['routeName_c'].split(' > ')[0],
+ "dest_en": route['routeName_e'].split(' > ')[1],
+ "dest_tc": route['routeName_c'].split(' > ')[1],
+ "service_type": str(1 + route['overnightRoute'] * 2 + route['specialRoute'] * 4),
+ "stops": [],
+ "co": ["nlb"]
+ })
+ logger.info("Digested route list")
- a_client = httpx.AsyncClient()
- # load route list and stop list if exist
- routeList = []
- if path.isfile(ROUTE_LIST):
- logger.warning(f"{ROUTE_LIST} already exist, skipping...")
- return
- else:
- # load routes
- r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/route.php?action=list', a_client)
- for route in r.json()['routes']:
- routeList.append({
- "id": route['routeId'],
- "route": route['routeNo'],
- "bound": "O",
- "orig_en": route['routeName_e'].split(' > ')[0],
- "orig_tc": route['routeName_c'].split(' > ')[0],
- "dest_en": route['routeName_e'].split(' > ')[1],
- "dest_tc": route['routeName_c'].split(' > ')[1],
- "service_type": str(1 + route['overnightRoute'] * 2 + route['specialRoute'] *4),
- "stops": [],
- "co": ["nlb"]
- })
- logger.info("Digested route list")
+ stopList = {}
+ if path.isfile(STOP_LIST):
+ with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+ stopList = json.load(f)
- stopList = {}
- if path.isfile(STOP_LIST):
- with open(STOP_LIST, 'r', encoding='UTF-8') as f:
- stopList = json.load(f)
-
- async def getRouteStop(routeId):
- r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/stop.php?action=list&routeId='+routeId, a_client)
- try:
- return r.json()['stops']
- except Exception as err:
- print(r)
- raise err
+ async def getRouteStop(routeId):
+ r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/stop.php?action=list&routeId=' + routeId, a_client)
+ try:
+ return r.json()['stops']
+ except Exception as err:
+ print(r)
+ raise err
- async def addRouteStop(route):
- stops = await getRouteStop(route['id'])
- stopIds = []
- fares = []
- faresHoliday = []
- for stop in stops:
- if stop['stopId'] not in stopList:
- stopList[stop['stopId']] = {
- 'stop': stop['stopId'],
- 'name_en': stop['stopName_e'],
- 'name_tc': stop['stopName_c'],
- 'lat': stop['latitude'],
- 'long': stop['longitude']
- }
- stopIds.append(stop['stopId'])
- fares.append(stop['fare'])
- faresHoliday.append(stop['fareHoliday'])
- route['stops'] = stopIds
- route['fares'] = fares[0:-1]
- route['faresHoliday'] = faresHoliday[0:-1]
+ async def addRouteStop(route):
+ stops = await getRouteStop(route['id'])
+ stopIds = []
+ fares = []
+ faresHoliday = []
+ for stop in stops:
+ if stop['stopId'] not in stopList:
+ stopList[stop['stopId']] = {
+ 'stop': stop['stopId'],
+ 'name_en': stop['stopName_e'],
+ 'name_tc': stop['stopName_c'],
+ 'lat': stop['latitude'],
+ 'long': stop['longitude']
+ }
+ stopIds.append(stop['stopId'])
+ fares.append(stop['fare'])
+ faresHoliday.append(stop['fareHoliday'])
+ route['stops'] = stopIds
+ route['fares'] = fares[0:-1]
+ route['faresHoliday'] = faresHoliday[0:-1]
- async def getRouteStopList ():
- await asyncio.gather(*[addRouteStop(r) for r in routeList])
- logger.info("Digested stop list")
- return routeList
+ async def getRouteStopList():
+ await asyncio.gather(*[addRouteStop(r) for r in routeList])
+ logger.info("Digested stop list")
+ return routeList
- await getRouteStopList()
+ await getRouteStopList()
- with open(ROUTE_LIST, 'w', encoding='UTF-8') as rf, open(STOP_LIST, 'w', encoding='UTF-8') as sf:
- json.dump(routeList, rf, ensure_ascii=False)
- json.dump(stopList, sf, ensure_ascii=False)
- logger.info("Dumped lists")
+ with open(ROUTE_LIST, 'w', encoding='UTF-8') as rf, open(STOP_LIST, 'w', encoding='UTF-8') as sf:
+ json.dump(routeList, rf, ensure_ascii=False)
+ json.dump(stopList, sf, ensure_ascii=False)
+ logger.info("Dumped lists")
-if __name__=='__main__':
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('httpx').setLevel(logging.WARNING)
- asyncio.run(getRouteStop('nlb'))
\ No newline at end of file
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+ asyncio.run(getRouteStop('nlb'))
diff --git a/crawling/parseGtfs.py b/crawling/parseGtfs.py
index 74c09aa0..d9cdbd62 100644
--- a/crawling/parseGtfs.py
+++ b/crawling/parseGtfs.py
@@ -10,9 +10,11 @@
import re
from crawl_utils import emitRequest, store_version
+
def takeFirst(elem):
return int(elem[0])
+
async def parseGtfs():
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
if not path.isfile('gtfs.zip'):
@@ -33,24 +35,30 @@ async def parseGtfs():
with open('gtfs/routes.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [route_id, agency_id, route_short_name, route_long_name, route_type, route_url] in reader:
+ for [
+ route_id,
+ agency_id,
+ route_short_name,
+ route_long_name,
+ route_type,
+ route_url] in reader:
routeList[route_id] = {
- 'co': agency_id.replace('LWB', 'KMB').lower().split('+'),
- 'route': route_short_name,
- 'stops': {},
- 'fares': {},
- 'freq': {},
- 'orig': {
- 'zh': route_long_name.split(' - ')[0],
- 'en': '',
- },
- 'dest': {
- 'zh': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', ''),
- 'en': '',
- },
- 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None
+ 'co': agency_id.replace('LWB', 'KMB').lower().split('+'),
+ 'route': route_short_name,
+ 'stops': {},
+ 'fares': {},
+ 'freq': {},
+ 'orig': {
+ 'zh': route_long_name.split(' - ')[0],
+ 'en': '',
+ },
+ 'dest': {
+ 'zh': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', ''),
+ 'en': '',
+ },
+ 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None
}
-
+
# parse timetable
with open('gtfs/trips.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
@@ -69,13 +77,22 @@ async def parseGtfs():
headers = next(reader, None)
for [trip_id, _start_time, end_time, headway_secs] in reader:
[route_id, bound, calendar, start_time] = trip_id.split('-')
- routeList[route_id]['freq'][bound][calendar][start_time] = (end_time[0:5].replace(':', ''), headway_secs)
+ routeList[route_id]['freq'][bound][calendar][start_time] = (
+ end_time[0:5].replace(':', ''), headway_secs)
# parse stop seq
with open('gtfs/stop_times.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, timepoint] in reader:
+ for [
+ trip_id,
+ arrival_time,
+ departure_time,
+ stop_id,
+ stop_sequence,
+ pickup_type,
+ drop_off_type,
+ timepoint] in reader:
[route_id, bound, service_id, tmp] = trip_id.split('-')
if bound not in routeList[route_id]['stops']:
routeList[route_id]['stops'][bound] = {}
@@ -85,62 +102,78 @@ async def parseGtfs():
with open('gtfs/fare_attributes.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [fare_id,price,currency_type,payment_method,transfers,agency_id] in reader:
+ for [
+ fare_id,
+ price,
+ currency_type,
+ payment_method,
+ transfers,
+ agency_id] in reader:
[route_id, bound, on, off] = fare_id.split('-')
if bound not in routeList[route_id]['fares']:
routeList[route_id]['fares'][bound] = {}
- if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(off):
- routeList[route_id]['fares'][bound][on] = ('0' if price == '0.0000' else price, int(off))
+ if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(
+ off):
+ routeList[route_id]['fares'][bound][on] = (
+ '0' if price == '0.0000' else price, int(off))
- for route_id in routeList.keys():
+ for route_id in routeList.keys():
for bound in routeList[route_id]['stops'].keys():
_tmp = list(routeList[route_id]['stops'][bound].items())
_tmp.sort(key=takeFirst)
- routeList[route_id]['stops'][bound] = [v for k,v in _tmp]
+ routeList[route_id]['stops'][bound] = [v for k, v in _tmp]
for bound in routeList[route_id]['fares'].keys():
_tmp = list(routeList[route_id]['fares'][bound].items())
_tmp.sort(key=takeFirst)
- routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp]
+ routeList[route_id]['fares'][bound] = [v[0] for k, v in _tmp]
+
+ nameReg = re.compile('\\[(.*)\\] (.*)')
- nameReg = re.compile('\[(.*)\] (.*)')
def parseStopName(name):
ret = {}
for str in name.split('|'):
matches = nameReg.findall(str)
- if len(matches) == 0: return { "unknown": str }
+ if len(matches) == 0:
+ return {"unknown": str}
for co, gtfsName in matches:
            x, y = co.split('+'), gtfsName.split('/')
for i in range(len(x)):
ret[x[i].lower().replace('lwb', 'kmb')] = y[i if i < len(y) else 0]
return ret
-
with open('gtfs/stops.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,stop_timezone] in reader:
+ for [
+ stop_id,
+ stop_name,
+ stop_lat,
+ stop_lon,
+ zone_id,
+ location_type,
+ stop_timezone] in reader:
stopList[stop_id] = {
- 'stopId': stop_id,
- 'stopName': parseStopName(stop_name),
- 'lat': float(stop_lat),
- 'lng': float(stop_lon)
+ 'stopId': stop_id,
+ 'stopName': parseStopName(stop_name),
+ 'lat': float(stop_lat),
+ 'lng': float(stop_lon)
}
with open('gtfs/calendar.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
for line in reader:
- [service_id,mon,tue,wed,thur,fri,sat,sun, start_date, end_date] = line
+ [service_id, mon, tue, wed, thur, fri, sat, sun, start_date, end_date] = line
serviceDayMap[service_id] = [sun, mon, tue, wed, thur, fri, sat]
with open('gtfs.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps({
- 'routeList': routeList,
- 'stopList': stopList,
- "serviceDayMap": serviceDayMap,
+ 'routeList': routeList,
+ 'stopList': stopList,
+ "serviceDayMap": serviceDayMap,
}, ensure_ascii=False, indent=2))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/parseGtfsEn.py b/crawling/parseGtfsEn.py
index 17b1b216..acb3e463 100644
--- a/crawling/parseGtfsEn.py
+++ b/crawling/parseGtfsEn.py
@@ -10,9 +10,11 @@
import re
from crawl_utils import emitRequest, store_version
+
def takeFirst(elem):
return int(elem[0])
+
async def parseGtfs():
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
if not path.isfile('gtfs-en.zip'):
@@ -33,24 +35,30 @@ async def parseGtfs():
with open('gtfs-en/routes.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [route_id, agency_id, route_short_name, route_long_name, route_type, route_url] in reader:
+ for [
+ route_id,
+ agency_id,
+ route_short_name,
+ route_long_name,
+ route_type,
+ route_url] in reader:
routeList[route_id] = {
- 'co': agency_id.replace('LWB', 'KMB').lower().split('+'),
- 'route': route_short_name if route_short_name != "" else route_id,
- 'stops': {},
- 'fares': {},
- 'freq': {},
- 'orig': {
- 'zh': '',
- 'en': route_long_name.split(' - ')[0]
- },
- 'dest': {
- 'zh': '',
- 'en': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', '')
- },
- 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None
+ 'co': agency_id.replace('LWB', 'KMB').lower().split('+'),
+ 'route': route_short_name if route_short_name != "" else route_id,
+ 'stops': {},
+ 'fares': {},
+ 'freq': {},
+ 'orig': {
+ 'zh': '',
+ 'en': route_long_name.split(' - ')[0]
+ },
+ 'dest': {
+ 'zh': '',
+ 'en': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', '')
+ },
+ 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None
}
-
+
# parse timetable
with open('gtfs-en/trips.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
@@ -69,13 +77,22 @@ async def parseGtfs():
headers = next(reader, None)
for [trip_id, _start_time, end_time, headway_secs] in reader:
[route_id, bound, calendar, start_time] = trip_id.split('-')
- routeList[route_id]['freq'][bound][calendar][start_time] = (end_time[0:5].replace(':', ''), headway_secs)
+ routeList[route_id]['freq'][bound][calendar][start_time] = (
+ end_time[0:5].replace(':', ''), headway_secs)
# parse stop seq
with open('gtfs-en/stop_times.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, timepoint] in reader:
+ for [
+ trip_id,
+ arrival_time,
+ departure_time,
+ stop_id,
+ stop_sequence,
+ pickup_type,
+ drop_off_type,
+ timepoint] in reader:
[route_id, bound, service_id, tmp] = trip_id.split('-')
if bound not in routeList[route_id]['stops']:
routeList[route_id]['stops'][bound] = {}
@@ -85,62 +102,78 @@ async def parseGtfs():
with open('gtfs-en/fare_attributes.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [fare_id,price,currency_type,payment_method,transfers,agency_id] in reader:
+ for [
+ fare_id,
+ price,
+ currency_type,
+ payment_method,
+ transfers,
+ agency_id] in reader:
[route_id, bound, on, off] = fare_id.split('-')
if bound not in routeList[route_id]['fares']:
routeList[route_id]['fares'][bound] = {}
- if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(off):
- routeList[route_id]['fares'][bound][on] = ('0' if price == '0.0000' else price, int(off))
+ if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(
+ off):
+ routeList[route_id]['fares'][bound][on] = (
+ '0' if price == '0.0000' else price, int(off))
- for route_id in routeList.keys():
+ for route_id in routeList.keys():
for bound in routeList[route_id]['stops'].keys():
_tmp = list(routeList[route_id]['stops'][bound].items())
_tmp.sort(key=takeFirst)
- routeList[route_id]['stops'][bound] = [v for k,v in _tmp]
+ routeList[route_id]['stops'][bound] = [v for k, v in _tmp]
for bound in routeList[route_id]['fares'].keys():
_tmp = list(routeList[route_id]['fares'][bound].items())
_tmp.sort(key=takeFirst)
- routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp]
+ routeList[route_id]['fares'][bound] = [v[0] for k, v in _tmp]
+
+ nameReg = re.compile('\\[(.*)\\] (.*)')
- nameReg = re.compile('\[(.*)\] (.*)')
def parseStopName(name):
ret = {}
for str in name.split('|'):
matches = nameReg.findall(str)
- if len(matches) == 0: return { "unknown": str }
+ if len(matches) == 0:
+ return {"unknown": str}
for co, gtfsName in matches:
            x, y = co.split('+'), gtfsName.split('/')
for i in range(len(x)):
ret[x[i].lower().replace('lwb', 'kmb')] = y[i if i < len(y) else 0]
return ret
-
with open('gtfs-en/stops.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
- for [stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,stop_timezone] in reader:
+ for [
+ stop_id,
+ stop_name,
+ stop_lat,
+ stop_lon,
+ zone_id,
+ location_type,
+ stop_timezone] in reader:
stopList[stop_id] = {
- 'stopId': stop_id,
- 'stopName': parseStopName(stop_name),
- 'lat': float(stop_lat),
- 'lng': float(stop_lon)
+ 'stopId': stop_id,
+ 'stopName': parseStopName(stop_name),
+ 'lat': float(stop_lat),
+ 'lng': float(stop_lon)
}
with open('gtfs-en/calendar.txt', 'r', encoding='UTF-8') as csvfile:
reader = csv.reader(csvfile)
headers = next(reader, None)
for line in reader:
- [service_id,mon,tue,wed,thur,fri,sat,sun, start_date, end_date] = line
+ [service_id, mon, tue, wed, thur, fri, sat, sun, start_date, end_date] = line
serviceDayMap[service_id] = [sun, mon, tue, wed, thur, fri, sat]
with open('gtfs-en.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps({
- 'routeList': routeList,
- 'stopList': stopList,
- "serviceDayMap": serviceDayMap,
+ 'routeList': routeList,
+ 'stopList': stopList,
+ "serviceDayMap": serviceDayMap,
}, ensure_ascii=False, indent=2))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/parseHoliday.py b/crawling/parseHoliday.py
index 3f753ca2..c02c62ca 100644
--- a/crawling/parseHoliday.py
+++ b/crawling/parseHoliday.py
@@ -7,6 +7,7 @@
logger = logging.getLogger(__name__)
+
async def main():
if not path.isfile('holiday.json'):
async with httpx.AsyncClient() as a_client:
@@ -19,6 +20,6 @@ async def main():
else:
logger.info('holiday.json already exist, download skipped')
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/crawling/parseJourneyTime.py b/crawling/parseJourneyTime.py
index c8651570..78248014 100644
--- a/crawling/parseJourneyTime.py
+++ b/crawling/parseJourneyTime.py
@@ -8,6 +8,7 @@
from crawl_utils import emitRequest, store_version
from datetime import datetime
+
async def parseJourneyTime():
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
if not path.isfile('ROUTE_BUS.xml'):
@@ -19,20 +20,20 @@ async def parseJourneyTime():
routeTimeList = {}
tree = ET.parse('ROUTE_BUS.xml')
root = tree.getroot()
- version = datetime.fromisoformat(root.attrib["generated"]+"+08:00")
+ version = datetime.fromisoformat(root.attrib["generated"] + "+08:00")
store_version('routes-fares-xml/ROUTE_BUS', version.isoformat())
for route in root.iter('ROUTE'):
if route.find('ROUTE_TYPE').text == '1':
routeTimeList[route.find('ROUTE_ID').text] = {
- 'co': route.find('COMPANY_CODE').text.replace('LWB', 'KMB').lower().split('+'),
- 'route': route.find('ROUTE_NAMEC').text,
- 'journeyTime': route.find('JOURNEY_TIME').text,
+ 'co': route.find('COMPANY_CODE').text.replace('LWB', 'KMB').lower().split('+'),
+ 'route': route.find('ROUTE_NAMEC').text,
+ 'journeyTime': route.find('JOURNEY_TIME').text,
}
with open('routeTime.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(routeTimeList, ensure_ascii=False))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
diff --git a/crawling/routeCompare.py b/crawling/routeCompare.py
index fe13134f..9eedc7dd 100644
--- a/crawling/routeCompare.py
+++ b/crawling/routeCompare.py
@@ -12,6 +12,7 @@
from crawl_utils import emitRequest
+
async def routeCompare():
a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
r = await emitRequest("https://data.hkbus.app/routeFareList.min.json", a_client)
@@ -22,10 +23,13 @@ async def routeCompare():
os.makedirs("route-ts", exist_ok=True)
def isRouteEqual(a, b):
- return xxhash.xxh3_64(str(a)).hexdigest() == xxhash.xxh3_64(str(b)).hexdigest()
+ return xxhash.xxh3_64(
+ str(a)).hexdigest() == xxhash.xxh3_64(
+ str(b)).hexdigest()
for newKey in newDb['routeList']:
- if newKey not in oldDb['routeList'] or not isRouteEqual(oldDb['routeList'][newKey], newDb['routeList'][newKey]):
+ if newKey not in oldDb['routeList'] or not isRouteEqual(
+ oldDb['routeList'][newKey], newDb['routeList'][newKey]):
filename = re.sub(r'[\\\/\:\*\?\"\<\>\|]', '', newKey).upper()
with open(os.path.join("route-ts", filename), "w", encoding='utf-8') as f:
f.write(str(int(time.time())))
@@ -36,8 +40,8 @@ def isRouteEqual(a, b):
with open(os.path.join("route-ts", filename), "w", encoding='utf-8') as f:
f.write(str(int(time.time())))
-if __name__=='__main__':
+if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.getLogger('httpx').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
- asyncio.run(routeCompare())
\ No newline at end of file
+ asyncio.run(routeCompare())
diff --git a/crawling/sunferry.py b/crawling/sunferry.py
index b4749502..ec36cf34 100644
--- a/crawling/sunferry.py
+++ b/crawling/sunferry.py
@@ -11,22 +11,22 @@
gtfsStops = gtfs["stopList"]
routes = {
- "CECC": ["Central", "Cheung Chau"],
- "CCCE": ["Cheung Chau", "Central"],
- "CEMW": ["Central", "Mui Wo"],
- "MWCE": ["Mui Wo", "Central"],
- "NPHH": ["North Point", "Hung Hom"],
- "HHNP": ["Hung Hom", "North Point"],
- "NPKC": ["North Point", "Kowloon City"],
- "KCNP": ["Kowloon City", "North Point"],
- "IIPECMUW": ["Peng Chau", "Mui Wo"],
- "IIMUWPEC": ["Mui Wo", "Peng Chau"],
- "IIMUWCMW": ["Mui Wo", "Chi Ma Wan"],
- "IICMWMUW": ["Chi Ma Wan", "Mui Wo"],
- "IICMWCHC": ["Chi Ma Wan", "Cheung Chau"],
- "IICHCCMW": ["Cheung Chau", "Chi Ma Wan"],
- "IICHCMUW": ["Cheung Chau", "Mui Wo"],
- "IIMUWCHC": ["Mui Wo", "Cheung Chau "],
+ "CECC": ["Central", "Cheung Chau"],
+ "CCCE": ["Cheung Chau", "Central"],
+ "CEMW": ["Central", "Mui Wo"],
+ "MWCE": ["Mui Wo", "Central"],
+ "NPHH": ["North Point", "Hung Hom"],
+ "HHNP": ["Hung Hom", "North Point"],
+ "NPKC": ["North Point", "Kowloon City"],
+ "KCNP": ["Kowloon City", "North Point"],
+ "IIPECMUW": ["Peng Chau", "Mui Wo"],
+ "IIMUWPEC": ["Mui Wo", "Peng Chau"],
+ "IIMUWCMW": ["Mui Wo", "Chi Ma Wan"],
+ "IICMWMUW": ["Chi Ma Wan", "Mui Wo"],
+ "IICMWCHC": ["Chi Ma Wan", "Cheung Chau"],
+ "IICHCCMW": ["Cheung Chau", "Chi Ma Wan"],
+ "IICHCMUW": ["Cheung Chau", "Mui Wo"],
+ "IIMUWCHC": ["Mui Wo", "Cheung Chau "],
}
routeList = []
@@ -35,47 +35,48 @@
for [route_code, [orig, dest]] in routes.items():
for route_id, gtfsRoute in gtfsRoutes.items():
if "ferry" in gtfsRoute["co"]:
- if orig.lower() == gtfsRoute["orig"]["en"].lower() and dest.lower() == gtfsRoute["dest"]["en"].lower():
+ if orig.lower() == gtfsRoute["orig"]["en"].lower(
+ ) and dest.lower() == gtfsRoute["dest"]["en"].lower():
routeList.append({
- "gtfsId": route_id,
- "route": route_code,
- "orig_tc": gtfsZh["routeList"][route_id]["orig"]["zh"],
- "orig_en": gtfsRoute["orig"]["en"],
- "dest_tc": gtfsZh["routeList"][route_id]["dest"]["zh"],
- "dest_en": gtfsRoute["dest"]["en"],
- "service_type": 1,
- "bound": "O",
- "stops": gtfsRoute["stops"]["1"],
- "freq": gtfsRoute["freq"]["1"],
+ "gtfsId": route_id,
+ "route": route_code,
+ "orig_tc": gtfsZh["routeList"][route_id]["orig"]["zh"],
+ "orig_en": gtfsRoute["orig"]["en"],
+ "dest_tc": gtfsZh["routeList"][route_id]["dest"]["zh"],
+ "dest_en": gtfsRoute["dest"]["en"],
+ "service_type": 1,
+ "bound": "O",
+ "stops": gtfsRoute["stops"]["1"],
+ "freq": gtfsRoute["freq"]["1"],
})
elif dest.lower() == gtfsRoute["orig"]["en"].lower() and orig.lower() == gtfsRoute["dest"]["en"].lower():
routeList.append({
- "gtfsId": route_id,
- "route": route_code,
- "dest_tc": gtfsZh["routeList"][route_id]["orig"]["zh"],
- "dest_en": gtfsRoute["orig"]["en"],
- "orig_tc": gtfsZh["routeList"][route_id]["dest"]["zh"],
- "orig_en": gtfsRoute["dest"]["en"],
- "service_type": 1,
- "bound": "I",
- "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
- "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
+ "gtfsId": route_id,
+ "route": route_code,
+ "dest_tc": gtfsZh["routeList"][route_id]["orig"]["zh"],
+ "dest_en": gtfsRoute["orig"]["en"],
+ "orig_tc": gtfsZh["routeList"][route_id]["dest"]["zh"],
+ "orig_en": gtfsRoute["dest"]["en"],
+ "service_type": 1,
+ "bound": "I",
+ "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
+ "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
})
for route in routeList:
for stopId in route["stops"]:
stopList[stopId] = {
- "stop": stopId,
- "name_en": gtfsStops[stopId]["stopName"]["unknown"],
- "name_tc": gtfsZh["stopList"][stopId]["stopName"]["unknown"],
- "lat": gtfsStops[stopId]["lat"],
- "long": gtfsStops[stopId]["lng"],
+ "stop": stopId,
+ "name_en": gtfsStops[stopId]["stopName"]["unknown"],
+ "name_tc": gtfsZh["stopList"][stopId]["stopName"]["unknown"],
+ "lat": gtfsStops[stopId]["lat"],
+ "long": gtfsStops[stopId]["lng"],
}
-with open('routeList.sunferry.json', 'w', encoding='UTF-8' ) as f:
+with open('routeList.sunferry.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(routeList, ensure_ascii=False))
-with open('stopList.sunferry.json', 'w', encoding='UTF-8' ) as f:
+with open('stopList.sunferry.json', 'w', encoding='UTF-8') as f:
f.write(json.dumps(stopList, ensure_ascii=False))
diff --git a/crawling/test.py b/crawling/test.py
index 862396a5..4f5732e3 100644
--- a/crawling/test.py
+++ b/crawling/test.py
@@ -1,7 +1,7 @@
import json
import requests
-with open( 'routeFareList.json' ) as f:
+with open('routeFareList.json') as f:
newDb = json.load(f)
r = requests.get('https://hkbus.github.io/hk-bus-crawling/routeFareList.json')
@@ -9,8 +9,8 @@
for newKey in newDb['routeList']:
if newKey not in oldDb['routeList']:
- print ('new '+newKey)
+ print('new ' + newKey)
for oldKey in oldDb['routeList']:
if oldKey not in newDb['routeList']:
- print ('old '+oldKey)
\ No newline at end of file
+ print('old ' + oldKey)
diff --git a/hk_bus_eta/__init__.py b/hk_bus_eta/__init__.py
index 16c3abce..4ad91b27 100644
--- a/hk_bus_eta/__init__.py
+++ b/hk_bus_eta/__init__.py
@@ -1 +1 @@
-from .eta import HKEta
\ No newline at end of file
+from .eta import HKEta
diff --git a/hk_bus_eta/eta.py b/hk_bus_eta/eta.py
index 284fb204..62e2a798 100644
--- a/hk_bus_eta/eta.py
+++ b/hk_bus_eta/eta.py
@@ -15,132 +15,147 @@
import re
import hashlib
+
def get_platform_display(plat, lang):
- number = int(plat) if isinstance(plat, str) else plat
- if number < 0 or number > 20:
- return ("Platform {}" if lang == "en" else "{}號月台").format(number)
- if number == 0:
- return "⓿"
- if number > 10:
- return chr(9451 + (number - 11))
- return chr(10102 + (number - 1))
+ number = int(plat) if isinstance(plat, str) else plat
+ if number < 0 or number > 20:
+ return ("Platform {}" if lang == "en" else "{}號月台").format(number)
+ if number == 0:
+ return "⓿"
+ if number > 10:
+ return chr(9451 + (number - 11))
+ return chr(10102 + (number - 1))
+
class HKEta:
holidays = None
route_list = None
stop_list = None
stop_map = None
-
+
def __init__(self):
- md5 = requests.get("https://hkbus.github.io/hk-bus-crawling/routeFareList.md5").text
- r = requests.get("https://hkbus.github.io/hk-bus-crawling/routeFareList.min.json")
+ md5 = requests.get(
+ "https://hkbus.github.io/hk-bus-crawling/routeFareList.md5").text
+ r = requests.get(
+ "https://hkbus.github.io/hk-bus-crawling/routeFareList.min.json")
m = hashlib.md5()
m.update(r.text.encode('utf-8'))
if md5 != m.hexdigest():
raise Exception("Error in accessing hk-eta-db, md5sum not match")
db = r.json()
- self.holidays, self.route_list, self.stop_list, self.stop_map = db["holidays"], db["routeList"], db["stopList"], db["stopMap"]
-
+ self.holidays, self.route_list, self.stop_list, self.stop_map = db[
+ "holidays"], db["routeList"], db["stopList"], db["stopMap"]
# 0-indexed seq
- def getEtas( self, route_id, seq, language ):
- routeEntry = self.route_list[route_id]
+ def getEtas(self, route_id, seq, language):
+ routeEntry = self.route_list[route_id]
route, stops, bound = routeEntry['route'], routeEntry['stops'], routeEntry['bound']
- dest, service_type, co, nlb_id, gtfs_id = routeEntry['dest'], routeEntry['serviceType'], routeEntry['co'], routeEntry["nlbId"], routeEntry['gtfsId']
+ dest, service_type, co, nlb_id, gtfs_id = routeEntry['dest'], routeEntry[
+ 'serviceType'], routeEntry['co'], routeEntry["nlbId"], routeEntry['gtfsId']
_etas = []
for company_id in co:
if company_id == "kmb" and "kmb" in stops:
_etas.extend(self.kmb(
- route=route,
- stop_id=stops["kmb"][seq],
- bound=bound["kmb"],
- seq=seq, co = co,
- service_type = service_type
+ route=route,
+ stop_id=stops["kmb"][seq],
+ bound=bound["kmb"],
+ seq=seq, co=co,
+ service_type=service_type
))
elif company_id == "ctb" and "ctb" in stops:
_etas.extend(self.ctb(
- stop_id=stops['ctb'][seq], route=route, bound=bound['ctb'], seq=seq
+ stop_id=stops['ctb'][seq], route=route, bound=bound['ctb'], seq=seq
))
elif company_id == "nlb" and "nlb" in stops:
_etas.extend(self.nlb(
- stop_id=stops['nlb'][seq], nlb_id=nlb_id
+ stop_id=stops['nlb'][seq], nlb_id=nlb_id
))
elif company_id == "lrtfeeder" and "lrtfeeder" in stops:
_etas.extend(self.lrtfeeder(
- stop_id=stops['lrtfeeder'][seq], route=route, language=language
+ stop_id=stops['lrtfeeder'][seq], route=route, language=language
))
elif company_id == "mtr" and "mtr" in stops:
_etas.extend(self.mtr(
- stop_id=stops['mtr'][seq], route=route, bound=bound["mtr"]
+ stop_id=stops['mtr'][seq], route=route, bound=bound["mtr"]
))
elif company_id == "lightRail" and "lightRail" in stops:
_etas.extend(self.lightrail(
- stop_id=stops['lightRail'][seq], route=route, dest=dest
+ stop_id=stops['lightRail'][seq], route=route, dest=dest
))
elif company_id == "gmb" and "gmb" in stops:
- _etas.extend(self.gmb(
- stop_id=stops["gmb"][seq], gtfs_id=gtfs_id, seq=seq, bound=bound["gmb"]
- ))
+ _etas.extend(
+ self.gmb(
+ stop_id=stops["gmb"][seq],
+ gtfs_id=gtfs_id,
+ seq=seq,
+ bound=bound["gmb"]))
return _etas
-
- def kmb(self, stop_id, route, seq, service_type, co, bound ):
- data = requests.get("https://data.etabus.gov.hk/v1/transport/kmb/eta/{}/{}/{}".format(stop_id, route, service_type)).json()['data']
+
+ def kmb(self, stop_id, route, seq, service_type, co, bound):
+ data = requests.get(
+ "https://data.etabus.gov.hk/v1/transport/kmb/eta/{}/{}/{}".format(
+ stop_id, route, service_type)).json()['data']
data = list(filter(lambda e: 'eta' in e and e['dir'] == bound, data))
data.sort(key=lambda e: abs(seq - e['seq']))
data = [e for e in data if e['seq'] == data[0]['seq']]
- data = list(filter(lambda e: len(co) > 1 or service_type == e['service_type'] or e['seq'] == seq + 1,data))
+ data = list(filter(lambda e: len(co) > 1 or service_type ==
+ e['service_type'] or e['seq'] == seq + 1, data))
return [{
- "eta": e['eta'],
- "remark": {
- "zh": e['rmk_tc'],
- "en": e['rmk_en']
- },
- "co": "kmb"
+ "eta": e['eta'],
+ "remark": {
+ "zh": e['rmk_tc'],
+ "en": e['rmk_en']
+ },
+ "co": "kmb"
} for e in data]
-
+
def ctb(self, stop_id, route, bound, seq):
- data = requests.get("https://rt.data.gov.hk/v2/transport/citybus/eta/CTB/{}/{}".format(stop_id, route)).json()['data']
+ data = requests.get(
+ "https://rt.data.gov.hk/v2/transport/citybus/eta/CTB/{}/{}".format(
+ stop_id, route)).json()['data']
data = list(filter(lambda e: 'eta' in e and e['dir'] in bound, data))
data.sort(key=lambda e: abs(seq - e['seq']))
data = [e for e in data if e['seq'] == data[0]['seq']]
return [{
- "eta": e['eta'],
- "remark": {
- "zh": e['rmk_tc'],
- "en": e['rmk_en']
- },
- "co": "ctb"
+ "eta": e['eta'],
+ "remark": {
+ "zh": e['rmk_tc'],
+ "en": e['rmk_en']
+ },
+ "co": "ctb"
}for e in data]
def nlb(self, stop_id, nlb_id):
try:
- data = requests.post("https://rt.data.gov.hk/v1/transport/nlb/stop.php?action=estimatedArrivals", json={
- "routeId": nlb_id,
- "stopId": stop_id,
- "language": "zh"
- }, headers={
- "Content-Type": "text/plain"
- }).json()["estimatedArrivals"]
+ data = requests.post(
+ "https://rt.data.gov.hk/v1/transport/nlb/stop.php?action=estimatedArrivals",
+ json={
+ "routeId": nlb_id,
+ "stopId": stop_id,
+ "language": "zh"},
+ headers={
+ "Content-Type": "text/plain"}).json()["estimatedArrivals"]
data = list(filter(lambda e: 'estimatedArrivalTime' in e, data))
return [{
- "eta": e['estimatedArrivalTime'].replace(' ', 'T') + ".000+08:00",
- "remark": {
- "zh": "",
- "en": ""
- },
- "co": "nlb"
+ "eta": e['estimatedArrivalTime'].replace(' ', 'T') + ".000+08:00",
+ "remark": {
+ "zh": "",
+ "en": ""
+ },
+ "co": "nlb"
} for e in data]
except Exception as e:
return []
-
+
def lrtfeeder(self, stop_id, route, language):
- data = requests.post("https://rt.data.gov.hk/v1/transport/mtr/bus/getSchedule", json={
- "language": language,
- "routeName": route
- }, headers={
- "Content-Type": "application/json"
- }).json()['busStop']
+ data = requests.post(
+ "https://rt.data.gov.hk/v1/transport/mtr/bus/getSchedule",
+ json={
+ "language": language,
+ "routeName": route},
+ headers={
+ "Content-Type": "application/json"}).json()['busStop']
data = list(filter(lambda e: e["busStopId"] == stop_id, data))
ret = []
for buses in data:
@@ -150,75 +165,88 @@ def lrtfeeder(self, stop_id, route, language):
remark = bus["busRemark"]
elif bus["isScheduled"] == 1:
remark = "Scheduled" if language == "en" else "預定班次"
- delta_second = int(bus["departureTimeInSecond"] if bus['arrivalTimeInSecond'] == "108000" else bus["arrivalTimeInSecond"])
- dt = datetime.fromtimestamp(time.time() + delta_second + 8 * 3600 )
-
+ delta_second = int(bus["departureTimeInSecond"] if bus['arrivalTimeInSecond']
+ == "108000" else bus["arrivalTimeInSecond"])
+ dt = datetime.fromtimestamp(time.time() + delta_second + 8 * 3600)
+
ret.append({
- "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"),
- "remark": {
- language: remark
- },
- "co": "lrtfeeder"
+ "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"),
+ "remark": {
+ language: remark
+ },
+ "co": "lrtfeeder"
})
return ret
-
+
def mtr(self, stop_id, route, bound):
- res = requests.get("https://rt.data.gov.hk/v1/transport/mtr/getSchedule.php?line={}&sta={}".format(route, stop_id)).json()
+ res = requests.get(
+ "https://rt.data.gov.hk/v1/transport/mtr/getSchedule.php?line={}&sta={}".format(
+ route, stop_id)).json()
data, status = res["data"], res["status"]
-
+
if status == 0:
return []
ret = []
- for e in data["{}-{}".format(route, stop_id)]["UP" if bound[-2:1] == "U" else "DOWN"]:
+ for e in data["{}-{}".format(route, stop_id)
+ ]["UP" if bound[-2:1] == "U" else "DOWN"]:
ret.append({
- "eta": e["time"].replace(" ", "T") + "+08:00",
- "remark": {
- "zh": get_platform_display(e["plat"], "zh"),
- "en": get_platform_display(e["plat"], "en")
- },
- "co": "mtr"
+ "eta": e["time"].replace(" ", "T") + "+08:00",
+ "remark": {
+ "zh": get_platform_display(e["plat"], "zh"),
+ "en": get_platform_display(e["plat"], "en")
+ },
+ "co": "mtr"
})
return ret
-
+
def lightrail(self, stop_id, route, dest):
- platform_list = requests.get("https://rt.data.gov.hk/v1/transport/mtr/lrt/getSchedule?station_id={}".format(stop_id[2:])).json()["platform_list"]
+ platform_list = requests.get(
+ "https://rt.data.gov.hk/v1/transport/mtr/lrt/getSchedule?station_id={}".format(stop_id[2:])).json()["platform_list"]
ret = []
for platform in platform_list:
route_list, platform_id = platform["route_list"], platform["platform_id"]
for e in route_list:
- route_no, dest_ch, dest_en, stop, time_en = e["route_no"], e["dest_ch"], e["dest_en"], e["stop"], e["time_en"]
- if route == route_no and ( dest_ch == dest["zh"] or "Circular" in dest_en ) and stop == 0:
+ route_no, dest_ch, dest_en, stop, time_en = e["route_no"], e[
+ "dest_ch"], e["dest_en"], e["stop"], e["time_en"]
+ if route == route_no and (
+ dest_ch == dest["zh"] or "Circular" in dest_en) and stop == 0:
waitTime = 0
if time_en.lower() == "arriving" or time_en.lower() == "departing" or time_en == "-":
waitTime = 0
else:
waitTime = int(re.search(r'\d+', time_en).group())
- dt = datetime.fromtimestamp(time.time() + waitTime + 8 * 3600 )
+ dt = datetime.fromtimestamp(time.time() + waitTime + 8 * 3600)
ret.append({
- "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"),
- "remark": {
- "zh": get_platform_display(platform_id, "zh"),
- "en": get_platform_display(platform_id, "en")
- },
- "co": "lightrail"
+ "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"),
+ "remark": {
+ "zh": get_platform_display(platform_id, "zh"),
+ "en": get_platform_display(platform_id, "en")
+ },
+ "co": "lightrail"
})
return ret
def gmb(self, gtfs_id, stop_id, bound, seq):
- data = requests.get("https://data.etagmb.gov.hk/eta/route-stop/{}/{}".format(gtfs_id, stop_id)).json()["data"]
- data = list(filter(lambda e: (e['route_seq'] == 1 and bound == "O") or (e['route_seq'] == 2 and bound == "I"), data))
+ data = requests.get(
+ "https://data.etagmb.gov.hk/eta/route-stop/{}/{}".format(gtfs_id, stop_id)).json()["data"]
+ data = list(
+ filter(
+ lambda e: (
+ e['route_seq'] == 1 and bound == "O") or (
+ e['route_seq'] == 2 and bound == "I"),
+ data))
data = list(filter(lambda e: e["stop_seq"] == seq + 1, data))
ret = []
for e in data:
etas = e["eta"]
for eta in etas:
ret.append({
- "eta": eta["timestamp"],
- "remark": {
- "zh": eta["remarks_tc"],
- "en": eta["remarks_en"],
- },
- "co": "gmb"
+ "eta": eta["timestamp"],
+ "remark": {
+ "zh": eta["remarks_tc"],
+ "en": eta["remarks_en"],
+ },
+ "co": "gmb"
})
return ret
@@ -226,4 +254,4 @@ def gmb(self, gtfs_id, stop_id, bound, seq):
if __name__ == "__main__":
hketa = HKEta()
route_ids = list(hketa.route_list.keys())
- print(route_ids[0:10])
\ No newline at end of file
+ print(route_ids[0:10])
diff --git a/setup.py b/setup.py
index 7fd697de..df12890f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
here = os.path.abspath(os.path.dirname(__file__))
with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh:
- long_description = "\n" + fh.read()
+ long_description = "\n" + fh.read()
VERSION = '2.1.5'
DESCRIPTION = 'Query the ETA (Estimated Time of Arrival) of HK Bus/Minibus/MTR/Lightrail'
@@ -21,7 +21,17 @@
long_description=long_description,
packages=find_packages(),
install_requires=['requests'],
- keywords=['python', 'hongkong', 'eta', 'estimated time of arrival', 'kmb', 'nlb', 'mtr', 'ctb', 'minibus', 'lightrail'],
+ keywords=[
+ 'python',
+ 'hongkong',
+ 'eta',
+ 'estimated time of arrival',
+ 'kmb',
+ 'nlb',
+ 'mtr',
+ 'ctb',
+ 'minibus',
+ 'lightrail'],
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
@@ -30,5 +40,4 @@
"Operating System :: Unix",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
- ]
-)
\ No newline at end of file
+ ])
diff --git a/tools/normalize_json.py b/tools/normalize_json.py
index 634889bc..6bae7879 100644
--- a/tools/normalize_json.py
+++ b/tools/normalize_json.py
@@ -3,18 +3,18 @@
def main(route_fare_list_json: str):
- """
- Simple tool to normalize the routeFareList.json for easier comparison. The normalized JSON will be written to the same directory with `.norm` added.
- """
- normalized_json_name = f"{route_fare_list_json}.norm"
- with open(route_fare_list_json) as f:
- route_fare_list = json.load(f)
- route_fare_list['holidays'] = sorted(route_fare_list['holidays'])
+ """
+ Simple tool to normalize the routeFareList.json for easier comparison. The normalized JSON will be written to the same directory with `.norm` added.
+ """
+ normalized_json_name = f"{route_fare_list_json}.norm"
+ with open(route_fare_list_json) as f:
+ route_fare_list = json.load(f)
+ route_fare_list['holidays'] = sorted(route_fare_list['holidays'])
- with open(normalized_json_name, 'w') as f:
- json.dump(route_fare_list, f, sort_keys=True,
- indent=4, ensure_ascii=False)
+ with open(normalized_json_name, 'w') as f:
+ json.dump(route_fare_list, f, sort_keys=True,
+ indent=4, ensure_ascii=False)
if __name__ == '__main__':
- typer.run(main)
+ typer.run(main)