diff --git a/crawling/cleansing.py b/crawling/cleansing.py
index 4d2aad40..099b7e2a 100644
--- a/crawling/cleansing.py
+++ b/crawling/cleansing.py
@@ -1,26 +1,31 @@
 import json
 
+
 def isNameMatch(name_a, name_b):
     tmp_a = name_a.lower()
     tmp_b = name_b.lower()
     return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
 
+
 def countBus(freq):
-    if freq is None: return 0
+    if freq is None:
+        return 0
     sum = 0
     for entries in freq.values():
         for startTime, v in entries.items():
-            if v is None: 
+            if v is None:
                 sum += 1
                 continue
             endTime, waitTime = v
-            sum += int ( ( int(endTime[0:2]) - int(startTime[0:2]) ) * 60 + int(endTime[2:4]) - int(startTime[2:4]) ) / ( int(waitTime) / 60 )
+            sum += int((int(endTime[0:2]) - int(startTime[0:2])) * 60 + \
+                       int(endTime[2:4]) - int(startTime[2:4])) / (int(waitTime) / 60)
     return sum
 
+
 def cleansing(co):
     with open('routeFareList.%s.json' % co, 'r', encoding='UTF-8') as f:
         routeList = json.load(f)
-    
+
     for i in range(len(routeList)):
         route = routeList[i]
         route["co"] = [co for co in route["co"] if co != "ferry"]
@@ -28,11 +33,18 @@ def cleansing(co):
             continue
         bestIdx, maxBus = -1, 0
         for j in range(len(routeList)):
-            if i == j: continue
+            if i == j:
+                continue
             _route = routeList[j]
-            if route['route'] == _route['route'] and sorted(route['co']) == sorted(_route['co']) and \
-                isNameMatch(route['orig_en'], _route['orig_en']) and isNameMatch(route['dest_en'], _route['dest_en']):
-                if 'freq' not in _route: continue
+            if route['route'] == _route['route'] and sorted(
+                    route['co']) == sorted(
+                    _route['co']) and isNameMatch(
+                    route['orig_en'],
+                    _route['orig_en']) and isNameMatch(
+                    route['dest_en'],
+                    _route['dest_en']):
+                if 'freq' not in _route:
+                    continue
                 bus = countBus(_route['freq'])
                 if bus > maxBus:
                     bestIdx = j
@@ -42,19 +54,19 @@ def cleansing(co):
             routeList[i]['skip'] = True
 
     _routeList = [route for route in routeList if 'skip' not in route]
-    print (co, len(routeList), len(_routeList))
-    
+    print(co, len(routeList), len(_routeList))
+
     with open('routeFareList.%s.cleansed.json' % co, 'w', encoding='UTF-8') as f:
         f.write(json.dumps(_routeList, ensure_ascii=False))
-    
-
-cleansing ('kmb')
-cleansing ('ctb')
-cleansing ('nlb')
-cleansing ('lrtfeeder')
-cleansing ('gmb')
-cleansing ('lightRail')
-cleansing ('mtr')
-cleansing ('sunferry')
+
+
+cleansing('kmb')
+cleansing('ctb')
+cleansing('nlb')
+cleansing('lrtfeeder')
+cleansing('gmb')
+cleansing('lightRail')
+cleansing('mtr')
+cleansing('sunferry')
 cleansing('fortuneferry')
-cleansing ('hkkf')
\ No newline at end of file
+cleansing('hkkf')
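For reference, countBus turns a frequency table into an estimated departure count: each entry maps a start time "HHMM" to an end time plus a headway in seconds, so the count is the service span in minutes divided by the headway in minutes. A minimal sketch of the same arithmetic, with a made-up freq entry:

    # Hypothetical entry: one service-day pattern, 0700-0900 every 600 s (10 min).
    freq = {"1": {"0700": ("0900", "600")}}

    total = 0
    for entries in freq.values():
        for start, (end, wait) in entries.items():
            span_min = (int(end[:2]) - int(start[:2])) * 60 + int(end[2:4]) - int(start[2:4])
            total += span_min / (int(wait) / 60)
    print(total)  # 120 min / 10 min -> 12.0 departures
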
URL={url}") await asyncio.sleep(retry_timeout) - retry_timeout = min (retry_timeout * 2, RETRY_TIMEOUT_MAX) + retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX) else: r.raise_for_status() raise Exception(r.status_code, url) except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ReadError) as e: - logger.warning(f"Exception {repr(e)} occurred, wait {retry_timeout} and retry. URL={url}") + logger.warning( + f"Exception {repr(e)} occurred, wait {retry_timeout} and retry. URL={url}") await asyncio.sleep(retry_timeout) - retry_timeout = min (retry_timeout * 2, RETRY_TIMEOUT_MAX) + retry_timeout = min(retry_timeout * 2, RETRY_TIMEOUT_MAX) def get_request_limit(): default_limit = "10" return int(os.environ.get('REQUEST_LIMIT', default_limit)) + def store_version(key: str, version: str): logger.info(f"{key} version: {version}") # "0" is prepended in filename so that this file appears first in Github directory listing try: with open('0versions.json', 'r') as f: version_dict = json.load(f) - except: + except BaseException: version_dict = {} version_dict[key] = version version_dict = dict(sorted(version_dict.items())) with open('0versions.json', 'w', encoding='UTF-8') as f: - json.dump(version_dict, f, indent=4) \ No newline at end of file + json.dump(version_dict, f, indent=4) diff --git a/crawling/ctb.py b/crawling/ctb.py index ce194647..a76d4978 100644 --- a/crawling/ctb.py +++ b/crawling/ctb.py @@ -9,94 +9,96 @@ logger = logging.getLogger(__name__) + async def getRouteStop(co): - a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) - # define output name - ROUTE_LIST = 'routeList.'+co+'.json' - STOP_LIST = 'stopList.'+co+'.json' - - # load route list and stop list if exist - routeList = {} - if path.isfile(ROUTE_LIST): - return - else: - # load routes - r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/'+co, a_client) - routeList = r.json()['data'] - - _stops = [] - stopList = {} - if path.isfile(STOP_LIST): - with open(STOP_LIST, 'r', encoding='UTF-8') as f: - stopList = json.load(f) - - # function to load single stop info - req_stop_list_limit = asyncio.Semaphore(get_request_limit()) - async def getStop ( stopId ): - async with req_stop_list_limit: - r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/'+stopId, a_client) - return r.json()['data'] - - # function to async load multiple stops info - async def getStopList ( stops ): - ret = await asyncio.gather(*[getStop(stop) for stop in stops]) - return ret - - req_route_stop_limit = asyncio.Semaphore(get_request_limit()) - async def getRouteStop(param): - co, route = param - if route.get('bound', 0) != 0 or route.get('stops', {}): - return route - route['stops'] = {} - for direction in ['inbound', 'outbound']: - r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/'+co.upper()+'/'+route['route']+"/"+direction, a_client) - route['stops'][direction] = [stop['stop'] for stop in r.json()['data']] - return route - - async def getRouteStopList (): - ret = await asyncio.gather(*[getRouteStop((co, route)) - for route in routeList]) - return ret - - routeList = await getRouteStopList() - for route in routeList: - for direction, stops in route['stops'].items(): - for stopId in stops: - _stops.append(stopId) - - # load stops for this route aync - _stops = list(set(_stops)) - _stops.sort() - - stopInfos = list( zip ( _stops, await getStopList(_stops)) ) - for stopId, stopInfo in stopInfos: - stopList[stopId] = stopInfo - - _routeList = [] - for route in routeList: - if 
diff --git a/crawling/ctb.py b/crawling/ctb.py
index ce194647..a76d4978 100644
--- a/crawling/ctb.py
+++ b/crawling/ctb.py
@@ -9,94 +9,96 @@
 logger = logging.getLogger(__name__)
 
+
 async def getRouteStop(co):
-  a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
-  # define output name
-  ROUTE_LIST = 'routeList.'+co+'.json'
-  STOP_LIST = 'stopList.'+co+'.json'
-
-  # load route list and stop list if exist
-  routeList = {}
-  if path.isfile(ROUTE_LIST):
-    return
-  else:
-    # load routes
-    r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/'+co, a_client)
-    routeList = r.json()['data']
-
-  _stops = []
-  stopList = {}
-  if path.isfile(STOP_LIST):
-    with open(STOP_LIST, 'r', encoding='UTF-8') as f:
-      stopList = json.load(f)
-
-  # function to load single stop info
-  req_stop_list_limit = asyncio.Semaphore(get_request_limit())
-  async def getStop ( stopId ):
-    async with req_stop_list_limit:
-      r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/'+stopId, a_client)
-      return r.json()['data']
-
-  # function to async load multiple stops info
-  async def getStopList ( stops ):
-    ret = await asyncio.gather(*[getStop(stop) for stop in stops])
-    return ret
-
-  req_route_stop_limit = asyncio.Semaphore(get_request_limit())
-  async def getRouteStop(param):
-    co, route = param
-    if route.get('bound', 0) != 0 or route.get('stops', {}):
-      return route
-    route['stops'] = {}
-    for direction in ['inbound', 'outbound']:
-      r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/'+co.upper()+'/'+route['route']+"/"+direction, a_client)
-      route['stops'][direction] = [stop['stop'] for stop in r.json()['data']]
-    return route
-
-  async def getRouteStopList ():
-    ret = await asyncio.gather(*[getRouteStop((co, route))
-                 for route in routeList])
-    return ret
-
-  routeList = await getRouteStopList()
-  for route in routeList:
-    for direction, stops in route['stops'].items():
-      for stopId in stops:
-        _stops.append(stopId)
-
-  # load stops for this route aync
-  _stops = list(set(_stops))
-  _stops.sort()
-
-  stopInfos = list( zip ( _stops, await getStopList(_stops)) )
-  for stopId, stopInfo in stopInfos:
-    stopList[stopId] = stopInfo
-
-  _routeList = []
-  for route in routeList:
-    if route.get('bound', 0) != 0:
-      _routeList.append(route)
-      continue
-    for bound in ['inbound', 'outbound']:
-      if len(route['stops'][bound]) > 0:
-        _routeList.append({
-          'co': co,
-          'route': route['route'],
-          'bound': 'O' if bound == 'outbound' else 'I',
-          'orig_en': route['orig_en'] if bound == 'outbound' else route['dest_en'],
-          'orig_tc': route['orig_tc'] if bound == 'outbound' else route['dest_tc'],
-          'dest_en': route['dest_en'] if bound == 'outbound' else route['orig_en'],
-          'dest_tc': route['dest_tc'] if bound == 'outbound' else route['orig_tc'],
-          'stops': list(filter(lambda stopId: bool(stopList[stopId]), route['stops'][bound])),
-          'serviceType': 0
-        })
-
-  with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
-    f.write(json.dumps(_routeList, ensure_ascii=False))
-  with open(STOP_LIST, 'w', encoding='UTF-8') as f:
-    f.write(json.dumps(stopList, ensure_ascii=False))
-
-if __name__=='__main__':
-  logging.basicConfig(level=logging.INFO)
-  logging.getLogger('httpx').setLevel(logging.WARNING)
-  asyncio.run(getRouteStop('ctb'))
+    a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
+    # define output name
+    ROUTE_LIST = 'routeList.' + co + '.json'
+    STOP_LIST = 'stopList.' + co + '.json'
+
+    # load route list and stop list if exist
+    routeList = {}
+    if path.isfile(ROUTE_LIST):
+        return
+    else:
+        # load routes
+        r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route/' + co, a_client)
+        routeList = r.json()['data']
+
+    _stops = []
+    stopList = {}
+    if path.isfile(STOP_LIST):
+        with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+            stopList = json.load(f)
+
+    # function to load single stop info
+    req_stop_list_limit = asyncio.Semaphore(get_request_limit())
+
+    async def getStop(stopId):
+        async with req_stop_list_limit:
+            r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/stop/' + stopId, a_client)
+            return r.json()['data']
+
+    # function to async load multiple stops info
+    async def getStopList(stops):
+        ret = await asyncio.gather(*[getStop(stop) for stop in stops])
+        return ret
+
+    req_route_stop_limit = asyncio.Semaphore(get_request_limit())
+
+    async def getRouteStop(param):
+        co, route = param
+        if route.get('bound', 0) != 0 or route.get('stops', {}):
+            return route
+        route['stops'] = {}
+        for direction in ['inbound', 'outbound']:
+            r = await emitRequest('https://rt.data.gov.hk/v2/transport/citybus/route-stop/' + co.upper() + '/' + route['route'] + "/" + direction, a_client)
+            route['stops'][direction] = [stop['stop'] for stop in r.json()['data']]
+        return route
+
+    async def getRouteStopList():
+        ret = await asyncio.gather(*[getRouteStop((co, route))
+                                     for route in routeList])
+        return ret
+
+    routeList = await getRouteStopList()
+    for route in routeList:
+        for direction, stops in route['stops'].items():
+            for stopId in stops:
+                _stops.append(stopId)
+
+    # load stops for this route aync
+    _stops = sorted(set(_stops))
+
+    stopInfos = list(zip(_stops, await getStopList(_stops)))
+    for stopId, stopInfo in stopInfos:
+        stopList[stopId] = stopInfo
+
+    _routeList = []
+    for route in routeList:
+        if route.get('bound', 0) != 0:
+            _routeList.append(route)
+            continue
+        for bound in ['inbound', 'outbound']:
+            if len(route['stops'][bound]) > 0:
+                _routeList.append({
+                    'co': co,
+                    'route': route['route'],
+                    'bound': 'O' if bound == 'outbound' else 'I',
+                    'orig_en': route['orig_en'] if bound == 'outbound' else route['dest_en'],
+                    'orig_tc': route['orig_tc'] if bound == 'outbound' else route['dest_tc'],
+                    'dest_en': route['dest_en'] if bound == 'outbound' else route['orig_en'],
+                    'dest_tc': route['dest_tc'] if bound == 'outbound' else route['orig_tc'],
+                    'stops': list(filter(lambda stopId: bool(stopList[stopId]), route['stops'][bound])),
+                    'serviceType': 0
+                })
+
+    with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
+        f.write(json.dumps(_routeList, ensure_ascii=False))
+    with open(STOP_LIST, 'w', encoding='UTF-8') as f:
+        f.write(json.dumps(stopList, ensure_ascii=False))
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('httpx').setLevel(logging.WARNING)
+    asyncio.run(getRouteStop('ctb'))
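ctb.py throttles its fan-out with asyncio.Semaphore objects sized by get_request_limit(), so asyncio.gather can schedule every stop fetch at once while only REQUEST_LIMIT requests are actually in flight. A self-contained sketch of that pattern (fetch_one is a stand-in for the HTTP call):

    import asyncio

    async def fetch_all(ids, limit=10):
        sem = asyncio.Semaphore(limit)  # at most `limit` concurrent fetches

        async def fetch_one(i):
            async with sem:
                await asyncio.sleep(0.01)  # stand-in for the HTTP round trip
                return i

        # gather schedules all tasks up front; the semaphore does the throttling
        return await asyncio.gather(*[fetch_one(i) for i in ids])

    print(asyncio.run(fetch_all(range(25), limit=5)))
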
diff --git a/crawling/fortuneferry.py b/crawling/fortuneferry.py
index 8a8d9822..188cd13b 100644
--- a/crawling/fortuneferry.py
+++ b/crawling/fortuneferry.py
@@ -11,11 +11,11 @@
     gtfsEn = json.load(f)
 
 routes = {
-  "7059": ["中環", "紅磡"],
-  "7021": ["北角", "啟德"],
-  "7056": ["北角", "觀塘"],
-  "7025": ["屯門", "大澳"],
-  "7000004": ["東涌", "大澳"],
+    "7059": ["中環", "紅磡"],
+    "7021": ["北角", "啟德"],
+    "7056": ["北角", "觀塘"],
+    "7025": ["屯門", "大澳"],
+    "7000004": ["東涌", "大澳"],
 }
 
 routeList = []
@@ -24,42 +24,43 @@
 for [route_code, [orig, dest]] in routes.items():
     for route_id, gtfsRoute in gtfsRoutes.items():
         if "ferry" in gtfsRoute["co"]:
-            if orig.lower() == gtfsRoute["orig"]["zh"].lower() and dest.lower() == gtfsRoute["dest"]["zh"].lower():
+            if orig.lower() == gtfsRoute["orig"]["zh"].lower(
+            ) and dest.lower() == gtfsRoute["dest"]["zh"].lower():
                 routeList.append({
-                  "gtfsId": route_id,
-                  "route": route_code,
-                  "orig_tc": gtfsRoute["orig"]["zh"],
-                  "orig_en": gtfsEn["routeList"][route_id]["orig"]["en"],
-                  "dest_tc": gtfsRoute["dest"]["zh"],
-                  "dest_en": gtfsEn["routeList"][route_id]["dest"]["en"],
-                  "service_type": 1,
-                  "bound": "O",
-                  "stops": gtfsRoute["stops"]["1"],
-                  "freq": gtfsRoute["freq"]["1"],
-                })
-                if "2" in gtfsRoute["freq"]:
-                  routeList.append({
                     "gtfsId": route_id,
                     "route": route_code,
-                    "dest_tc": gtfsRoute["orig"]["zh"],
-                    "dest_en": gtfsEn["routeList"][route_id]["orig"]["en"],
-                    "orig_tc": gtfsRoute["dest"]["zh"],
-                    "orig_en": gtfsEn["routeList"][route_id]["dest"]["en"],
+                    "orig_tc": gtfsRoute["orig"]["zh"],
+                    "orig_en": gtfsEn["routeList"][route_id]["orig"]["en"],
+                    "dest_tc": gtfsRoute["dest"]["zh"],
+                    "dest_en": gtfsEn["routeList"][route_id]["dest"]["en"],
                     "service_type": 1,
-                    "bound": "I",
-                    "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
-                    "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
+                    "bound": "O",
+                    "stops": gtfsRoute["stops"]["1"],
+                    "freq": gtfsRoute["freq"]["1"],
+                })
+                if "2" in gtfsRoute["freq"]:
+                    routeList.append({
+                        "gtfsId": route_id,
+                        "route": route_code,
+                        "dest_tc": gtfsRoute["orig"]["zh"],
+                        "dest_en": gtfsEn["routeList"][route_id]["orig"]["en"],
+                        "orig_tc": gtfsRoute["dest"]["zh"],
+                        "orig_en": gtfsEn["routeList"][route_id]["dest"]["en"],
+                        "service_type": 1,
+                        "bound": "I",
+                        "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1],
+                        "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {},
                     })
 
 for route in routeList:
     for stopId in route["stops"]:
         stopList[stopId] = {
-          "stop": stopId,
-          "name_en": gtfsEn["stopList"][stopId]["stopName"]["unknown"],
-          "name_tc": gtfsStops[stopId]["stopName"]["unknown"],
-          "lat": gtfsStops[stopId]["lat"],
-          "long": gtfsStops[stopId]["lng"],
+            "stop": stopId,
+            "name_en": gtfsEn["stopList"][stopId]["stopName"]["unknown"],
+            "name_tc": gtfsStops[stopId]["stopName"]["unknown"],
+            "lat": gtfsStops[stopId]["lat"],
+            "long": gtfsStops[stopId]["lng"],
         }
 
 with open('routeList.fortuneferry.json', 'w', encoding='UTF-8') as f:
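Worth noting in fortuneferry.py: when a GTFS route publishes a direction "2" timetable but no direction "2" stop list, the inbound stops are derived by reversing the outbound sequence. A small sketch of that fallback with invented stop IDs:

    gtfsRoute = {"stops": {"1": ["P1", "P2", "P3"]}, "freq": {"1": {}, "2": {}}}

    if "2" in gtfsRoute["freq"]:
        inbound = (gtfsRoute["stops"]["2"]
                   if "2" in gtfsRoute["stops"]
                   else gtfsRoute["stops"]["1"][::-1])  # reverse the outbound leg
        print(inbound)  # ['P3', 'P2', 'P1']
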
diff --git a/crawling/gmb.py b/crawling/gmb.py
index c5b79d14..bce4c084 100644
--- a/crawling/gmb.py
+++ b/crawling/gmb.py
@@ -10,6 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 async def getRouteStop(co):
     a_client = httpx.AsyncClient()
     # parse gtfs service_id
@@ -18,7 +19,14 @@ async def getRouteStop(co):
         reader = csv.reader(csvfile)
         headers = next(reader, None)
         for [service_id, mon, tue, wed, thur, fri, sat, sun, *tmp] in reader:
-            serviceIdMap[service_id] = [mon == "1", tue == "1", wed == "1", thur == "1", fri == "1", sat == "1", sun == "1"]
+            serviceIdMap[service_id] = [
+                mon == "1",
+                tue == "1",
+                wed == "1",
+                thur == "1",
+                fri == "1",
+                sat == "1",
+                sun == "1"]
     serviceIdMap["111"] = [True, True, False, True, True, True, True]
 
     def mapServiceId(weekdays, serviceIdMap_a):
@@ -31,12 +39,12 @@ def mapServiceId(weekdays, serviceIdMap_a):
     def getFreq(headways, serviceIdMap_a):
         freq = {}
         for headway in headways:
-            service_id = mapServiceId( headway['weekdays'] , serviceIdMap_a)
+            service_id = mapServiceId(headway['weekdays'], serviceIdMap_a)
             if service_id not in freq:
                 freq[service_id] = {}
             freq[service_id][headway['start_time'].replace(':', '')[:4]] = [
-              headway['end_time'].replace(':', '')[:4],
-              str(headway['frequency'] * 60)
+                headway['end_time'].replace(':', '')[:4],
+                str(headway['frequency'] * 60)
             ] if headway['frequency'] is not None else None
         return freq
 
@@ -48,7 +56,7 @@ def getFreq(headways, serviceIdMap_a):
     async def get_route_directions(route, route_no):
         service_type = 2
         for direction in route['directions']:
-            rs = await emitRequest('https://data.etagmb.gov.hk/route-stop/'+str(route['route_id'])+'/'+str(direction['route_seq']), a_client)
+            rs = await emitRequest('https://data.etagmb.gov.hk/route-stop/' + str(route['route_id']) + '/' + str(direction['route_seq']), a_client)
 
             for stop in rs.json()['data']['route_stops']:
                 stop_id = stop['stop_id']
@@ -118,41 +126,42 @@ async def get_route_directions(route, route_no):
                     "name_tc": useNameTc,
                 }
             routeList.append({
-              "gtfsId": str(route['route_id']),
-              "route": route_no,
-              "orig_tc": direction['orig_tc'],
-              "orig_en": direction['orig_en'],
-              "dest_tc": direction['dest_tc'],
-              "dest_en": direction['dest_en'],
-              "bound": 'O' if direction['route_seq'] == 1 else 'I',
-              "service_type": 1 if route["description_tc"] == '正常班次' else service_type,
-              "stops": [str(stop['stop_id']) for stop in rs.json()['data']['route_stops']],
-              "freq": getFreq(direction['headways'], serviceIdMap)
+                "gtfsId": str(route['route_id']),
+                "route": route_no,
+                "orig_tc": direction['orig_tc'],
+                "orig_en": direction['orig_en'],
+                "dest_tc": direction['dest_tc'],
+                "dest_en": direction['dest_en'],
+                "bound": 'O' if direction['route_seq'] == 1 else 'I',
+                "service_type": 1 if route["description_tc"] == '正常班次' else service_type,
+                "stops": [str(stop['stop_id']) for stop in rs.json()['data']['route_stops']],
+                "freq": getFreq(direction['headways'], serviceIdMap)
             })
-            #print(routeList)
+            # print(routeList)
             if route["description_tc"] != '正常班次':
                 service_type += 1
-    
+
     req_route_limit = asyncio.Semaphore(get_request_limit())
-    async def get_route(region:str, route_no):
+
+    async def get_route(region: str, route_no):
         async with req_route_limit:
-            r = await emitRequest('https://data.etagmb.gov.hk/route/'+region+'/'+route_no, a_client)
+            r = await emitRequest('https://data.etagmb.gov.hk/route/' + region + '/' + route_no, a_client)
         await asyncio.gather(*[get_route_directions(route, route_no) for route in r.json()['data']])
-    routeList.sort(key = lambda a: a['gtfsId'])
+    routeList.sort(key=lambda a: a['gtfsId'])
 
     req_route_region_limit = asyncio.Semaphore(get_request_limit())
+
     async def get_routes_region(region: str):
         async with req_route_region_limit:
-            r = await emitRequest('https://data.etagmb.gov.hk/route/'+region, a_client)
+            r = await emitRequest('https://data.etagmb.gov.hk/route/' + region, a_client)
         await asyncio.gather(*[get_route(region, route) for route in r.json()['data']['routes']])
-    
+
     await asyncio.gather(*[get_routes_region(r) for r in ['HKI', 'KLN', "NT"]])
 
     with open(f'routeList.{co}.json', 'w', encoding='UTF-8') as f:
         json.dump(routeList, f, ensure_ascii=False)
     logger.info("Route done")
 
-
     req_stops_limit = asyncio.Semaphore(get_request_limit())
     with open("gtfs.json", "r", encoding='UTF-8') as f:
         gtfs = json.load(f)
@@ -162,9 +171,11 @@ async def update_stop_loc(stop_id):
         if stop_id not in gtfsStops:
             logger.info(f"Getting stop {stop_id} from etagmb")
             async with req_stops_limit:
-                r = await emitRequest('https://data.etagmb.gov.hk/stop/'+str(stop_id), a_client)
-            stops[stop_id]['lat'] = r.json()['data']['coordinates']['wgs84']['latitude']
-            stops[stop_id]['long'] = r.json()['data']['coordinates']['wgs84']['longitude']
+                r = await emitRequest('https://data.etagmb.gov.hk/stop/' + str(stop_id), a_client)
+            stops[stop_id]['lat'] = r.json(
+            )['data']['coordinates']['wgs84']['latitude']
+            stops[stop_id]['long'] = r.json(
+            )['data']['coordinates']['wgs84']['longitude']
         else:
             logger.debug(f"Getting stop {stop_id} from gtfs")
             stops[stop_id]['lat'] = gtfsStops[stop_id]['lat']
@@ -173,7 +184,7 @@ async def update_stop_loc(stop_id):
 
     await asyncio.gather(*[update_stop_loc(stop_id) for stop_id in sorted(stops.keys())])
     with open(f'stopList.{co}.json', 'w', encoding='UTF-8') as f:
-        json.dump(stops,f, ensure_ascii=False)
+        json.dump(stops, f, ensure_ascii=False)
     for stop in stopCandidates:
         stopCandidates[stop]["tc_others"].discard(stopCandidates[stop]["tc_used"])
         stopCandidates[stop]["tc_others"] = sorted(
@@ -184,11 +195,11 @@ async def update_stop_loc(stop_id):
     with open(f'stopCandidates.{co}.json', 'w', encoding='UTF-8') as f:
         def set_default(obj):
             if isinstance(obj, set):
-              return list(obj)
+                return list(obj)
             raise TypeError
         json.dump(stopCandidates, f, ensure_ascii=False, default=set_default)
 
-if __name__=='__main__':
-  logging.basicConfig(level=logging.INFO)
-  logging.getLogger('httpx').setLevel(logging.WARNING)
-  asyncio.run(getRouteStop('gmb'))
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('httpx').setLevel(logging.WARNING)
+    asyncio.run(getRouteStop('gmb'))
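getFreq in gmb.py folds each ETA headway record into the crawler's freq shape: keyed by a service-day pattern, then by start time "HHMM", with the value [end time "HHMM", headway in seconds], or None when no frequency is published. A worked sketch with an invented headway record:

    headway = {"weekdays": [True] * 5 + [False, False],
               "start_time": "07:00:00", "end_time": "09:30:00", "frequency": 12}

    key = headway["start_time"].replace(":", "")[:4]      # "0700"
    value = ([headway["end_time"].replace(":", "")[:4],   # "0930"
              str(headway["frequency"] * 60)]             # minutes -> seconds
             if headway["frequency"] is not None else None)
    print({key: value})  # {'0700': ['0930', '720']}
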
diff --git a/crawling/hkkf.py b/crawling/hkkf.py
index 1ef45350..963e09bd 100644
--- a/crawling/hkkf.py
+++ b/crawling/hkkf.py
@@ -10,18 +10,20 @@
 from crawl_utils import emitRequest
 
 routes = {
-  "1": ["Central Pier 4", "Sok Kwu Wan"],
-  "2": ["Central Pier 4", "Yung Shue Wan"],
-  "3": ["Central Pier 6", "Peng Chau"],
-  "4": ["Peng Chau", "Hei Ling Chau"],
+    "1": ["Central Pier 4", "Sok Kwu Wan"],
+    "2": ["Central Pier 4", "Yung Shue Wan"],
+    "3": ["Central Pier 6", "Peng Chau"],
+    "4": ["Peng Chau", "Hei Ling Chau"],
 }
 
+
 def parseStop(name_en, apiStops):
     for stop in apiStops:
         if stop["name_en"].startswith(name_en):
             return stop
     raise Exception("Undefined stop")
 
+
 async def getRouteStop(co):
     a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
     routeList = []
@@ -30,11 +32,10 @@ async def getRouteStop(co):
     r = await emitRequest('https://www.hkkfeta.com/opendata/route/', a_client)
     apiRoutes = r.json()['data']
     apiStops = []
-    for stopId in [1,2,3,4,5,6]:
-        stop = (await emitRequest('https://www.hkkfeta.com/opendata/pier/'+str(stopId), a_client)).json()["data"]
+    for stopId in [1, 2, 3, 4, 5, 6]:
+        stop = (await emitRequest('https://www.hkkfeta.com/opendata/pier/' + str(stopId), a_client)).json()["data"]
         apiStops.append(stop)
 
-
     with open("gtfs.json", 'r', encoding="utf-8") as f:
         gtfsZh = json.load(f)
 
@@ -47,41 +48,41 @@ async def getRouteStop(co):
         orig = parseStop(routes[str(apiRoute["route_id"])][0], apiStops)
         dest = parseStop(routes[str(apiRoute["route_id"])][1], apiStops)
         routeList.append({
-          "route": "KF" + str(apiRoute["route_id"]),
-          "orig_tc": orig["name_tc"],
-          "orig_en": orig["name_en"],
-          "dest_tc": dest["name_tc"],
-          "dest_en": dest["name_en"],
-          "service_type": 1,
-          "bound": "O",
-          "stops": [
-            "KF" + str(orig["pier_id"]),
-            "KF" + str(dest["pier_id"]),
-          ],
-          "co": "hkkf",
+            "route": "KF" + str(apiRoute["route_id"]),
+            "orig_tc": orig["name_tc"],
+            "orig_en": orig["name_en"],
+            "dest_tc": dest["name_tc"],
+            "dest_en": dest["name_en"],
+            "service_type": 1,
+            "bound": "O",
+            "stops": [
+                "KF" + str(orig["pier_id"]),
+                "KF" + str(dest["pier_id"]),
+            ],
+            "co": "hkkf",
         })
         routeList.append({
-          "route": "KF" + str(apiRoute["route_id"]),
-          "orig_tc": dest["name_tc"],
-          "orig_en": dest["name_en"],
-          "dest_tc": orig["name_tc"],
-          "dest_en": orig["name_en"],
-          "service_type": 1,
-          "bound": "I",
-          "stops": [
-            "KF" + str(dest["pier_id"]),
-            "KF" + str(orig["pier_id"]),
-          ],
-          "co": "hkkf",
+            "route": "KF" + str(apiRoute["route_id"]),
+            "orig_tc": dest["name_tc"],
+            "orig_en": dest["name_en"],
+            "dest_tc": orig["name_tc"],
+            "dest_en": orig["name_en"],
+            "service_type": 1,
+            "bound": "I",
+            "stops": [
+                "KF" + str(dest["pier_id"]),
+                "KF" + str(orig["pier_id"]),
+            ],
+            "co": "hkkf",
         })
 
     for apiStop in apiStops:
-        stopList["KF"+str(apiStop["pier_id"])] = {
-          "stop": "KF"+str(apiStop["pier_id"]),
-          "name_en": apiStop["name_en"],
-          "name_tc": apiStop["name_tc"],
-          "lat": apiStop["lat"],
-          "long": apiStop["long"]
+        stopList["KF" + str(apiStop["pier_id"])] = {
+            "stop": "KF" + str(apiStop["pier_id"]),
+            "name_en": apiStop["name_en"],
+            "name_tc": apiStop["name_tc"],
+            "lat": apiStop["lat"],
+            "long": apiStop["long"]
         }
 
     with open('routeList.hkkf.json', 'w', encoding="utf-8") as f:
@@ -90,7 +91,7 @@ async def getRouteStop(co):
     with open('stopList.hkkf.json', 'w', encoding="utf-8") as f:
         f.write(json.dumps(stopList, ensure_ascii=False))
 
-if __name__=='__main__':
-  logging.basicConfig(level=logging.INFO)
-  logging.getLogger('httpx').setLevel(logging.WARNING)
-  asyncio.run(getRouteStop('hkkf'))
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('httpx').setLevel(logging.WARNING)
+    asyncio.run(getRouteStop('hkkf'))
diff --git a/crawling/kmb.py b/crawling/kmb.py
index 89358137..d09265be 100644
--- a/crawling/kmb.py
+++ b/crawling/kmb.py
@@ -9,69 +9,74 @@
 from crawl_utils import emitRequest
 
+
 async def getRouteStop():
-  a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
-  # define output name
-  ROUTE_LIST = 'routeList.kmb.json'
-  STOP_LIST = 'stopList.kmb.json'
+    a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
+    # define output name
+    ROUTE_LIST = 'routeList.kmb.json'
+    STOP_LIST = 'stopList.kmb.json'
+
+    stopList = {}
+    if path.isfile(STOP_LIST):
+        with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+            stopList = json.load(f)
+    else:
+        # load stops
+        r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/stop', a_client)
+        _stopList = r.json()['data']
+        for stop in _stopList:
+            stopList[stop['stop']] = stop
 
-  stopList = {}
-  if path.isfile(STOP_LIST):
-    with open(STOP_LIST, 'r', encoding='UTF-8') as f:
-      stopList = json.load(f)
-  else:
-    # load stops
-    r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/stop', a_client)
-    _stopList = r.json()['data']
-    for stop in _stopList:
-      stopList[stop['stop']] = stop
+    def isStopExist(stopId):
+        if stopId not in stopList:
+            print("Not exist stop: ", stopId, file=sys.stderr)
+        return stopId in stopList
 
-  def isStopExist( stopId ):
-    if stopId not in stopList:
-      print ("Not exist stop: ", stopId, file=sys.stderr)
-    return stopId in stopList
+    # load route list and stop list if exist
+    routeList = {}
+    if path.isfile(ROUTE_LIST):
+        return
+    else:
+        # load routes
+        r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route/', a_client)
+        for route in r.json()['data']:
+            route['stops'] = {}
+            route['co'] = 'kmb'
+            routeList['+'.join([route['route'],
+                                route['service_type'],
+                                route['bound']])] = route
 
-  # load route list and stop list if exist
-  routeList = {}
-  if path.isfile(ROUTE_LIST):
-    return
-  else:
-    # load routes
-    r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route/', a_client)
-    for route in r.json()['data']:
-      route['stops'] = {}
-      route['co'] = 'kmb'
-      routeList['+'.join([route['route'], route['service_type'], route['bound']])] = route
+    # load route stops
+    r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route-stop/', a_client)
+    for stop in r.json()['data']:
+        routeKey = '+'.join([stop['route'], stop['service_type'], stop['bound']])
+        if routeKey in routeList:
+            routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
+        else:
+            # if route not found, clone it from service type = 1
+            _routeKey = '+'.join([stop['route'], str('1'), stop['bound']])
+            routeList[routeKey] = copy.deepcopy(routeList[_routeKey])
+            routeList[routeKey]['stops'] = {}
+            routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
 
-  # load route stops
-  r = await emitRequest('https://data.etabus.gov.hk/v1/transport/kmb/route-stop/', a_client)
-  for stop in r.json()['data']:
-    routeKey = '+'.join([stop['route'], stop['service_type'], stop['bound']])
-    if routeKey in routeList:
-      routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
-    else:
-      # if route not found, clone it from service type = 1
-      _routeKey = '+'.join([stop['route'], str('1'), stop['bound']])
-      routeList[routeKey] = copy.deepcopy(routeList[_routeKey])
-      routeList[routeKey]['stops'] = {}
-      routeList[routeKey]['stops'][int(stop['seq'])] = stop['stop']
+    # flatten the route stops back to array
+    for routeKey in routeList.keys():
+        stops = [routeList[routeKey]['stops'][seq]
+                 for seq in sorted(routeList[routeKey]['stops'].keys())]
+        # filter non-exist stops
+        stops = list(filter(isStopExist, stops))
+        routeList[routeKey]['stops'] = stops
 
-  # flatten the route stops back to array
-  for routeKey in routeList.keys():
-    stops = [routeList[routeKey]['stops'][seq] for seq in sorted(routeList[routeKey]['stops'].keys())]
-    # filter non-exist stops
-    stops = list(filter(isStopExist, stops))
-    routeList[routeKey]['stops'] = stops
+    # flatten the routeList back to array
+    routeList = [routeList[routeKey]
+                 for routeKey in routeList.keys() if not routeKey.startswith('K')]
 
-  # flatten the routeList back to array
-  routeList = [routeList[routeKey] for routeKey in routeList.keys() if not routeKey.startswith('K')]
-
-  with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
-    f.write(json.dumps(routeList, ensure_ascii=False))
-  with open(STOP_LIST, 'w', encoding='UTF-8') as f:
-    f.write(json.dumps(stopList, ensure_ascii=False))
+    with open(ROUTE_LIST, 'w', encoding='UTF-8') as f:
+        f.write(json.dumps(routeList, ensure_ascii=False))
+    with open(STOP_LIST, 'w', encoding='UTF-8') as f:
+        f.write(json.dumps(stopList, ensure_ascii=False))
 
-if __name__=='__main__':
-  logging.basicConfig(level=logging.INFO)
-  logging.getLogger('httpx').setLevel(logging.WARNING)
-  asyncio.run(getRouteStop())
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('httpx').setLevel(logging.WARNING)
+    asyncio.run(getRouteStop())
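kmb.py keys each route by route+service_type+bound and collects its stops as a seq -> stopId mapping, later flattened by sorting the integer seq keys; route-stop rows with an unseen key clone the service-type-1 route first. A compact sketch of the flattening step with dummy data:

    route = {"stops": {3: "C", 1: "A", 2: "B", 10: "D"}}

    # seq was stored via int(stop['seq']), so 10 sorts after 3 as intended
    ordered = [route["stops"][seq] for seq in sorted(route["stops"].keys())]
    print(ordered)  # ['A', 'B', 'C', 'D']
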
diff --git a/crawling/lightRail.py b/crawling/lightRail.py
index 14c5b331..4a7508ef 100644
--- a/crawling/lightRail.py
+++ b/crawling/lightRail.py
@@ -10,7 +10,8 @@
 
 from crawl_utils import emitRequest
 
-async def getRouteStop(co = 'lightRail'):
+
+async def getRouteStop(co='lightRail'):
     a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
     epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326')
 
@@ -19,51 +20,53 @@ async def getRouteStop(co = 'lightRail'):
     stopList = {}
 
     r = await emitRequest('https://opendata.mtr.com.hk/data/light_rail_routes_and_stops.csv', a_client)
-    reader = csv.reader(r.text.split("\n") )
-    headers = next(reader,None)
+    reader = csv.reader(r.text.split("\n"))
+    headers = next(reader, None)
     routes = [route for route in reader if len(route) == 7]
     for [route, bound, stopCode, stopId, chn, eng, seq] in routes:
-        if route+"_"+bound not in routeList:
-            routeList[route+"_"+bound] = {
-                "gtfsId": None,
-                "route": route,
-                "bound": "O" if bound == "1" else "I",
-                "service_type": "1",
-                "orig_tc": None,
-                "orig_en": None,
-                "dest_tc": None,
-                "dest_en": None,
-                "stops": [],
-                "fare": []
+        if route + "_" + bound not in routeList:
+            routeList[route + "_" + bound] = {
+                "gtfsId": None,
+                "route": route,
+                "bound": "O" if bound == "1" else "I",
+                "service_type": "1",
+                "orig_tc": None,
+                "orig_en": None,
+                "dest_tc": None,
+                "dest_en": None,
+                "stops": [],
+                "fare": []
             }
         if seq == "1.00":
-            routeList[route+"_"+bound]["orig_tc"] = chn
-            routeList[route+"_"+bound]["orig_en"] = eng
-        routeList[route+"_"+bound]["dest_tc"] = chn
-        routeList[route+"_"+bound]["dest_en"] = eng
-        routeList[route+"_"+bound]["stops"].append("LR"+stopId)
-        if "LR"+stopId not in stopList:
-            url='https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=輕鐵-'+chn
+            routeList[route + "_" + bound]["orig_tc"] = chn
+            routeList[route + "_" + bound]["orig_en"] = eng
+        routeList[route + "_" + bound]["dest_tc"] = chn
+        routeList[route + "_" + bound]["dest_en"] = eng
+        routeList[route + "_" + bound]["stops"].append("LR" + stopId)
+        if "LR" + stopId not in stopList:
+            url = 'https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=輕鐵-' + chn
             r = await emitRequest(url, a_client, headers={'Accept': 'application/json'})
             try:
-                lat, lng = epsgTransformer.transform( r.json()[0]['y'], r.json()[0]['x'] )
-                stopList["LR"+stopId] = {
-                    "stop": "LR"+stopId,
-                    "name_en": eng,
-                    "name_tc": chn,
-                    "lat": lat,
-                    "long": lng
+                lat, lng = epsgTransformer.transform(
+                    r.json()[0]['y'], r.json()[0]['x'])
+                stopList["LR" + stopId] = {
+                    "stop": "LR" + stopId,
+                    "name_en": eng,
+                    "name_tc": chn,
+                    "lat": lat,
+                    "long": lng
                 }
-            except:
+            except BaseException:
                 logger.exception(f"Error parsing {url}: {r.text}")
                 raise
 
     with open('routeList.lightRail.json', 'w', encoding='UTF-8') as f:
-        f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
+        f.write(json.dumps([route for route in routeList.values()
+                if len(route['stops']) > 0], ensure_ascii=False))
     with open('stopList.lightRail.json', 'w', encoding='UTF-8') as f:
         f.write(json.dumps(stopList, ensure_ascii=False))
 
-if __name__=='__main__':
+if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     logging.getLogger('httpx').setLevel(logging.WARNING)
     logger = logging.getLogger(__name__)
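lightRail.py geocodes each stop through the geodata.gov.hk location search, which answers in Hong Kong 1980 Grid coordinates (EPSG:2326), and converts them to WGS84 with pyproj; note the transform is fed the northing y before the easting x. A hedged sketch (the grid coordinates below are only illustrative):

    from pyproj import Transformer

    transformer = Transformer.from_crs('epsg:2326', 'epsg:4326')
    # EPSG:2326 axis order is (northing, easting) in metres.
    lat, lng = transformer.transform(820000, 830000)  # illustrative values
    print(round(lat, 4), round(lng, 4))  # a point within Hong Kong
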
diff --git a/crawling/lrtfeeder.py b/crawling/lrtfeeder.py
index 9d570a3c..3452f746 100644
--- a/crawling/lrtfeeder.py
+++ b/crawling/lrtfeeder.py
@@ -10,66 +10,68 @@
 
 from crawl_utils import emitRequest
 
-async def getRouteStop(co = 'lrtfeeder'):
+
+async def getRouteStop(co='lrtfeeder'):
     a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None))
 
     routeList = {}
     stopList = {}
 
     r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_routes.csv', a_client)
     r.encoding = 'utf-8'
-    reader = csv.reader(r.text.split("\n") )
-    headers = next(reader,None)
+    reader = csv.reader(r.text.split("\n"))
+    headers = next(reader, None)
     routes = [route for route in reader if len(route) == 4]
     for [route, chn, eng, circular] in routes:
         if route == '':
             continue
         start = {
-          "zh": chn.split('至')[0],
-          "en": eng.split(' to ')[0]
+            "zh": chn.split('至')[0],
+            "en": eng.split(' to ')[0]
         }
         end = {
-          "zh": chn.split('至')[1],
-          "en": eng.split(' to ')[1]
+            "zh": chn.split('至')[1],
+            "en": eng.split(' to ')[1]
         }
         for bound in ['I', 'O']:
-            routeList[route+"_"+bound] = {
-                "route": route,
-                "bound": bound,
-                "service_type": "1",
-                "orig_tc": start['zh'] if bound == 'O' else end['zh'],
-                "dest_tc": end["zh"] if bound == 'O' else start['zh'],
-                "orig_en": start['en'] if bound == 'O' else end['en'],
-                "dest_en": end["en"] if bound == 'O' else start['en'],
-                "stops": [],
-                "co": "lrtfeeder"
+            routeList[route + "_" + bound] = {
+                "route": route,
+                "bound": bound,
+                "service_type": "1",
+                "orig_tc": start['zh'] if bound == 'O' else end['zh'],
+                "dest_tc": end["zh"] if bound == 'O' else start['zh'],
+                "orig_en": start['en'] if bound == 'O' else end['en'],
+                "dest_en": end["en"] if bound == 'O' else start['en'],
+                "stops": [],
+                "co": "lrtfeeder"
             }
 
     # Parse stops
     r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_bus_stops.csv', a_client)
     r.encoding = 'utf-8'
-    reader = csv.reader(r.text.split("\n") )
-    headers = next(reader,None)
+    reader = csv.reader(r.text.split("\n"))
+    headers = next(reader, None)
     stops = [stop for stop in reader if len(stop) == 8]
     for [route, bound, seq, stationId, lat, lng, name_zh, name_en] in stops:
-        routeKey = route+"_"+bound
+        routeKey = route + "_" + bound
         if routeKey in routeList:
             routeList[routeKey]['stops'].append(stationId)
         else:
-            print ("error", routeKey)
+            print("error", routeKey)
         stopList[stationId] = {
-          "stop": stationId,
-          "name_en": name_en,
-          "name_tc": name_zh,
-          "lat": lat,
-          "long": lng
+            "stop": stationId,
+            "name_en": name_en,
+            "name_tc": name_zh,
+            "lat": lat,
+            "long": lng
         }
 
     with open('routeList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
-        f.write(json.dumps([route for route in routeList.values() if len(route['stops']) > 0], ensure_ascii=False))
+        f.write(json.dumps([route for route in routeList.values()
+                if len(route['stops']) > 0], ensure_ascii=False))
     with open('stopList.lrtfeeder.json', 'w', encoding='UTF-8') as f:
         f.write(json.dumps(stopList, ensure_ascii=False))
 
-if __name__=='__main__':
+if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     logging.getLogger('httpx').setLevel(logging.WARNING)
     logger = logging.getLogger(__name__)
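Both MTR CSV feeds here are parsed the same defensive way: split the response text on newlines, wrap it in csv.reader, drop the header with next(), and keep only rows with the expected column count so blank or malformed lines fall out. A self-contained sketch (the CSV text is a stand-in):

    import csv

    text = "ROUTE_ID,CHI,ENG,CIRCULAR\n506,屯門碼頭至兆麟,Tuen Mun Ferry Pier to Siu Lun,\n\n"

    reader = csv.reader(text.split("\n"))
    headers = next(reader, None)
    rows = [row for row in reader if len(row) == 4]  # drops the trailing blank lines
    print(rows)  # [['506', '屯門碼頭至兆麟', 'Tuen Mun Ferry Pier to Siu Lun', '']]
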
diff --git a/crawling/matchGtfs.py b/crawling/matchGtfs.py
index 12cf7046..8a76eef7 100644
--- a/crawling/matchGtfs.py
+++ b/crawling/matchGtfs.py
@@ -10,23 +10,28 @@
 gtfsRoutes = gtfs['routeList']
 gtfsStops = gtfs['stopList']
 
+
 def isNameMatch(name_a, name_b):
-  tmp_a = name_a.lower()
-  tmp_b = name_b.lower()
-  return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
+    tmp_a = name_a.lower()
+    tmp_b = name_b.lower()
+    return tmp_a.find(tmp_b) >= 0 or tmp_b.find(tmp_a) >= 0
 
 # ctb routes only give list of stops in topological order
 # the actual servicing routes may skip some stop in the coStops
 # this DP function is trying to map the coStops back to GTFS stops
-def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
-  co = 'unknown' if co not in gtfsStops[0]['stopName'] else co # handle unknown stop
+
+
+def matchStopsByDp(coStops, gtfsStops, co, debug=False):
+    # handle unknown stop
+    co = 'unknown' if co not in gtfsStops[0]['stopName'] else co
     if len(gtfsStops) > len(coStops) + 1:
         return [], INFINITY_DIST
     if len(gtfsStops) - len(coStops) == 1:
         gtfsStops = gtfsStops[:-1]
-    
+
     # initialization
-    distSum = [[INFINITY_DIST for x in range(len(coStops)+1) ] for y in range(len(gtfsStops)+1)]
+    distSum = [[INFINITY_DIST for x in range(
+        len(coStops) + 1)] for y in range(len(gtfsStops) + 1)]
     for j in range(len(coStops) - len(gtfsStops) + 1):
         distSum[0][j] = 0
 
@@ -35,17 +40,17 @@ def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
         gtfsStop = gtfsStops[i]
         for j in range(len(coStops)):
             coStop = coStops[j]
-            dist = ( 0
-                if coStop['name_tc'] == gtfsStop['stopName'][co]
-                else haversine(
-                  (float(coStop['lat']), float(coStop['long'])),
-                  (gtfsStop['lat'], gtfsStop['lng'])
-                ) * 1000
-            )
-
-            distSum[i+1][j+1] = min(
-                distSum[i][j] + dist,  # from previous stops of both sides
-                distSum[i+1][j]  # skipping current coStops
+            dist = (0
+                    if coStop['name_tc'] == gtfsStop['stopName'][co]
+                    else haversine(
+                        (float(coStop['lat']), float(coStop['long'])),
+                        (gtfsStop['lat'], gtfsStop['lng'])
+                    ) * 1000
+                    )
+
+            distSum[i + 1][j + 1] = min(
+                distSum[i][j] + dist,  # from previous stops of both sides
+                distSum[i + 1][j]  # skipping current coStops
             )
 
     # fast return if no good result
@@ -57,34 +62,35 @@ def matchStopsByDp ( coStops, gtfsStops, co, debug=False ):
     j = len(coStops)
     ret = []
     while i > 0 and j > 0:
-        if distSum[i][j] == distSum[i][j-1]:
+        if distSum[i][j] == distSum[i][j - 1]:
             j -= 1
         else:
-            ret.append( ( i-1, j-1 ) )
+            ret.append((i - 1, j - 1))
             i -= 1
             j -= 1
     ret.reverse()
-    
+
     # penalty distance is given for not exact match route
-    penalty = sum([abs(a-b) for a, b in ret]) * 0.01
-    
+    penalty = sum([abs(a - b) for a, b in ret]) * 0.01
+
     return ret, min(distSum[len(gtfsStops)]) / len(gtfsStops) + penalty
 
 
 def mergeRouteAsCircularRoute(routeA, routeB):
     return {
-      "co": routeA['co'],
-      "route": routeA["route"],
-      "bound": routeA["bound"] + routeB["bound"],
-      "orig_en": routeA["orig_en"],
-      "orig_tc": routeA["orig_tc"],
-      "dest_en": routeB["dest_en"],
-      "dest_tc": routeB["dest_tc"],
-      "serviceType": routeA["serviceType"],
-      "stops": routeA['stops'] + routeB['stops'],
-      "virtual": True,
+        "co": routeA['co'],
+        "route": routeA["route"],
+        "bound": routeA["bound"] + routeB["bound"],
+        "orig_en": routeA["orig_en"],
+        "orig_tc": routeA["orig_tc"],
+        "dest_en": routeB["dest_en"],
+        "dest_tc": routeB["dest_tc"],
+        "serviceType": routeA["serviceType"],
+        "stops": routeA['stops'] + routeB['stops'],
+        "virtual": True,
     }
 
+
 def getVirtualCircularRoutes(routeList, routeNo):
     indices = []
     for idx, route in enumerate(routeList):
@@ -92,7 +98,7 @@ def getVirtualCircularRoutes(routeList, routeNo):
             indices.append(idx)
     if len(indices) != 2:
         return []
-    
+
     ret = []
     routeA = routeList[indices[0]]
     routeB = routeList[indices[1]]
@@ -100,94 +106,128 @@ def getVirtualCircularRoutes(routeList, routeNo):
         return []
 
     return [
-      mergeRouteAsCircularRoute(routeA, routeB),
-      mergeRouteAsCircularRoute(routeB, routeA)
+        mergeRouteAsCircularRoute(routeA, routeB),
+        mergeRouteAsCircularRoute(routeB, routeA)
     ]
 
+
 def printStopMatches(bestMatch, gtfsStops, stopList, co):
-    stopPair = [(bestMatch[4][gtfsStopIdx], bestMatch[5]["stops"][routeStopIdx]) for gtfsStopIdx, routeStopIdx in bestMatch[2]]
-    print (bestMatch[3], bestMatch[0], bestMatch[1])
-    print ("\t|\t".join(["運輸處", co]))
-    print ("\n".join([
-        str(idx + 1) + " " + "\t|\t".join(
-            [gtfsStops[gtfsId]["stopName"][co], stopList[stopId]["name_tc"]]) for idx, (gtfsId, stopId) in enumerate(stopPair)]
-        )
-    )
-    print ()
+    stopPair = [(bestMatch[4][gtfsStopIdx], bestMatch[5]["stops"][routeStopIdx])
+                for gtfsStopIdx, routeStopIdx in bestMatch[2]]
+    print(bestMatch[3], bestMatch[0], bestMatch[1])
+    print("\t|\t".join(["運輸處", co]))
+    print("\n".join([str(idx + 1) + " " + "\t|\t".join([gtfsStops[gtfsId]["stopName"][co],
+          stopList[stopId]["name_tc"]]) for idx, (gtfsId, stopId) in enumerate(stopPair)]))
+    print()
+
 
 def matchRoutes(co):
-    print (co)
-    with open( 'routeList.%s.json' % co, 'r', encoding="utf-8" ) as f:
+    print(co)
+    with open('routeList.%s.json' % co, 'r', encoding="utf-8") as f:
         routeList = json.load(f)
-    with open( 'stopList.%s.json' % co, 'r', encoding="utf-8" ) as f:
+    with open('stopList.%s.json' % co, 'r', encoding="utf-8") as f:
         stopList = json.load(f)
 
     routeCandidates = []
 
     # one pass to find matches of co vs gtfs by DP
    for gtfsId, gtfsRoute in gtfsRoutes.items():
         debug = False and gtfsId == '1047' and gtfsRoute['orig']['zh'] == '沙田站'
-        if co == 'gmb' and co in gtfsRoute['co']: # handle for gmb
+        if co == 'gmb' and co in gtfsRoute['co']:  # handle for gmb
             for route in routeList:
                 if route['gtfsId'] == gtfsId:
-                    route['fares'] = [gtfsRoute['fares']['1'][0] for i in range(len(route['stops'])-1) ]
-        elif ( co == "sunferry" or co == "fortuneferry" ) and "ferry" in gtfsRoute['co']:
+                    route['fares'] = [gtfsRoute['fares']['1'][0]
+                                      for i in range(len(route['stops']) - 1)]
+        elif (co == "sunferry" or co == "fortuneferry") and "ferry" in gtfsRoute['co']:
             for route in routeList:
                 if route['gtfsId'] == gtfsId:
-                    route['fares'] = [gtfsRoute['fares']['1'][0] for i in range(len(route['stops'])-1) ]
-        elif co in gtfsRoute['co'] or ( co == "hkkf" and 'ferry' in gtfsRoute['co'] ): # handle for other companies
+                    route['fares'] = [gtfsRoute['fares']['1'][0]
+                                      for i in range(len(route['stops']) - 1)]
+        # handle for other companies
+        elif co in gtfsRoute['co'] or (co == "hkkf" and 'ferry' in gtfsRoute['co']):
             for bound, stops in gtfsRoute['stops'].items():
                 bestMatch = (-1, INFINITY_DIST)
-                for route in routeList + getVirtualCircularRoutes(routeList, gtfsRoute['route']):
-                    if ( co in gtfsRoute['co'] and route['route'] == gtfsRoute['route'] ) or \
-                        ( co == 'hkkf' and ( ( route["orig_tc"].startswith(gtfsRoute['orig']['zh']) and route["dest_tc"].startswith(gtfsRoute['dest']['zh']) ) or
-                        ( route["orig_tc"].startswith(gtfsRoute['dest']['zh']) and route["dest_tc"].startswith(gtfsRoute['orig']['zh']) ) ) ):
-                        ret, avgDist = matchStopsByDp([stopList[stop] for stop in route['stops']], [gtfsStops[stop] for stop in stops], co, debug)
+                for route in routeList + \
+                        getVirtualCircularRoutes(routeList, gtfsRoute['route']):
+                    if (
+                            co in gtfsRoute['co'] and route['route'] == gtfsRoute['route']) or (
+                            co == 'hkkf' and (
+                                (route["orig_tc"].startswith(
+                                    gtfsRoute['orig']['zh']) and route["dest_tc"].startswith(
+                                    gtfsRoute['dest']['zh'])) or (
+                                route["orig_tc"].startswith(
+                                    gtfsRoute['dest']['zh']) and route["dest_tc"].startswith(
+                                    gtfsRoute['orig']['zh'])))):
+                        ret, avgDist = matchStopsByDp([stopList[stop] for stop in route['stops']], [
+                            gtfsStops[stop] for stop in stops], co, debug)
                         if avgDist < bestMatch[1]:
                             bestMatch = (gtfsId, avgDist, ret, bound, stops, route)
-                if bestMatch[1] < DIST_DIFF: # assume matching to be avg stop distance diff is lower than 100
+                # assume matching to be avg stop distance diff is lower than 100
+                if bestMatch[1] < DIST_DIFF:
                     ret, bound, stops, route = bestMatch[2:]
 
                     routeCandidate = route.copy()
-                    if (len(ret) == len(route['stops']) or len(ret) + 1 == len(route['stops'])) and 'gtfs' not in route and "virtual" not in route:
-                        routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
+                    if (
+                            len(ret) == len(
+                                route['stops']) or len(ret) +
+                            1 == len(
+                                route['stops'])) and 'gtfs' not in route and "virtual" not in route:
+                        routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]
+                                                   ] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
                         routeCandidate['freq'] = gtfsRoute['freq'][bound]
                         routeCandidate['jt'] = gtfsRoute['jt']
-                        routeCandidate['co'] = gtfsRoute['co'] if co in gtfsRoute['co'] else ( gtfsRoute['co'] + [co] )
+                        routeCandidate['co'] = gtfsRoute['co'] if co in gtfsRoute['co'] else (
+                            gtfsRoute['co'] + [co])
                         routeCandidate['stops'] = [route['stops'][j] for i, j in ret]
                         routeCandidate['gtfs'] = [gtfsId]
                         route['found'] = True
                     else:
                         routeCandidate['stops'] = [route['stops'][j] for i, j in ret]
-                        routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
+                        routeCandidate['fares'] = [gtfsRoute['fares'][bound][i] for i, j in ret[:-1]
+                                                   ] if len(ret[:-1]) < len(gtfsRoute["fares"][bound]) + 1 else None
                         routeCandidate['freq'] = gtfsRoute['freq'][bound]
                         routeCandidate['jt'] = gtfsRoute['jt']
                         routeCandidate['co'] = gtfsRoute['co']
                         routeCandidate['orig_tc'] = stopList[routeCandidate['stops'][0]]['name_tc']
                         routeCandidate['orig_en'] = stopList[routeCandidate['stops'][0]]['name_en']
-                        routeCandidate['dest_tc'] = stopList[routeCandidate['stops'][-1]]['name_tc']
-                        routeCandidate['dest_en'] = stopList[routeCandidate['stops'][-1]]['name_en']
+                        routeCandidate['dest_tc'] = stopList[routeCandidate['stops']
+                                                             [-1]]['name_tc']
+                        routeCandidate['dest_en'] = stopList[routeCandidate['stops']
+                                                             [-1]]['name_en']
                         routeCandidate['service_type'] = "2" if 'found' in route else "1"
                         routeCandidate['gtfs'] = [gtfsId]
-                        route['found'] = True # mark the route has mapped to GTFS, mainly for ctb routes
+                        # mark the route has mapped to GTFS, mainly for ctb routes
+                        route['found'] = True
                         routeCandidates.append(routeCandidate)
                     if '_route' not in gtfsRoute:
                         gtfsRoute['_route'] = {}
                     gtfsRoute['_route'][co] = route.copy()
         elif co in gtfsRoute['co']:
-            print(co, gtfsRoute['route'], 'cannot match any in GTFS', file=sys.stderr)
-
+            print(
+                co,
+                gtfsRoute['route'],
+                'cannot match any in GTFS',
+                file=sys.stderr)
+
     for route in routeList:
         if 'gtfs' not in route:
             route['co'] = [co]
-    
-    print (co, len([route for route in routeList if 'gtfs' not in route]), 'out of',len(routeList), 'not match')
-    if co != 'mtr': routeList.extend(routeCandidates)
-    routeList = [route for route in routeList if 'found' not in route or 'fares' in route] # skipping routes that just partially mapped to GTFS
 
-    with open( 'routeFareList.%s.json' % co, 'w', encoding='UTF-8' ) as f:
+    print(co,
+          len([route for route in routeList if 'gtfs' not in route]),
+          'out of',
+          len(routeList),
+          'not match')
+    if co != 'mtr':
+        routeList.extend(routeCandidates)
+    # skipping routes that just partially mapped to GTFS
+    routeList = [
+        route for route in routeList if 'found' not in route or 'fares' in route]
+
+    with open('routeFareList.%s.json' % co, 'w', encoding='UTF-8') as f:
         f.write(json.dumps(routeList, ensure_ascii=False))
-    
+
+
 matchRoutes('kmb')
 matchRoutes('ctb')
 matchRoutes('nlb')
@@ -208,5 +248,5 @@ def matchRoutes(co):
 
 routeFareList = {}
 
-with open( 'routeGtfs.all.json', 'w', encoding='UTF-8' ) as f:
-    f.write(json.dumps(gtfsRoutes, ensure_ascii=False, indent=4))
\ No newline at end of file
+with open('routeGtfs.all.json', 'w', encoding='UTF-8') as f:
+    f.write(json.dumps(gtfsRoutes, ensure_ascii=False, indent=4))
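The comment above matchStopsByDp is the heart of this file: a company's stop list may skip stops that the GTFS topological list contains, so a DP aligns the two sequences, charging each pairing its haversine distance (zero on an exact name match) and allowing company stops to be skipped. A toy version of the same recurrence, with an invented 2x3 cost matrix instead of real stop records:

    INF = float('inf')

    def align(cost):
        """cost[i][j] = distance between gtfs stop i and co stop j."""
        n, m = len(cost), len(cost[0])
        dp = [[INF] * (m + 1) for _ in range(n + 1)]
        for j in range(m - n + 1):
            dp[0][j] = 0                # co stops before the first match are free
        for i in range(n):
            for j in range(m):
                dp[i + 1][j + 1] = min(dp[i][j] + cost[i][j],  # pair i with j
                                       dp[i + 1][j])           # skip co stop j
        return dp[n][m]

    # 2 GTFS stops vs 3 co stops; the middle co stop is the unmatched extra.
    print(align([[0, 5, 9], [7, 8, 0]]))  # 0: pairs (0,0) and (1,2), skips co stop 1
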
diff --git a/crawling/mergeRoutes.py b/crawling/mergeRoutes.py
index 89a5e7f4..62e0c6eb 100644
--- a/crawling/mergeRoutes.py
+++ b/crawling/mergeRoutes.py
@@ -6,71 +6,106 @@
 stopList = {}
 stopMap = {}
 
-def getRouteObj ( route, co, stops, bound, orig, dest, seq, fares, faresHoliday, freq, jt, nlbId, gtfsId, serviceType = 1):
+
+def getRouteObj(
+        route,
+        co,
+        stops,
+        bound,
+        orig,
+        dest,
+        seq,
+        fares,
+        faresHoliday,
+        freq,
+        jt,
+        nlbId,
+        gtfsId,
+        serviceType=1):
     return {
-      'route': route,
-      'co': co,
-      'stops': stops,
-      'serviceType': serviceType,
-      'bound': bound,
-      'orig': orig,
-      'dest': dest,
-      'fares': fares,
-      'faresHoliday': faresHoliday,
-      'freq': freq,
-      'jt': jt,
-      'nlbId': nlbId,
-      'gtfsId': gtfsId,
-      'seq': seq
+        'route': route,
+        'co': co,
+        'stops': stops,
+        'serviceType': serviceType,
+        'bound': bound,
+        'orig': orig,
+        'dest': dest,
+        'fares': fares,
+        'faresHoliday': faresHoliday,
+        'freq': freq,
+        'jt': jt,
+        'nlbId': nlbId,
+        'gtfsId': gtfsId,
+        'seq': seq
     }
 
+
 def isGtfsMatch(knownRoute, newRoute):
-    if knownRoute['gtfsId'] is None: return True
-    if 'gtfs' not in newRoute: return True
+    if knownRoute['gtfsId'] is None:
+        return True
+    if 'gtfs' not in newRoute:
+        return True
     return knownRoute['gtfsId'] in newRoute['gtfs']
-    
-def importRouteListJson( co ):
-    _routeList = json.load(open('routeFareList.%s.cleansed.json'%co, 'r', encoding='UTF-8'))
-    _stopList = json.load(open('stopList.%s.json'%co, 'r', encoding='UTF-8'))
+
+
+def importRouteListJson(co):
+    _routeList = json.load(
+        open(
+            'routeFareList.%s.cleansed.json' %
+            co, 'r', encoding='UTF-8'))
+    _stopList = json.load(open('stopList.%s.json' % co, 'r', encoding='UTF-8'))
     for stopId, stop in _stopList.items():
         if stopId not in stopList:
             try:
                 stopList[stopId] = {
-                  'name': {
-                    'en': stop['name_en'],
-                    'zh': stop['name_tc']
-                  },
-                  'location': {
-                    'lat': float(stop['lat']),
-                    'lng': float(stop['long'])
-                  }
+                    'name': {
+                        'en': stop['name_en'],
+                        'zh': stop['name_tc']
+                    },
+                    'location': {
+                        'lat': float(stop['lat']),
+                        'lng': float(stop['long'])
+                    }
                 }
-            except:
+            except BaseException:
                 print("Problematic stop: ", stopId, file=stderr)
-    
+
     for _route in _routeList:
         found = False
         speicalType = 1
-        orig = {'en': _route['orig_en'].replace('/', '／'), 'zh': _route['orig_tc'].replace('/', '／')}
-        dest = {'en': _route['dest_en'].replace('/', '／'), 'zh': _route['dest_tc'].replace('/', '／')}
-        
+        orig = {
+            'en': _route['orig_en'].replace(
+                '/',
+                '／'),
+            'zh': _route['orig_tc'].replace(
+                '/',
+                '／')}
+        dest = {
+            'en': _route['dest_en'].replace(
+                '/',
+                '／'),
+            'zh': _route['dest_tc'].replace(
+                '/',
+                '／')}
+
         for route in routeList:
-            if _route['route'] == route['route'] and co in route['co'] and isGtfsMatch(route, _route):
+            if _route['route'] == route['route'] and co in route['co'] and isGtfsMatch(
+                    route, _route):
                 # skip checking if the bound is not the same
                 if co in route["bound"] and route['bound'][co] != _route['bound']:
                     continue
-                
+
                 if len(_route['stops']) == route['seq']:
                     dist = 0
                     merge = True
-                    for stop_a, stop_b in zip( _route['stops'], route['stops'][0][1] ):
+                    for stop_a, stop_b in zip(_route['stops'], route['stops'][0][1]):
                         stop_a = stopList[stop_a]
                         stop_b = stopList[stop_b]
-                        dist = haversine(
-                          (stop_a['location']['lat'], stop_a['location']['lng']),
-                          (stop_b['location']['lat'], stop_b['location']['lng']),
-                          unit=Unit.METERS # specify that we want distance in metres, default unit is km
+                        dist = haversine(
+                            (stop_a['location']['lat'], stop_a['location']['lng']),
+                            (stop_b['location']['lat'], stop_b['location']['lng']),
+                            unit=Unit.METERS  # specify that we want distance in metres, default unit is km
                         )
                         merge = merge and dist < 300
                     if merge:
@@ -81,37 +116,41 @@ def importRouteListJson( co ):
                 speicalType = int(route['serviceType']) + 1
         if _route["route"] == '606' and _route['dest_tc'].startswith("彩雲"):
             print("Yes", speicalType)
-    
+
         if not found:
-          routeList.append(
-            getRouteObj(
-              route = _route['route'],
-              co = _route['co'],
-              serviceType = _route.get('service_type', speicalType),
-              stops = [(co, _route['stops'])],
-              bound = {co: _route['bound']},
-              orig = orig,
-              dest = dest,
-              fares = _route.get('fares', None),
-              faresHoliday = _route.get('faresHoliday', None),
-              freq = _route.get('freq', None),
-              jt = _route.get('jt', None),
-              nlbId = _route.get('id', None),
-              gtfsId = _route.get('gtfsId', _route.get('gtfs', [None])[0]),
-              seq = len(_route['stops'])
-            )
+            routeList.append(
+                getRouteObj(
+                    route=_route['route'],
+                    co=_route['co'],
+                    serviceType=_route.get('service_type', speicalType),
+                    stops=[(co, _route['stops'])],
+                    bound={co: _route['bound']},
+                    orig=orig,
+                    dest=dest,
+                    fares=_route.get('fares', None),
+                    faresHoliday=_route.get('faresHoliday', None),
+                    freq=_route.get('freq', None),
+                    jt=_route.get('jt', None),
+                    nlbId=_route.get('id', None),
+                    gtfsId=_route.get('gtfsId', _route.get('gtfs', [None])[0]),
+                    seq=len(_route['stops'])
+                )
             )
 
-def isMatchStops(stops_a, stops_b, debug = False):
+
+def isMatchStops(stops_a, stops_b, debug=False):
     if len(stops_a) != len(stops_b):
         return False
     for v in stops_a:
-        if stopMap.get(v, [[None,None]])[0][1] in stops_b:
+        if stopMap.get(v, [[None, None]])[0][1] in stops_b:
             return True
     return False
 
+
 def getRouteId(v):
-    return '%s+%s+%s+%s'%(v['route'], v['serviceType'], v['orig']['en'], v['dest']['en'])
+    return '%s+%s+%s+%s' % (v['route'], v['serviceType'],
+                            v['orig']['en'], v['dest']['en'])
+
 
 def smartUnique():
     _routeList = []
@@ -120,29 +159,30 @@ def smartUnique():
             continue
         founds = []
         # compare route one-by-one
-        for j in range(i+1, len(routeList)):
+        for j in range(i + 1, len(routeList)):
             if routeList[i]['route'] == routeList[j]['route'] \
-                and len(routeList[i]['stops']) == len(routeList[j]['stops']) \
-                and len([co for co in routeList[i]['co'] if co in routeList[j]['co']]) == 0 \
-                and isMatchStops(routeList[i]['stops'][0][1], routeList[j]['stops'][0][1]):
-                founds.append( j )
+                    and len(routeList[i]['stops']) == len(routeList[j]['stops']) \
+                    and len([co for co in routeList[i]['co'] if co in routeList[j]['co']]) == 0 \
+                    and isMatchStops(routeList[i]['stops'][0][1], routeList[j]['stops'][0][1]):
+                founds.append(j)
             elif routeList[i]['route'] == routeList[j]['route'] \
-                and str(routeList[i]['serviceType']) == str(routeList[j]['serviceType']) \
-                and routeList[i]['orig']['en'] == routeList[j]['orig']['en'] \
-                and routeList[i]['dest']['en'] == routeList[j]['dest']['en']:
-                routeList[j]['serviceType'] = str(int(routeList[j]['serviceType'])+1)
+                    and str(routeList[i]['serviceType']) == str(routeList[j]['serviceType']) \
+                    and routeList[i]['orig']['en'] == routeList[j]['orig']['en'] \
+                    and routeList[i]['dest']['en'] == routeList[j]['dest']['en']:
+                routeList[j]['serviceType'] = str(int(routeList[j]['serviceType']) + 1)
 
         # update obj
         for found in founds:
             routeList[i]['co'].extend(routeList[found]['co'])
-            routeList[i]['stops'].extend( routeList[found]['stops'] )
+            routeList[i]['stops'].extend(routeList[found]['stops'])
             routeList[found]['skip'] = True
 
         # append return array
         _routeList.append(routeList[i])
     return _routeList
-    
+
+
 importRouteListJson('kmb')
 importRouteListJson('ctb')
 importRouteListJson('nlb')
@@ -158,21 +198,30 @@ def smartUnique():
     route['stops'] = {co: stops for co, stops in route['stops']}
 
 holidays = json.load(open('holiday.json', 'r', encoding='UTF-8'))
-serviceDayMap = json.load(open('gtfs.json', 'r', encoding='UTF-8'))['serviceDayMap']
+serviceDayMap = json.load(
+    open(
+        'gtfs.json',
+        'r',
+        encoding='UTF-8'))['serviceDayMap']
+
 
 def standardizeDict(d):
-    return {key: value if not isinstance(value, dict) else standardizeDict(value) for key, value in sorted(d.items())}
+    return {
+        key: value if not isinstance(
+            value, dict) else standardizeDict(value) for key, value in sorted(
+            d.items())}
+
 
 db = standardizeDict({
-  'routeList': {getRouteId(v): v for v in routeList},
-  'stopList': stopList,
-  'stopMap': stopMap,
-  'holidays': holidays,
-  'serviceDayMap': serviceDayMap,
+    'routeList': {getRouteId(v): v for v in routeList},
+    'stopList': stopList,
+    'stopMap': stopMap,
+    'holidays': holidays,
+    'serviceDayMap': serviceDayMap,
 })
 
-with open( 'routeFareList.mergeRoutes.json', 'w', encoding='UTF-8' ) as f:
+with open('routeFareList.mergeRoutes.json', 'w', encoding='UTF-8') as f:
     f.write(json.dumps(db, ensure_ascii=False, indent=4))
 
-with open( 'routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8' ) as f:
+with open('routeFareList.mergeRoutes.min.json', 'w', encoding='UTF-8') as f:
     f.write(json.dumps(db, ensure_ascii=False, separators=(',', ':')))
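mergeRoutes.py only folds a company's copy of a route into an existing entry when every positionally paired stop lies within 300 m, using haversine in metres. A small sketch of that gate with made-up coordinates a few tens of metres apart:

    from haversine import haversine, Unit

    stops_a = [(22.3027, 114.1772), (22.3070, 114.1831)]
    stops_b = [(22.3029, 114.1775), (22.3068, 114.1829)]

    merge = all(
        haversine(a, b, unit=Unit.METERS) < 300  # same kerb, different co IDs
        for a, b in zip(stops_a, stops_b)
    )
    print(merge)  # True: each pair is within 300 m
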
diff --git a/crawling/mergeStopList.py b/crawling/mergeStopList.py
index f00789ce..e6c73007 100644
--- a/crawling/mergeStopList.py
+++ b/crawling/mergeStopList.py
@@ -4,233 +4,274 @@
 import time
 from haversine import haversine, Unit
 
-def get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id):
-  DISTANCE_THRESHOLD = 50 # in metres
-  BEARING_THRESHOLD = 45 # in degrees
-  STOP_LIST_LIMIT = 50 # max number of stops in a group
-
-  def get_stops_haversine_distance(stop_a, stop_b):
-    return haversine(
-      (stop_a['location']['lat'], stop_a['location']['lng']),
-      (stop_b['location']['lat'], stop_b['location']['lng']),
-      unit=Unit.METERS # specify that we want distance in meter, default is km
-    )
-
-  bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
-
-  def is_bearing_in_range(bearing):
-    if BEARING_THRESHOLD >= 180 or not bearing_targets:
-      return True
-    for target in bearing_targets:
-      bearing_min = target - BEARING_THRESHOLD
-      bearing_max = target + BEARING_THRESHOLD
-      if bearing_min < 0:
-        bearing_min += 360
-      if bearing_max > 360:
-        bearing_max -= 360
-      if (bearing_min <= bearing <= bearing_max or
-          (bearing_min > bearing_max and (bearing <= bearing_max or bearing >= bearing_min))):
-        return True
-    return False
-
-  def search_nearby_stops(target_stop_id, excluded_stop_id_list):
-    target_stop = stop_list[target_stop_id]
-    # take lat/lng up to 3 decimal places, that's about 100m x 100m square
-    lat = int(target_stop['location']['lat'] * 1000)
-    lng = int(target_stop['location']['lng'] * 1000)
-
-    nearby_stops = []
-    for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
-      if (stop_id not in excluded_stop_id_list and get_stops_haversine_distance(target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
-        bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
-        if any(is_bearing_in_range(b) for b in bearings):
-          nearby_stops.append({
-            'id': stop_id,
-            'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
-          })
-    return nearby_stops
-
-  stop_group = []
-  stop_list_entries = search_nearby_stops(stop_id, [])
-
-  # recursively search for nearby stops within thresholds (distance and bearing)
-  # stop searching when no new stops are found within range, or when stop list is getting too large
-  i = 0
-  while i < len(stop_list_entries):
-    entry = stop_list_entries[i]
-    stop_group.append([entry['co'], entry['id']])
-    i += 1
-    if len(stop_list_entries) < STOP_LIST_LIMIT:
-      stop_list_entries.extend(search_nearby_stops(entry['id'], [e['id'] for e in stop_list_entries]))
-
-  # to reduce size of routeFareList.min.json, excl current stop_id from final output stopMap
-  return [stop for stop in stop_group if stop[1] != stop_id]
-  # return stop_group
+
+def get_stop_group(
+        route_list,
+        stop_list,
+        stop_seq_mapping,
+        stop_list_grid,
+        stop_id):
+    DISTANCE_THRESHOLD = 50  # in metres
+    BEARING_THRESHOLD = 45  # in degrees
+    STOP_LIST_LIMIT = 50  # max number of stops in a group
+
+    def get_stops_haversine_distance(stop_a, stop_b):
+        return haversine(
+            (stop_a['location']['lat'], stop_a['location']['lng']),
+            (stop_b['location']['lat'], stop_b['location']['lng']),
+            unit=Unit.METERS  # specify that we want distance in meter, default is km
+        )
+
+    bearing_targets = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
+
+    def is_bearing_in_range(bearing):
+        if BEARING_THRESHOLD >= 180 or not bearing_targets:
+            return True
+        for target in bearing_targets:
+            bearing_min = target - BEARING_THRESHOLD
+            bearing_max = target + BEARING_THRESHOLD
+            if bearing_min < 0:
+                bearing_min += 360
+            if bearing_max > 360:
+                bearing_max -= 360
+            if (
+                bearing_min <= bearing <= bearing_max or (
+                    bearing_min > bearing_max and (
+                        bearing <= bearing_max or bearing >= bearing_min))):
+                return True
+        return False
+
+    def search_nearby_stops(target_stop_id, excluded_stop_id_list):
+        target_stop = stop_list[target_stop_id]
+        # take lat/lng up to 3 decimal places, that's about 100m x 100m square
+        lat = int(target_stop['location']['lat'] * 1000)
+        lng = int(target_stop['location']['lng'] * 1000)
+
+        nearby_stops = []
+        for stop_id in stop_list_grid.get(f"{lat}_{lng}", []):
+            if (stop_id not in excluded_stop_id_list and get_stops_haversine_distance(
+                    target_stop, stop_list[stop_id]) <= DISTANCE_THRESHOLD):
+                bearings = stop_seq_mapping.get(stop_id, {}).get('bearings', [])
+                if any(is_bearing_in_range(b) for b in bearings):
+                    nearby_stops.append({
+                        'id': stop_id,
+                        'co': stop_seq_mapping.get(stop_id, {}).get('co', '')
+                    })
+        return nearby_stops
+
+    stop_group = []
+    stop_list_entries = search_nearby_stops(stop_id, [])
+
+    # recursively search for nearby stops within thresholds (distance and bearing)
+    # stop searching when no new stops are found within range, or when stop
+    # list is getting too large
+    i = 0
+    while i < len(stop_list_entries):
+        entry = stop_list_entries[i]
+        stop_group.append([entry['co'], entry['id']])
+        i += 1
+        if len(stop_list_entries) < STOP_LIST_LIMIT:
+            stop_list_entries.extend(
+                search_nearby_stops(
+                    entry['id'], [
+                        e['id'] for e in stop_list_entries]))
+
+    # to reduce the size of routeFareList.min.json, exclude the current
+    # stop_id from the final output stopMap
+    return [stop for stop in stop_group if stop[1] != stop_id]
+    # return stop_group
+

 def get_bearing(a, b):
-    φ1 = math.radians(a['lat'])
-    φ2 = math.radians(b['lat'])
-    λ1 = math.radians(a['lng'])
-    λ2 = math.radians(b['lng'])
-
-    y = math.sin(λ2 - λ1) * math.cos(φ2)
-    x = (math.cos(φ1) * math.sin(φ2) -
-             math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
-    θ = math.atan2(y, x)
-    brng = (math.degrees(θ) + 360) % 360  # in degrees
-    return brng
+    φ1 = math.radians(a['lat'])
+    φ2 = math.radians(b['lat'])
+    λ1 = math.radians(a['lng'])
+    λ2 = math.radians(b['lng'])

-def get_stop_bearings(route_stops):
-    unique_routes = []
-    bearings = []
-    for route_stop in route_stops:
-        if route_stop['bearing'] != -1:
-            unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
-            if unique_route not in unique_routes:
-                unique_routes.append(unique_route)
-                bearings.append(route_stop['bearing'])
-
-    if not bearings:
-        return []
-
-    BEARING_THRESHOLD = 45  # in degrees
-    BEARING_EPSILON = 10e-6  # very small number
-    bearing_groups = []
-
-    for bearing in bearings:
-        if bearing == -1:
-            continue
-        if not bearing_groups:
-            bearing_groups.append([bearing])
-            continue
-
-        for group in bearing_groups:
-            if any(abs(b - bearing) < BEARING_EPSILON for b in group):
-                break
-            if any(abs(b - bearing) <= BEARING_THRESHOLD or abs(b - bearing) >= 360 - BEARING_THRESHOLD for b in group):
-                group.append(bearing)
-                break
-        else:
-            bearing_groups.append([bearing])
+    y = math.sin(λ2 - λ1) * math.cos(φ2)
+    x = (math.cos(φ1) * math.sin(φ2) -
+         math.sin(φ1) * math.cos(φ2) * math.cos(λ2 - λ1))
+    θ = math.atan2(y, x)
+    brng = (math.degrees(θ) + 360) % 360  # in degrees
+    return brng

-    if len(bearing_groups) == 1:
-        return bearing_groups[0]

-    longest_length = max(len(group) for group in bearing_groups)
-    return [b for group in bearing_groups if len(group) == longest_length for b in group]
+def get_stop_bearings(route_stops):
+    unique_routes = []
+    bearings = []
+    for route_stop in route_stops:
+        if route_stop['bearing'] != -1:
+            unique_route = f"{route_stop['co']}_{route_stop['routeKey'].split('+')[0]}_{route_stop['bearing']}"
+            if unique_route not in unique_routes:
+                unique_routes.append(unique_route)
+                bearings.append(route_stop['bearing'])
+
+    if not bearings:
+        return []
+
+    BEARING_THRESHOLD = 45  # in degrees
+    BEARING_EPSILON = 10e-6  # i.e. 1e-5; bearings closer than this count as duplicates
+    bearing_groups = []
+
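+    # cluster the bearings: a bearing joins the first group that already has
+    # a member within BEARING_THRESHOLD of it (mod 360); near-duplicates
+    # (closer than BEARING_EPSILON) are skipped, e.g. [10, 20, 200] becomes
+    # [[10, 20], [200]] and only the largest group(s) are returned below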
+    for bearing in bearings:
+        if bearing == -1:
+            continue
+        if not bearing_groups:
+            bearing_groups.append([bearing])
+            continue
+
+        for group in bearing_groups:
+            if any(abs(b - bearing) < BEARING_EPSILON for b in group):
+                break
+            if any(abs(b - bearing) <= BEARING_THRESHOLD or
+                   abs(b - bearing) >= 360 - BEARING_THRESHOLD
+                   for b in group):
+                group.append(bearing)
+                break
+        else:
+            bearing_groups.append([bearing])
+
+    if len(bearing_groups) == 1:
+        return bearing_groups[0]
+
+    longest_length = max(len(group) for group in bearing_groups)
+    return [b for group in bearing_groups
+            if len(group) == longest_length for b in group]


 # Main function to process stops
+
+
 def merge_stop_list():
-    # Read the result from previous pipeline
-    with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f:
-        db = json.load(f)
-
-    route_list = db['routeList']
-    stop_list = db['stopList']
-    start_time = time.time()
-    stop_seq_mapping = {}
-
-    # Preprocess the list of bearings for each stop
-    for route_key, route_list_entry in route_list.items():
-        stops = route_list_entry.get('stops', {})
-        for co, co_stops in stops.items():
-            for stop_pos, stop_id in enumerate(co_stops):
-                if stop_id not in stop_seq_mapping:
-                    stop_seq_mapping[stop_id] = {"routeStops": [], "co": co, "bearings": []}
-                if stop_pos == len(co_stops) - 1:
-                    stop_seq_mapping[stop_id]['routeStops'].append({
-                        'routeKey': route_key,
-                        'co': co,
-                        'seq': stop_pos,
-                        'bearing': -1
-                    })
-                else:
-                    bearing = get_bearing(stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location'])
-                    stop_seq_mapping[stop_id]['routeStops'].append({
-                        'routeKey': route_key,
-                        'co': co,
-                        'seq': stop_pos,
-                        'bearing': bearing
-                    })
-
-    for stop_id in stop_seq_mapping.keys():
-        stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings(stop_seq_mapping[stop_id]['routeStops'])
-
-    # Just dump the json in case of a need for trouble-shooting, but otherwise we do not need this file
-    with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f:
-        json.dump(stop_seq_mapping, f)
-
-    logger.info(f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms')
-
-    # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce size of nested loop later
-    stop_list_grid = {}
-    for stop_id, stop in stop_list.items():
-        # take lat/lng up to 3 decimal places, that's about 100m x 100m square
-        lat = int(stop['location']['lat'] * 1000)
-        lng = int(stop['location']['lng'] * 1000)
-        # add stop into the 9 grid boxes surrounding this stop
-        grid = [
-            f"{lat - 1}_{lng - 1}",
-            f"{lat }_{lng - 1}",
-            f"{lat + 1}_{lng - 1}",
-            f"{lat - 1}_{lng }",
-            f"{lat }_{lng }",
-            f"{lat + 1}_{lng }",
-            f"{lat - 1}_{lng + 1}",
-            f"{lat }_{lng + 1}",
-            f"{lat + 1}_{lng + 1}",
-        ]
-        for grid_id in grid:
-            if grid_id not in stop_list_grid:
-                stop_list_grid[grid_id] = []
-            stop_list_grid[grid_id].append(stop_id)
-
-    target_stop_list = list(stop_list.items())
-    stop_map = {}
-    count = 0
-    group_count = 0
-
-    for stop_id, stop in target_stop_list:
-        count += 1
-        # if count % 1000 == 0:
-        #     logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
-
-        stop_group = get_stop_group(route_list, stop_list, stop_seq_mapping, stop_list_grid, stop_id)
-        if len(stop_group) > 0:
-            group_count += 1
-            stop_map[stop_id] = stop_group
-
-    logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms")
-
-    with open('stopMap.json', 'w', encoding='UTF-8') as f:
-        json.dump(stop_map, f, indent=4)
-
-    db['stopMap'] = stop_map
-
-    with open('routeFareList.json', 'w', encoding='UTF-8') as f:
-        json.dump(db, f, indent=4)
-
-    # reduce size of routeFareList.min.json by rounding lat/lng values to 5 decimal places
-    # 5 d.p. 
is roughly one-metre accuracy, it is good enough for this project - # saves around 50kb in size for 14,000 stops - for stop_id, stop in target_stop_list: - stop_list[stop_id]['location']['lat'] = float('%.5f' % (stop_list[stop_id]['location']['lat'])) - stop_list[stop_id]['location']['lng'] = float('%.5f' % (stop_list[stop_id]['location']['lng'])) - - db['stopList'] = stop_list - - logger.info(f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms") - - with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f: - json.dump(db, f, indent=4) - - with open('routeFareList.min.json', 'w', encoding='UTF-8') as f: - json.dump(db, f) - - with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f: - json.dump(db, f) + # Read the result from previous pipeline + with open('routeFareList.mergeRoutes.min.json', 'r', encoding='UTF-8') as f: + db = json.load(f) + + route_list = db['routeList'] + stop_list = db['stopList'] + start_time = time.time() + stop_seq_mapping = {} + + # Preprocess the list of bearings for each stop + for route_key, route_list_entry in route_list.items(): + stops = route_list_entry.get('stops', {}) + for co, co_stops in stops.items(): + for stop_pos, stop_id in enumerate(co_stops): + if stop_id not in stop_seq_mapping: + stop_seq_mapping[stop_id] = { + "routeStops": [], "co": co, "bearings": []} + if stop_pos == len(co_stops) - 1: + stop_seq_mapping[stop_id]['routeStops'].append({ + 'routeKey': route_key, + 'co': co, + 'seq': stop_pos, + 'bearing': -1 + }) + else: + bearing = get_bearing( + stop_list[stop_id]['location'], stop_list[co_stops[stop_pos + 1]]['location']) + stop_seq_mapping[stop_id]['routeStops'].append({ + 'routeKey': route_key, + 'co': co, + 'seq': stop_pos, + 'bearing': bearing + }) + + for stop_id in stop_seq_mapping.keys(): + stop_seq_mapping[stop_id]['bearings'] = get_stop_bearings( + stop_seq_mapping[stop_id]['routeStops']) + + # Just dump the json in case of a need for trouble-shooting, but otherwise + # we do not need this file + with open('stopMap.routeStopsSequence.json', 'w', encoding='UTF-8') as f: + json.dump(stop_seq_mapping, f) + + logger.info( + f'Processed routeStopsSequence in {(time.time() - start_time) * 1000:.2f}ms') + + # Preprocess stopList, organise stops into ~100m x ~100m squares to reduce + # size of nested loop later + stop_list_grid = {} + for stop_id, stop in stop_list.items(): + # take lat/lng up to 3 decimal places, that's about 100m x 100m square + lat = int(stop['location']['lat'] * 1000) + lng = int(stop['location']['lng'] * 1000) + # add stop into the 9 grid boxes surrounding this stop + grid = [ + f"{lat - 1}_{lng - 1}", + f"{lat }_{lng - 1}", + f"{lat + 1}_{lng - 1}", + f"{lat - 1}_{lng }", + f"{lat }_{lng }", + f"{lat + 1}_{lng }", + f"{lat - 1}_{lng + 1}", + f"{lat }_{lng + 1}", + f"{lat + 1}_{lng + 1}", + ] + for grid_id in grid: + if grid_id not in stop_list_grid: + stop_list_grid[grid_id] = [] + stop_list_grid[grid_id].append(stop_id) + + target_stop_list = list(stop_list.items()) + stop_map = {} + count = 0 + group_count = 0 + + for stop_id, stop in target_stop_list: + count += 1 + # if count % 1000 == 0: + # logger.info(f"Processed {count} stops ({group_count} groups) at {(time.time() - start_time) * 1000:.2f}ms") + + stop_group = get_stop_group( + route_list, + stop_list, + stop_seq_mapping, + stop_list_grid, + stop_id) + if len(stop_group) > 0: + group_count += 1 + stop_map[stop_id] = stop_group + + logger.info( + f"Processed {count} stops ({group_count} groups) at 
{(time.time() - start_time) * 1000:.2f}ms") + + with open('stopMap.json', 'w', encoding='UTF-8') as f: + json.dump(stop_map, f, indent=4) + + db['stopMap'] = stop_map + + with open('routeFareList.json', 'w', encoding='UTF-8') as f: + json.dump(db, f, indent=4) + + # reduce size of routeFareList.min.json by rounding lat/lng values to 5 decimal places + # 5 d.p. is roughly one-metre accuracy, it is good enough for this project + # saves around 50kb in size for 14,000 stops + for stop_id, stop in target_stop_list: + stop_list[stop_id]['location']['lat'] = float( + '%.5f' % (stop_list[stop_id]['location']['lat'])) + stop_list[stop_id]['location']['lng'] = float( + '%.5f' % (stop_list[stop_id]['location']['lng'])) + + db['stopList'] = stop_list + + logger.info( + f"Reduced location lat/lng to 5 d.p. at {(time.time() - start_time) * 1000:.2f}ms") + + with open('routeFareList.alpha.json', 'w', encoding='UTF-8') as f: + json.dump(db, f, indent=4) + + with open('routeFareList.min.json', 'w', encoding='UTF-8') as f: + json.dump(db, f) + + with open('routeFareList.alpha.min.json', 'w', encoding='UTF-8') as f: + json.dump(db, f) + if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger(__name__) - merge_stop_list() \ No newline at end of file + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + merge_stop_list() diff --git a/crawling/mtr.py b/crawling/mtr.py index 65ca6461..19dc363c 100644 --- a/crawling/mtr.py +++ b/crawling/mtr.py @@ -11,11 +11,13 @@ from crawl_utils import emitRequest + def filterStops(route): route['stops'] = [stop for stop in route['stops'] if stop is not None] return route -async def getRouteStop(co = 'mtr'): + +async def getRouteStop(co='mtr'): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326') @@ -24,48 +26,54 @@ async def getRouteStop(co = 'mtr'): r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv', a_client) r.encoding = 'utf-8' - reader = csv.reader(r.text.split("\n") ) - headers = next(reader,None) + reader = csv.reader(r.text.split("\n")) + headers = next(reader, None) routes = [route for route in reader if len(route) == 7] for [route, bound, stopCode, stopId, chn, eng, seq] in routes: if route == "": continue - if route+"_"+bound not in routeList: - routeList[route+"_"+bound] = { - "gtfsId": None, - "route": route, - "bound": bound, - "service_type": "1", - "orig_tc": None, - "orig_en": None, - "dest_tc": None, - "dest_en": None, - "stops": [None] * 100, - "fare": [] + if route + "_" + bound not in routeList: + routeList[route + "_" + bound] = { + "gtfsId": None, + "route": route, + "bound": bound, + "service_type": "1", + "orig_tc": None, + "orig_en": None, + "dest_tc": None, + "dest_en": None, + "stops": [None] * 100, + "fare": [] } if int(float(seq)) == 1: - routeList[route+"_"+bound]["orig_tc"] = chn - routeList[route+"_"+bound]["orig_en"] = eng - routeList[route+"_"+bound]["dest_tc"] = chn - routeList[route+"_"+bound]["dest_en"] = eng - routeList[route+"_"+bound]["stops"][int(float(seq))] = stopCode + routeList[route + "_" + bound]["orig_tc"] = chn + routeList[route + "_" + bound]["orig_en"] = eng + routeList[route + "_" + bound]["dest_tc"] = chn + routeList[route + "_" + bound]["dest_en"] = eng + routeList[route + "_" + bound]["stops"][int(float(seq))] = stopCode if stopCode not in stopList: - r = await emitRequest('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵'+chn+"站", 
a_client, headers={'Accept': 'application/json'}) - lat, lng = epsgTransformer.transform( r.json()[0]['y'], r.json()[0]['x'] ) + r = await emitRequest('https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=港鐵' + chn + "站", a_client, headers={'Accept': 'application/json'}) + lat, lng = epsgTransformer.transform(r.json()[0]['y'], r.json()[0]['x']) stopList[stopCode] = { - "stop": stopCode, - "name_en": eng, - "name_tc": chn, - "lat": lat, - "long": lng + "stop": stopCode, + "name_en": eng, + "name_tc": chn, + "lat": lat, + "long": lng } with open('routeList.mtr.json', 'w', encoding='UTF-8') as f: - f.write(json.dumps(list(map(filterStops, [route for route in routeList.values() if len(route['stops']) > 0])), ensure_ascii=False)) + f.write( + json.dumps( + list( + map( + filterStops, [ + route for route in routeList.values() if len( + route['stops']) > 0])), ensure_ascii=False)) with open('stopList.mtr.json', 'w', encoding='UTF-8') as f: f.write(json.dumps(stopList, ensure_ascii=False)) -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) logging.getLogger('httpx').setLevel(logging.WARNING) logger = logging.getLogger(__name__) diff --git a/crawling/mtrExits.py b/crawling/mtrExits.py index 6b3fe90e..a86fc857 100644 --- a/crawling/mtrExits.py +++ b/crawling/mtrExits.py @@ -1,4 +1,4 @@ - # -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import asyncio import logging from crawl_utils import emitRequest @@ -13,61 +13,69 @@ mtrStops = {} epsgTransformer = Transformer.from_crs('epsg:2326', 'epsg:4326') + def checkResult(results, q, stop, exit, barrierFree): for result in results: if result['nameZH'] == q: - lat, lng = epsgTransformer.transform( result['y'], result['x'] ) + lat, lng = epsgTransformer.transform(result['y'], result['x']) res.append({ - "name_en": stop["name_en"], - "name_zh": stop["name_tc"], - "name": { - "en": stop["name_en"], - "zh": stop["name_tc"], - }, - "exit": exit, - "lat": lat, - "lng": lng, - "barrierFree": barrierFree, + "name_en": stop["name_en"], + "name_zh": stop["name_tc"], + "name": { + "en": stop["name_en"], + "zh": stop["name_tc"], + }, + "exit": exit, + "lat": lat, + "lng": lng, + "barrierFree": barrierFree, }) return True - return False + return False + async def main(): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) r = await emitRequest('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv', a_client) r.encoding = 'utf-8' - reader = csv.reader(r.text.strip().split("\n") ) - headers = next(reader,None) + reader = csv.reader(r.text.strip().split("\n")) + headers = next(reader, None) for entry in reader: mtrStops[entry[3]] = { - "name_tc": entry[4], - "name_en": entry[5], + "name_tc": entry[4], + "name_en": entry[5], } r = await emitRequest("https://opendata.mtr.com.hk/data/barrier_free_facilities.csv", a_client) r.encoding = 'utf-8' - reader = csv.reader(r.text.strip().split("\n") ) + reader = csv.reader(r.text.strip().split("\n")) for entry in reader: if entry[2] == 'Y' and entry[3] != '': - for exit in re.findall(" [A-Z][\d]*", entry[3]): + for exit in re.findall(" [A-Z][\\d]*", entry[3]): if entry[0] in mtrStops: mtrStops[entry[0]][exit.strip()] = True - + # crawl exit geolocation for key, stop in mtrStops.items(): - q = '港鐵'+stop['name_tc']+'站進出口' - r = await emitRequest("https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q="+q, a_client) + q = '港鐵' + stop['name_tc'] + '站進出口' + r = await emitRequest("https://geodata.gov.hk/gs/api/v1.0.0/locationSearch?q=" + q, a_client) for char in 
string.ascii_uppercase:
-            q = '港鐵'+stop['name_tc']+'站-'+str(char)+'進出口'
+            q = '港鐵' + stop['name_tc'] + '站-' + str(char) + '進出口'
             checkResult(r.json(), q, stop, char, str(char) in stop)
-            for i in range(1,10):
-                q = '港鐵'+stop['name_tc']+'站-'+char+str(i)+'進出口'
-                checkResult(r.json(), q, stop, char+str(i), (char+str(char)) in stop)
-
+            for i in range(1, 10):
+                q = '港鐵' + stop['name_tc'] + '站-' + char + str(i) + '進出口'
+                # look up the barrier-free flag with the full exit label such
+                # as "A1"; (char + str(char)) was a bug that never matched
+                checkResult(
+                    r.json(),
+                    q,
+                    stop,
+                    char + str(i),
+                    (char + str(i)) in stop)
+
     with open('exits.mtr.json', 'w', encoding='UTF-8') as f:
-        f.write(json.dumps(list({(v['name']['zh']+v['exit']): v for v in res}.values()), ensure_ascii=False))
+        f.write(json.dumps(list(
+            {(v['name']['zh'] + v['exit']): v for v in res}.values()), ensure_ascii=False))

 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     logging.getLogger('httpx').setLevel(logging.WARNING)
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())
diff --git a/crawling/nlb.py b/crawling/nlb.py
index 810b3053..547ebfaf 100644
--- a/crawling/nlb.py
+++ b/crawling/nlb.py
@@ -7,84 +7,85 @@
 import httpx

-logger=logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
+

 async def getRouteStop(co):
-  # define output name
-  ROUTE_LIST = 'routeList.'+co+'.json'
-  STOP_LIST = 'stopList.'+co+'.json'
+    # define output name
+    ROUTE_LIST = 'routeList.' + co + '.json'
+    STOP_LIST = 'stopList.' + co + '.json'
+
+    a_client = httpx.AsyncClient()
+    # load route list and stop list if exist
+    routeList = []
+    if path.isfile(ROUTE_LIST):
+        logger.warning(f"{ROUTE_LIST} already exists, skipping...")
+        return
+    else:
+        # load routes
+        r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/route.php?action=list', a_client)
+        for route in r.json()['routes']:
+            routeList.append({
+                "id": route['routeId'],
+                "route": route['routeNo'],
+                "bound": "O",
+                "orig_en": route['routeName_e'].split(' > ')[0],
+                "orig_tc": route['routeName_c'].split(' > ')[0],
+                "dest_en": route['routeName_e'].split(' > ')[1],
+                "dest_tc": route['routeName_c'].split(' > ')[1],
+                "service_type": str(1 + route['overnightRoute'] * 2 + route['specialRoute'] * 4),
+                "stops": [],
+                "co": ["nlb"]
+            })
+        logger.info("Digested route list")

-  a_client = httpx.AsyncClient()
-  # load route list and stop list if exist
-  routeList = []
-  if path.isfile(ROUTE_LIST):
-    logger.warning(f"{ROUTE_LIST} already exist, skipping...")
-    return
-  else:
-    # load routes
-    r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/route.php?action=list', a_client)
-    for route in r.json()['routes']:
-      routeList.append({
-        "id": route['routeId'],
-        "route": route['routeNo'],
-        "bound": "O",
-        "orig_en": route['routeName_e'].split(' > ')[0],
-        "orig_tc": route['routeName_c'].split(' > ')[0],
-        "dest_en": route['routeName_e'].split(' > ')[1],
-        "dest_tc": route['routeName_c'].split(' > ')[1],
-        "service_type": str(1 + route['overnightRoute'] * 2 + route['specialRoute'] *4),
-        "stops": [],
-        "co": ["nlb"]
-      })
-    logger.info("Digested route list")
+    stopList = {}
+    if path.isfile(STOP_LIST):
+        with open(STOP_LIST, 'r', encoding='UTF-8') as f:
+            stopList = json.load(f)

-  stopList = {}
-  if path.isfile(STOP_LIST):
-    with open(STOP_LIST, 'r', encoding='UTF-8') as f:
-      stopList = json.load(f)
-
-  async def getRouteStop(routeId):
-    r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/stop.php?action=list&routeId='+routeId, a_client)
-    try:
-      return r.json()['stops']
-    except Exception as err:
-      print(r)
-      raise err
+    async def 
getRouteStop(routeId): + r = await emitRequest('https://rt.data.gov.hk/v2/transport/nlb/stop.php?action=list&routeId=' + routeId, a_client) + try: + return r.json()['stops'] + except Exception as err: + print(r) + raise err - async def addRouteStop(route): - stops = await getRouteStop(route['id']) - stopIds = [] - fares = [] - faresHoliday = [] - for stop in stops: - if stop['stopId'] not in stopList: - stopList[stop['stopId']] = { - 'stop': stop['stopId'], - 'name_en': stop['stopName_e'], - 'name_tc': stop['stopName_c'], - 'lat': stop['latitude'], - 'long': stop['longitude'] - } - stopIds.append(stop['stopId']) - fares.append(stop['fare']) - faresHoliday.append(stop['fareHoliday']) - route['stops'] = stopIds - route['fares'] = fares[0:-1] - route['faresHoliday'] = faresHoliday[0:-1] + async def addRouteStop(route): + stops = await getRouteStop(route['id']) + stopIds = [] + fares = [] + faresHoliday = [] + for stop in stops: + if stop['stopId'] not in stopList: + stopList[stop['stopId']] = { + 'stop': stop['stopId'], + 'name_en': stop['stopName_e'], + 'name_tc': stop['stopName_c'], + 'lat': stop['latitude'], + 'long': stop['longitude'] + } + stopIds.append(stop['stopId']) + fares.append(stop['fare']) + faresHoliday.append(stop['fareHoliday']) + route['stops'] = stopIds + route['fares'] = fares[0:-1] + route['faresHoliday'] = faresHoliday[0:-1] - async def getRouteStopList (): - await asyncio.gather(*[addRouteStop(r) for r in routeList]) - logger.info("Digested stop list") - return routeList + async def getRouteStopList(): + await asyncio.gather(*[addRouteStop(r) for r in routeList]) + logger.info("Digested stop list") + return routeList - await getRouteStopList() + await getRouteStopList() - with open(ROUTE_LIST, 'w', encoding='UTF-8') as rf, open(STOP_LIST, 'w', encoding='UTF-8') as sf: - json.dump(routeList, rf, ensure_ascii=False) - json.dump(stopList, sf, ensure_ascii=False) - logger.info("Dumped lists") + with open(ROUTE_LIST, 'w', encoding='UTF-8') as rf, open(STOP_LIST, 'w', encoding='UTF-8') as sf: + json.dump(routeList, rf, ensure_ascii=False) + json.dump(stopList, sf, ensure_ascii=False) + logger.info("Dumped lists") -if __name__=='__main__': - logging.basicConfig(level=logging.INFO) - logging.getLogger('httpx').setLevel(logging.WARNING) - asyncio.run(getRouteStop('nlb')) \ No newline at end of file +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + logging.getLogger('httpx').setLevel(logging.WARNING) + asyncio.run(getRouteStop('nlb')) diff --git a/crawling/parseGtfs.py b/crawling/parseGtfs.py index 74c09aa0..d9cdbd62 100644 --- a/crawling/parseGtfs.py +++ b/crawling/parseGtfs.py @@ -10,9 +10,11 @@ import re from crawl_utils import emitRequest, store_version + def takeFirst(elem): return int(elem[0]) + async def parseGtfs(): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) if not path.isfile('gtfs.zip'): @@ -33,24 +35,30 @@ async def parseGtfs(): with open('gtfs/routes.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [route_id, agency_id, route_short_name, route_long_name, route_type, route_url] in reader: + for [ + route_id, + agency_id, + route_short_name, + route_long_name, + route_type, + route_url] in reader: routeList[route_id] = { - 'co': agency_id.replace('LWB', 'KMB').lower().split('+'), - 'route': route_short_name, - 'stops': {}, - 'fares': {}, - 'freq': {}, - 'orig': { - 'zh': route_long_name.split(' - ')[0], - 'en': '', - }, - 'dest': { - 'zh': route_long_name.split(' - 
')[1].replace(' (CIRCULAR)', ''), - 'en': '', - }, - 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None + 'co': agency_id.replace('LWB', 'KMB').lower().split('+'), + 'route': route_short_name, + 'stops': {}, + 'fares': {}, + 'freq': {}, + 'orig': { + 'zh': route_long_name.split(' - ')[0], + 'en': '', + }, + 'dest': { + 'zh': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', ''), + 'en': '', + }, + 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None } - + # parse timetable with open('gtfs/trips.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) @@ -69,13 +77,22 @@ async def parseGtfs(): headers = next(reader, None) for [trip_id, _start_time, end_time, headway_secs] in reader: [route_id, bound, calendar, start_time] = trip_id.split('-') - routeList[route_id]['freq'][bound][calendar][start_time] = (end_time[0:5].replace(':', ''), headway_secs) + routeList[route_id]['freq'][bound][calendar][start_time] = ( + end_time[0:5].replace(':', ''), headway_secs) # parse stop seq with open('gtfs/stop_times.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, timepoint] in reader: + for [ + trip_id, + arrival_time, + departure_time, + stop_id, + stop_sequence, + pickup_type, + drop_off_type, + timepoint] in reader: [route_id, bound, service_id, tmp] = trip_id.split('-') if bound not in routeList[route_id]['stops']: routeList[route_id]['stops'][bound] = {} @@ -85,62 +102,78 @@ async def parseGtfs(): with open('gtfs/fare_attributes.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [fare_id,price,currency_type,payment_method,transfers,agency_id] in reader: + for [ + fare_id, + price, + currency_type, + payment_method, + transfers, + agency_id] in reader: [route_id, bound, on, off] = fare_id.split('-') if bound not in routeList[route_id]['fares']: routeList[route_id]['fares'][bound] = {} - if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(off): - routeList[route_id]['fares'][bound][on] = ('0' if price == '0.0000' else price, int(off)) + if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int( + off): + routeList[route_id]['fares'][bound][on] = ( + '0' if price == '0.0000' else price, int(off)) - for route_id in routeList.keys(): + for route_id in routeList.keys(): for bound in routeList[route_id]['stops'].keys(): _tmp = list(routeList[route_id]['stops'][bound].items()) _tmp.sort(key=takeFirst) - routeList[route_id]['stops'][bound] = [v for k,v in _tmp] + routeList[route_id]['stops'][bound] = [v for k, v in _tmp] for bound in routeList[route_id]['fares'].keys(): _tmp = list(routeList[route_id]['fares'][bound].items()) _tmp.sort(key=takeFirst) - routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp] + routeList[route_id]['fares'][bound] = [v[0] for k, v in _tmp] + + nameReg = re.compile('\\[(.*)\\] (.*)') - nameReg = re.compile('\[(.*)\] (.*)') def parseStopName(name): ret = {} for str in name.split('|'): matches = nameReg.findall(str) - if len(matches) == 0: return { "unknown": str } + if len(matches) == 0: + return {"unknown": str} for co, gtfsName in matches: x, y = co.split('+'), gtfsName.split('/
') for i in range(len(x)): ret[x[i].lower().replace('lwb', 'kmb')] = y[i if i < len(y) else 0] return ret - with open('gtfs/stops.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,stop_timezone] in reader: + for [ + stop_id, + stop_name, + stop_lat, + stop_lon, + zone_id, + location_type, + stop_timezone] in reader: stopList[stop_id] = { - 'stopId': stop_id, - 'stopName': parseStopName(stop_name), - 'lat': float(stop_lat), - 'lng': float(stop_lon) + 'stopId': stop_id, + 'stopName': parseStopName(stop_name), + 'lat': float(stop_lat), + 'lng': float(stop_lon) } with open('gtfs/calendar.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) for line in reader: - [service_id,mon,tue,wed,thur,fri,sat,sun, start_date, end_date] = line + [service_id, mon, tue, wed, thur, fri, sat, sun, start_date, end_date] = line serviceDayMap[service_id] = [sun, mon, tue, wed, thur, fri, sat] with open('gtfs.json', 'w', encoding='UTF-8') as f: f.write(json.dumps({ - 'routeList': routeList, - 'stopList': stopList, - "serviceDayMap": serviceDayMap, + 'routeList': routeList, + 'stopList': stopList, + "serviceDayMap": serviceDayMap, }, ensure_ascii=False, indent=2)) -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) logging.getLogger('httpx').setLevel(logging.WARNING) logger = logging.getLogger(__name__) diff --git a/crawling/parseGtfsEn.py b/crawling/parseGtfsEn.py index 17b1b216..acb3e463 100644 --- a/crawling/parseGtfsEn.py +++ b/crawling/parseGtfsEn.py @@ -10,9 +10,11 @@ import re from crawl_utils import emitRequest, store_version + def takeFirst(elem): return int(elem[0]) + async def parseGtfs(): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) if not path.isfile('gtfs-en.zip'): @@ -33,24 +35,30 @@ async def parseGtfs(): with open('gtfs-en/routes.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [route_id, agency_id, route_short_name, route_long_name, route_type, route_url] in reader: + for [ + route_id, + agency_id, + route_short_name, + route_long_name, + route_type, + route_url] in reader: routeList[route_id] = { - 'co': agency_id.replace('LWB', 'KMB').lower().split('+'), - 'route': route_short_name if route_short_name != "" else route_id, - 'stops': {}, - 'fares': {}, - 'freq': {}, - 'orig': { - 'zh': '', - 'en': route_long_name.split(' - ')[0] - }, - 'dest': { - 'zh': '', - 'en': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', '') - }, - 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None + 'co': agency_id.replace('LWB', 'KMB').lower().split('+'), + 'route': route_short_name if route_short_name != "" else route_id, + 'stops': {}, + 'fares': {}, + 'freq': {}, + 'orig': { + 'zh': '', + 'en': route_long_name.split(' - ')[0] + }, + 'dest': { + 'zh': '', + 'en': route_long_name.split(' - ')[1].replace(' (CIRCULAR)', '') + }, + 'jt': routeJourneyTime[route_id]["journeyTime"] if route_id in routeJourneyTime else None } - + # parse timetable with open('gtfs-en/trips.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) @@ -69,13 +77,22 @@ async def parseGtfs(): headers = next(reader, None) for [trip_id, _start_time, end_time, headway_secs] in reader: [route_id, bound, calendar, start_time] = trip_id.split('-') - routeList[route_id]['freq'][bound][calendar][start_time] = 
(end_time[0:5].replace(':', ''), headway_secs) + routeList[route_id]['freq'][bound][calendar][start_time] = ( + end_time[0:5].replace(':', ''), headway_secs) # parse stop seq with open('gtfs-en/stop_times.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [trip_id, arrival_time, departure_time, stop_id, stop_sequence, pickup_type, drop_off_type, timepoint] in reader: + for [ + trip_id, + arrival_time, + departure_time, + stop_id, + stop_sequence, + pickup_type, + drop_off_type, + timepoint] in reader: [route_id, bound, service_id, tmp] = trip_id.split('-') if bound not in routeList[route_id]['stops']: routeList[route_id]['stops'][bound] = {} @@ -85,62 +102,78 @@ async def parseGtfs(): with open('gtfs-en/fare_attributes.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [fare_id,price,currency_type,payment_method,transfers,agency_id] in reader: + for [ + fare_id, + price, + currency_type, + payment_method, + transfers, + agency_id] in reader: [route_id, bound, on, off] = fare_id.split('-') if bound not in routeList[route_id]['fares']: routeList[route_id]['fares'][bound] = {} - if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int(off): - routeList[route_id]['fares'][bound][on] = ('0' if price == '0.0000' else price, int(off)) + if on not in routeList[route_id]['fares'][bound] or routeList[route_id]['fares'][bound][on][1] < int( + off): + routeList[route_id]['fares'][bound][on] = ( + '0' if price == '0.0000' else price, int(off)) - for route_id in routeList.keys(): + for route_id in routeList.keys(): for bound in routeList[route_id]['stops'].keys(): _tmp = list(routeList[route_id]['stops'][bound].items()) _tmp.sort(key=takeFirst) - routeList[route_id]['stops'][bound] = [v for k,v in _tmp] + routeList[route_id]['stops'][bound] = [v for k, v in _tmp] for bound in routeList[route_id]['fares'].keys(): _tmp = list(routeList[route_id]['fares'][bound].items()) _tmp.sort(key=takeFirst) - routeList[route_id]['fares'][bound] = [v[0] for k,v in _tmp] + routeList[route_id]['fares'][bound] = [v[0] for k, v in _tmp] + + nameReg = re.compile('\\[(.*)\\] (.*)') - nameReg = re.compile('\[(.*)\] (.*)') def parseStopName(name): ret = {} for str in name.split('|'): matches = nameReg.findall(str) - if len(matches) == 0: return { "unknown": str } + if len(matches) == 0: + return {"unknown": str} for co, gtfsName in matches: x, y = co.split('+'), gtfsName.split('/
') for i in range(len(x)): ret[x[i].lower().replace('lwb', 'kmb')] = y[i if i < len(y) else 0] return ret - with open('gtfs-en/stops.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) - for [stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,stop_timezone] in reader: + for [ + stop_id, + stop_name, + stop_lat, + stop_lon, + zone_id, + location_type, + stop_timezone] in reader: stopList[stop_id] = { - 'stopId': stop_id, - 'stopName': parseStopName(stop_name), - 'lat': float(stop_lat), - 'lng': float(stop_lon) + 'stopId': stop_id, + 'stopName': parseStopName(stop_name), + 'lat': float(stop_lat), + 'lng': float(stop_lon) } with open('gtfs-en/calendar.txt', 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile) headers = next(reader, None) for line in reader: - [service_id,mon,tue,wed,thur,fri,sat,sun, start_date, end_date] = line + [service_id, mon, tue, wed, thur, fri, sat, sun, start_date, end_date] = line serviceDayMap[service_id] = [sun, mon, tue, wed, thur, fri, sat] with open('gtfs-en.json', 'w', encoding='UTF-8') as f: f.write(json.dumps({ - 'routeList': routeList, - 'stopList': stopList, - "serviceDayMap": serviceDayMap, + 'routeList': routeList, + 'stopList': stopList, + "serviceDayMap": serviceDayMap, }, ensure_ascii=False, indent=2)) -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) logging.getLogger('httpx').setLevel(logging.WARNING) logger = logging.getLogger(__name__) diff --git a/crawling/parseHoliday.py b/crawling/parseHoliday.py index 3f753ca2..c02c62ca 100644 --- a/crawling/parseHoliday.py +++ b/crawling/parseHoliday.py @@ -7,6 +7,7 @@ logger = logging.getLogger(__name__) + async def main(): if not path.isfile('holiday.json'): async with httpx.AsyncClient() as a_client: @@ -19,6 +20,6 @@ async def main(): else: logger.info('holiday.json already exist, download skipped') -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/crawling/parseJourneyTime.py b/crawling/parseJourneyTime.py index c8651570..78248014 100644 --- a/crawling/parseJourneyTime.py +++ b/crawling/parseJourneyTime.py @@ -8,6 +8,7 @@ from crawl_utils import emitRequest, store_version from datetime import datetime + async def parseJourneyTime(): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) if not path.isfile('ROUTE_BUS.xml'): @@ -19,20 +20,20 @@ async def parseJourneyTime(): routeTimeList = {} tree = ET.parse('ROUTE_BUS.xml') root = tree.getroot() - version = datetime.fromisoformat(root.attrib["generated"]+"+08:00") + version = datetime.fromisoformat(root.attrib["generated"] + "+08:00") store_version('routes-fares-xml/ROUTE_BUS', version.isoformat()) for route in root.iter('ROUTE'): if route.find('ROUTE_TYPE').text == '1': routeTimeList[route.find('ROUTE_ID').text] = { - 'co': route.find('COMPANY_CODE').text.replace('LWB', 'KMB').lower().split('+'), - 'route': route.find('ROUTE_NAMEC').text, - 'journeyTime': route.find('JOURNEY_TIME').text, + 'co': route.find('COMPANY_CODE').text.replace('LWB', 'KMB').lower().split('+'), + 'route': route.find('ROUTE_NAMEC').text, + 'journeyTime': route.find('JOURNEY_TIME').text, } with open('routeTime.json', 'w', encoding='UTF-8') as f: f.write(json.dumps(routeTimeList, ensure_ascii=False)) -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) 
logging.getLogger('httpx').setLevel(logging.WARNING) logger = logging.getLogger(__name__) diff --git a/crawling/routeCompare.py b/crawling/routeCompare.py index fe13134f..9eedc7dd 100644 --- a/crawling/routeCompare.py +++ b/crawling/routeCompare.py @@ -12,6 +12,7 @@ from crawl_utils import emitRequest + async def routeCompare(): a_client = httpx.AsyncClient(timeout=httpx.Timeout(30.0, pool=None)) r = await emitRequest("https://data.hkbus.app/routeFareList.min.json", a_client) @@ -22,10 +23,13 @@ async def routeCompare(): os.makedirs("route-ts", exist_ok=True) def isRouteEqual(a, b): - return xxhash.xxh3_64(str(a)).hexdigest() == xxhash.xxh3_64(str(b)).hexdigest() + return xxhash.xxh3_64( + str(a)).hexdigest() == xxhash.xxh3_64( + str(b)).hexdigest() for newKey in newDb['routeList']: - if newKey not in oldDb['routeList'] or not isRouteEqual(oldDb['routeList'][newKey], newDb['routeList'][newKey]): + if newKey not in oldDb['routeList'] or not isRouteEqual( + oldDb['routeList'][newKey], newDb['routeList'][newKey]): filename = re.sub(r'[\\\/\:\*\?\"\<\>\|]', '', newKey).upper() with open(os.path.join("route-ts", filename), "w", encoding='utf-8') as f: f.write(str(int(time.time()))) @@ -36,8 +40,8 @@ def isRouteEqual(a, b): with open(os.path.join("route-ts", filename), "w", encoding='utf-8') as f: f.write(str(int(time.time()))) -if __name__=='__main__': +if __name__ == '__main__': logging.basicConfig(level=logging.INFO) logging.getLogger('httpx').setLevel(logging.WARNING) logger = logging.getLogger(__name__) - asyncio.run(routeCompare()) \ No newline at end of file + asyncio.run(routeCompare()) diff --git a/crawling/sunferry.py b/crawling/sunferry.py index b4749502..ec36cf34 100644 --- a/crawling/sunferry.py +++ b/crawling/sunferry.py @@ -11,22 +11,22 @@ gtfsStops = gtfs["stopList"] routes = { - "CECC": ["Central", "Cheung Chau"], - "CCCE": ["Cheung Chau", "Central"], - "CEMW": ["Central", "Mui Wo"], - "MWCE": ["Mui Wo", "Central"], - "NPHH": ["North Point", "Hung Hom"], - "HHNP": ["Hung Hom", "North Point"], - "NPKC": ["North Point", "Kowloon City"], - "KCNP": ["Kowloon City", "North Point"], - "IIPECMUW": ["Peng Chau", "Mui Wo"], - "IIMUWPEC": ["Mui Wo", "Peng Chau"], - "IIMUWCMW": ["Mui Wo", "Chi Ma Wan"], - "IICMWMUW": ["Chi Ma Wan", "Mui Wo"], - "IICMWCHC": ["Chi Ma Wan", "Cheung Chau"], - "IICHCCMW": ["Cheung Chau", "Chi Ma Wan"], - "IICHCMUW": ["Cheung Chau", "Mui Wo"], - "IIMUWCHC": ["Mui Wo", "Cheung Chau "], + "CECC": ["Central", "Cheung Chau"], + "CCCE": ["Cheung Chau", "Central"], + "CEMW": ["Central", "Mui Wo"], + "MWCE": ["Mui Wo", "Central"], + "NPHH": ["North Point", "Hung Hom"], + "HHNP": ["Hung Hom", "North Point"], + "NPKC": ["North Point", "Kowloon City"], + "KCNP": ["Kowloon City", "North Point"], + "IIPECMUW": ["Peng Chau", "Mui Wo"], + "IIMUWPEC": ["Mui Wo", "Peng Chau"], + "IIMUWCMW": ["Mui Wo", "Chi Ma Wan"], + "IICMWMUW": ["Chi Ma Wan", "Mui Wo"], + "IICMWCHC": ["Chi Ma Wan", "Cheung Chau"], + "IICHCCMW": ["Cheung Chau", "Chi Ma Wan"], + "IICHCMUW": ["Cheung Chau", "Mui Wo"], + "IIMUWCHC": ["Mui Wo", "Cheung Chau "], } routeList = [] @@ -35,47 +35,48 @@ for [route_code, [orig, dest]] in routes.items(): for route_id, gtfsRoute in gtfsRoutes.items(): if "ferry" in gtfsRoute["co"]: - if orig.lower() == gtfsRoute["orig"]["en"].lower() and dest.lower() == gtfsRoute["dest"]["en"].lower(): + if orig.lower() == gtfsRoute["orig"]["en"].lower( + ) and dest.lower() == gtfsRoute["dest"]["en"].lower(): routeList.append({ - "gtfsId": route_id, - "route": route_code, - "orig_tc": 
gtfsZh["routeList"][route_id]["orig"]["zh"], - "orig_en": gtfsRoute["orig"]["en"], - "dest_tc": gtfsZh["routeList"][route_id]["dest"]["zh"], - "dest_en": gtfsRoute["dest"]["en"], - "service_type": 1, - "bound": "O", - "stops": gtfsRoute["stops"]["1"], - "freq": gtfsRoute["freq"]["1"], + "gtfsId": route_id, + "route": route_code, + "orig_tc": gtfsZh["routeList"][route_id]["orig"]["zh"], + "orig_en": gtfsRoute["orig"]["en"], + "dest_tc": gtfsZh["routeList"][route_id]["dest"]["zh"], + "dest_en": gtfsRoute["dest"]["en"], + "service_type": 1, + "bound": "O", + "stops": gtfsRoute["stops"]["1"], + "freq": gtfsRoute["freq"]["1"], }) elif dest.lower() == gtfsRoute["orig"]["en"].lower() and orig.lower() == gtfsRoute["dest"]["en"].lower(): routeList.append({ - "gtfsId": route_id, - "route": route_code, - "dest_tc": gtfsZh["routeList"][route_id]["orig"]["zh"], - "dest_en": gtfsRoute["orig"]["en"], - "orig_tc": gtfsZh["routeList"][route_id]["dest"]["zh"], - "orig_en": gtfsRoute["dest"]["en"], - "service_type": 1, - "bound": "I", - "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1], - "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {}, + "gtfsId": route_id, + "route": route_code, + "dest_tc": gtfsZh["routeList"][route_id]["orig"]["zh"], + "dest_en": gtfsRoute["orig"]["en"], + "orig_tc": gtfsZh["routeList"][route_id]["dest"]["zh"], + "orig_en": gtfsRoute["dest"]["en"], + "service_type": 1, + "bound": "I", + "stops": gtfsRoute["stops"]["2"] if "2" in gtfsRoute["stops"] else gtfsRoute["stops"]["1"][::-1], + "freq": gtfsRoute["freq"]["2"] if "2" in gtfsRoute["freq"] else {}, }) for route in routeList: for stopId in route["stops"]: stopList[stopId] = { - "stop": stopId, - "name_en": gtfsStops[stopId]["stopName"]["unknown"], - "name_tc": gtfsZh["stopList"][stopId]["stopName"]["unknown"], - "lat": gtfsStops[stopId]["lat"], - "long": gtfsStops[stopId]["lng"], + "stop": stopId, + "name_en": gtfsStops[stopId]["stopName"]["unknown"], + "name_tc": gtfsZh["stopList"][stopId]["stopName"]["unknown"], + "lat": gtfsStops[stopId]["lat"], + "long": gtfsStops[stopId]["lng"], } -with open('routeList.sunferry.json', 'w', encoding='UTF-8' ) as f: +with open('routeList.sunferry.json', 'w', encoding='UTF-8') as f: f.write(json.dumps(routeList, ensure_ascii=False)) -with open('stopList.sunferry.json', 'w', encoding='UTF-8' ) as f: +with open('stopList.sunferry.json', 'w', encoding='UTF-8') as f: f.write(json.dumps(stopList, ensure_ascii=False)) diff --git a/crawling/test.py b/crawling/test.py index 862396a5..4f5732e3 100644 --- a/crawling/test.py +++ b/crawling/test.py @@ -1,7 +1,7 @@ import json import requests -with open( 'routeFareList.json' ) as f: +with open('routeFareList.json') as f: newDb = json.load(f) r = requests.get('https://hkbus.github.io/hk-bus-crawling/routeFareList.json') @@ -9,8 +9,8 @@ for newKey in newDb['routeList']: if newKey not in oldDb['routeList']: - print ('new '+newKey) + print('new ' + newKey) for oldKey in oldDb['routeList']: if oldKey not in newDb['routeList']: - print ('old '+oldKey) \ No newline at end of file + print('old ' + oldKey) diff --git a/hk_bus_eta/__init__.py b/hk_bus_eta/__init__.py index 16c3abce..4ad91b27 100644 --- a/hk_bus_eta/__init__.py +++ b/hk_bus_eta/__init__.py @@ -1 +1 @@ -from .eta import HKEta \ No newline at end of file +from .eta import HKEta diff --git a/hk_bus_eta/eta.py b/hk_bus_eta/eta.py index 284fb204..62e2a798 100644 --- a/hk_bus_eta/eta.py +++ b/hk_bus_eta/eta.py @@ -15,132 +15,147 @@ import re import 
hashlib + def get_platform_display(plat, lang): - number = int(plat) if isinstance(plat, str) else plat - if number < 0 or number > 20: - return ("Platform {}" if lang == "en" else "{}號月台").format(number) - if number == 0: - return "⓿" - if number > 10: - return chr(9451 + (number - 11)) - return chr(10102 + (number - 1)) + number = int(plat) if isinstance(plat, str) else plat + if number < 0 or number > 20: + return ("Platform {}" if lang == "en" else "{}號月台").format(number) + if number == 0: + return "⓿" + if number > 10: + return chr(9451 + (number - 11)) + return chr(10102 + (number - 1)) + class HKEta: holidays = None route_list = None stop_list = None stop_map = None - + def __init__(self): - md5 = requests.get("https://hkbus.github.io/hk-bus-crawling/routeFareList.md5").text - r = requests.get("https://hkbus.github.io/hk-bus-crawling/routeFareList.min.json") + md5 = requests.get( + "https://hkbus.github.io/hk-bus-crawling/routeFareList.md5").text + r = requests.get( + "https://hkbus.github.io/hk-bus-crawling/routeFareList.min.json") m = hashlib.md5() m.update(r.text.encode('utf-8')) if md5 != m.hexdigest(): raise Exception("Error in accessing hk-eta-db, md5sum not match") db = r.json() - self.holidays, self.route_list, self.stop_list, self.stop_map = db["holidays"], db["routeList"], db["stopList"], db["stopMap"] - + self.holidays, self.route_list, self.stop_list, self.stop_map = db[ + "holidays"], db["routeList"], db["stopList"], db["stopMap"] # 0-indexed seq - def getEtas( self, route_id, seq, language ): - routeEntry = self.route_list[route_id] + def getEtas(self, route_id, seq, language): + routeEntry = self.route_list[route_id] route, stops, bound = routeEntry['route'], routeEntry['stops'], routeEntry['bound'] - dest, service_type, co, nlb_id, gtfs_id = routeEntry['dest'], routeEntry['serviceType'], routeEntry['co'], routeEntry["nlbId"], routeEntry['gtfsId'] + dest, service_type, co, nlb_id, gtfs_id = routeEntry['dest'], routeEntry[ + 'serviceType'], routeEntry['co'], routeEntry["nlbId"], routeEntry['gtfsId'] _etas = [] for company_id in co: if company_id == "kmb" and "kmb" in stops: _etas.extend(self.kmb( - route=route, - stop_id=stops["kmb"][seq], - bound=bound["kmb"], - seq=seq, co = co, - service_type = service_type + route=route, + stop_id=stops["kmb"][seq], + bound=bound["kmb"], + seq=seq, co=co, + service_type=service_type )) elif company_id == "ctb" and "ctb" in stops: _etas.extend(self.ctb( - stop_id=stops['ctb'][seq], route=route, bound=bound['ctb'], seq=seq + stop_id=stops['ctb'][seq], route=route, bound=bound['ctb'], seq=seq )) elif company_id == "nlb" and "nlb" in stops: _etas.extend(self.nlb( - stop_id=stops['nlb'][seq], nlb_id=nlb_id + stop_id=stops['nlb'][seq], nlb_id=nlb_id )) elif company_id == "lrtfeeder" and "lrtfeeder" in stops: _etas.extend(self.lrtfeeder( - stop_id=stops['lrtfeeder'][seq], route=route, language=language + stop_id=stops['lrtfeeder'][seq], route=route, language=language )) elif company_id == "mtr" and "mtr" in stops: _etas.extend(self.mtr( - stop_id=stops['mtr'][seq], route=route, bound=bound["mtr"] + stop_id=stops['mtr'][seq], route=route, bound=bound["mtr"] )) elif company_id == "lightRail" and "lightRail" in stops: _etas.extend(self.lightrail( - stop_id=stops['lightRail'][seq], route=route, dest=dest + stop_id=stops['lightRail'][seq], route=route, dest=dest )) elif company_id == "gmb" and "gmb" in stops: - _etas.extend(self.gmb( - stop_id=stops["gmb"][seq], gtfs_id=gtfs_id, seq=seq, bound=bound["gmb"] - )) + _etas.extend( + self.gmb( + 
stop_id=stops["gmb"][seq], + gtfs_id=gtfs_id, + seq=seq, + bound=bound["gmb"])) return _etas - - def kmb(self, stop_id, route, seq, service_type, co, bound ): - data = requests.get("https://data.etabus.gov.hk/v1/transport/kmb/eta/{}/{}/{}".format(stop_id, route, service_type)).json()['data'] + + def kmb(self, stop_id, route, seq, service_type, co, bound): + data = requests.get( + "https://data.etabus.gov.hk/v1/transport/kmb/eta/{}/{}/{}".format( + stop_id, route, service_type)).json()['data'] data = list(filter(lambda e: 'eta' in e and e['dir'] == bound, data)) data.sort(key=lambda e: abs(seq - e['seq'])) data = [e for e in data if e['seq'] == data[0]['seq']] - data = list(filter(lambda e: len(co) > 1 or service_type == e['service_type'] or e['seq'] == seq + 1,data)) + data = list(filter(lambda e: len(co) > 1 or service_type == + e['service_type'] or e['seq'] == seq + 1, data)) return [{ - "eta": e['eta'], - "remark": { - "zh": e['rmk_tc'], - "en": e['rmk_en'] - }, - "co": "kmb" + "eta": e['eta'], + "remark": { + "zh": e['rmk_tc'], + "en": e['rmk_en'] + }, + "co": "kmb" } for e in data] - + def ctb(self, stop_id, route, bound, seq): - data = requests.get("https://rt.data.gov.hk/v2/transport/citybus/eta/CTB/{}/{}".format(stop_id, route)).json()['data'] + data = requests.get( + "https://rt.data.gov.hk/v2/transport/citybus/eta/CTB/{}/{}".format( + stop_id, route)).json()['data'] data = list(filter(lambda e: 'eta' in e and e['dir'] in bound, data)) data.sort(key=lambda e: abs(seq - e['seq'])) data = [e for e in data if e['seq'] == data[0]['seq']] return [{ - "eta": e['eta'], - "remark": { - "zh": e['rmk_tc'], - "en": e['rmk_en'] - }, - "co": "ctb" + "eta": e['eta'], + "remark": { + "zh": e['rmk_tc'], + "en": e['rmk_en'] + }, + "co": "ctb" }for e in data] def nlb(self, stop_id, nlb_id): try: - data = requests.post("https://rt.data.gov.hk/v1/transport/nlb/stop.php?action=estimatedArrivals", json={ - "routeId": nlb_id, - "stopId": stop_id, - "language": "zh" - }, headers={ - "Content-Type": "text/plain" - }).json()["estimatedArrivals"] + data = requests.post( + "https://rt.data.gov.hk/v1/transport/nlb/stop.php?action=estimatedArrivals", + json={ + "routeId": nlb_id, + "stopId": stop_id, + "language": "zh"}, + headers={ + "Content-Type": "text/plain"}).json()["estimatedArrivals"] data = list(filter(lambda e: 'estimatedArrivalTime' in e, data)) return [{ - "eta": e['estimatedArrivalTime'].replace(' ', 'T') + ".000+08:00", - "remark": { - "zh": "", - "en": "" - }, - "co": "nlb" + "eta": e['estimatedArrivalTime'].replace(' ', 'T') + ".000+08:00", + "remark": { + "zh": "", + "en": "" + }, + "co": "nlb" } for e in data] except Exception as e: return [] - + def lrtfeeder(self, stop_id, route, language): - data = requests.post("https://rt.data.gov.hk/v1/transport/mtr/bus/getSchedule", json={ - "language": language, - "routeName": route - }, headers={ - "Content-Type": "application/json" - }).json()['busStop'] + data = requests.post( + "https://rt.data.gov.hk/v1/transport/mtr/bus/getSchedule", + json={ + "language": language, + "routeName": route}, + headers={ + "Content-Type": "application/json"}).json()['busStop'] data = list(filter(lambda e: e["busStopId"] == stop_id, data)) ret = [] for buses in data: @@ -150,75 +165,88 @@ def lrtfeeder(self, stop_id, route, language): remark = bus["busRemark"] elif bus["isScheduled"] == 1: remark = "Scheduled" if language == "en" else "預定班次" - delta_second = int(bus["departureTimeInSecond"] if bus['arrivalTimeInSecond'] == "108000" else bus["arrivalTimeInSecond"]) - 
dt = datetime.fromtimestamp(time.time() + delta_second + 8 * 3600 ) - + delta_second = int(bus["departureTimeInSecond"] if bus['arrivalTimeInSecond'] + == "108000" else bus["arrivalTimeInSecond"]) + dt = datetime.fromtimestamp(time.time() + delta_second + 8 * 3600) + ret.append({ - "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"), - "remark": { - language: remark - }, - "co": "lrtfeeder" + "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"), + "remark": { + language: remark + }, + "co": "lrtfeeder" }) return ret - + def mtr(self, stop_id, route, bound): - res = requests.get("https://rt.data.gov.hk/v1/transport/mtr/getSchedule.php?line={}&sta={}".format(route, stop_id)).json() + res = requests.get( + "https://rt.data.gov.hk/v1/transport/mtr/getSchedule.php?line={}&sta={}".format( + route, stop_id)).json() data, status = res["data"], res["status"] - + if status == 0: return [] ret = [] - for e in data["{}-{}".format(route, stop_id)]["UP" if bound[-2:1] == "U" else "DOWN"]: + for e in data["{}-{}".format(route, stop_id) + ]["UP" if bound[-2:1] == "U" else "DOWN"]: ret.append({ - "eta": e["time"].replace(" ", "T") + "+08:00", - "remark": { - "zh": get_platform_display(e["plat"], "zh"), - "en": get_platform_display(e["plat"], "en") - }, - "co": "mtr" + "eta": e["time"].replace(" ", "T") + "+08:00", + "remark": { + "zh": get_platform_display(e["plat"], "zh"), + "en": get_platform_display(e["plat"], "en") + }, + "co": "mtr" }) return ret - + def lightrail(self, stop_id, route, dest): - platform_list = requests.get("https://rt.data.gov.hk/v1/transport/mtr/lrt/getSchedule?station_id={}".format(stop_id[2:])).json()["platform_list"] + platform_list = requests.get( + "https://rt.data.gov.hk/v1/transport/mtr/lrt/getSchedule?station_id={}".format(stop_id[2:])).json()["platform_list"] ret = [] for platform in platform_list: route_list, platform_id = platform["route_list"], platform["platform_id"] for e in route_list: - route_no, dest_ch, dest_en, stop, time_en = e["route_no"], e["dest_ch"], e["dest_en"], e["stop"], e["time_en"] - if route == route_no and ( dest_ch == dest["zh"] or "Circular" in dest_en ) and stop == 0: + route_no, dest_ch, dest_en, stop, time_en = e["route_no"], e[ + "dest_ch"], e["dest_en"], e["stop"], e["time_en"] + if route == route_no and ( + dest_ch == dest["zh"] or "Circular" in dest_en) and stop == 0: waitTime = 0 if time_en.lower() == "arriving" or time_en.lower() == "departing" or time_en == "-": waitTime = 0 else: waitTime = int(re.search(r'\d+', time_en).group()) - dt = datetime.fromtimestamp(time.time() + waitTime + 8 * 3600 ) + dt = datetime.fromtimestamp(time.time() + waitTime + 8 * 3600) ret.append({ - "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"), - "remark": { - "zh": get_platform_display(platform_id, "zh"), - "en": get_platform_display(platform_id, "en") - }, - "co": "lightrail" + "eta": dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+08:00"), + "remark": { + "zh": get_platform_display(platform_id, "zh"), + "en": get_platform_display(platform_id, "en") + }, + "co": "lightrail" }) return ret def gmb(self, gtfs_id, stop_id, bound, seq): - data = requests.get("https://data.etagmb.gov.hk/eta/route-stop/{}/{}".format(gtfs_id, stop_id)).json()["data"] - data = list(filter(lambda e: (e['route_seq'] == 1 and bound == "O") or (e['route_seq'] == 2 and bound == "I"), data)) + data = requests.get( + "https://data.etagmb.gov.hk/eta/route-stop/{}/{}".format(gtfs_id, stop_id)).json()["data"] + data = list( + 
filter( + lambda e: ( + e['route_seq'] == 1 and bound == "O") or ( + e['route_seq'] == 2 and bound == "I"), + data)) data = list(filter(lambda e: e["stop_seq"] == seq + 1, data)) ret = [] for e in data: etas = e["eta"] for eta in etas: ret.append({ - "eta": eta["timestamp"], - "remark": { - "zh": eta["remarks_tc"], - "en": eta["remarks_en"], - }, - "co": "gmb" + "eta": eta["timestamp"], + "remark": { + "zh": eta["remarks_tc"], + "en": eta["remarks_en"], + }, + "co": "gmb" }) return ret @@ -226,4 +254,4 @@ def gmb(self, gtfs_id, stop_id, bound, seq): if __name__ == "__main__": hketa = HKEta() route_ids = list(hketa.route_list.keys()) - print(route_ids[0:10]) \ No newline at end of file + print(route_ids[0:10]) diff --git a/setup.py b/setup.py index 7fd697de..df12890f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ here = os.path.abspath(os.path.dirname(__file__)) with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh: - long_description = "\n" + fh.read() + long_description = "\n" + fh.read() VERSION = '2.1.5' DESCRIPTION = 'Query the ETA (Estimated Time of Arrival) of HK Bus/Minibus/MTR/Lightrail' @@ -21,7 +21,17 @@ long_description=long_description, packages=find_packages(), install_requires=['requests'], - keywords=['python', 'hongkong', 'eta', 'estimated time of arrival', 'kmb', 'nlb', 'mtr', 'ctb', 'minibus', 'lightrail'], + keywords=[ + 'python', + 'hongkong', + 'eta', + 'estimated time of arrival', + 'kmb', + 'nlb', + 'mtr', + 'ctb', + 'minibus', + 'lightrail'], classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", @@ -30,5 +40,4 @@ "Operating System :: Unix", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", - ] -) \ No newline at end of file + ]) diff --git a/tools/normalize_json.py b/tools/normalize_json.py index 634889bc..6bae7879 100644 --- a/tools/normalize_json.py +++ b/tools/normalize_json.py @@ -3,18 +3,18 @@ def main(route_fare_list_json: str): - """ - Simple tool to normalize the routeFareList.json for easier comparison. The normalized JSON will be written to the same directory with `.norm` added. - """ - normalized_json_name = f"{route_fare_list_json}.norm" - with open(route_fare_list_json) as f: - route_fare_list = json.load(f) - route_fare_list['holidays'] = sorted(route_fare_list['holidays']) + """ + Simple tool to normalize the routeFareList.json for easier comparison. The normalized JSON will be written to the same directory with `.norm` added. + """ + normalized_json_name = f"{route_fare_list_json}.norm" + with open(route_fare_list_json) as f: + route_fare_list = json.load(f) + route_fare_list['holidays'] = sorted(route_fare_list['holidays']) - with open(normalized_json_name, 'w') as f: - json.dump(route_fare_list, f, sort_keys=True, - indent=4, ensure_ascii=False) + with open(normalized_json_name, 'w') as f: + json.dump(route_fare_list, f, sort_keys=True, + indent=4, ensure_ascii=False) if __name__ == '__main__': - typer.run(main) + typer.run(main)
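
# A minimal usage sketch for the HKEta class from hk_bus_eta/eta.py above.
# Illustrative only: it relies solely on names visible in this diff (HKEta,
# route_list, getEtas), picks an arbitrary route key, and needs the hosted
# routeFareList.min.json plus the live government ETA endpoints to be
# reachable at run time.

from hk_bus_eta import HKEta

hketa = HKEta()  # downloads routeFareList.min.json and verifies its md5
route_id = list(hketa.route_list.keys())[0]  # any route key works here
etas = hketa.getEtas(route_id=route_id, seq=0, language="en")  # seq is 0-indexed
print(etas)  # list of {"eta": ..., "remark": {...}, "co": ...} dicts, possibly empty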