diff --git a/README.md b/README.md index 7f0772c..7d33734 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ In this repository, I go through some of the ways you can parse and process the location translation / altnames / alternative names / location name in a foreign language database from Geonames.org. +> Please note that this database is in active development. + The alternames database contains geoname_id, language, and the name of the location in that language. ![CSV file](image-1.png) @@ -49,6 +51,14 @@ Getting the altname using the included JSON file: {'pt': 'Oslo', 'en': 'Oslo', 'ko': '오슬로', 'ru': 'Осло', 'ja': 'オスロ', 'ar': 'أوسلو', 'es': 'Oslo', 'zh': '奥斯陆', 'de': 'Oslo', 'fr': 'Oslo'} ``` +Aside from the ISO language code, the dataset also contains the `preferred` (preferred or official name) and `shortname` (shortname for the location) values. + +Notes / Subjective Decisions: + +> This is the beginning of a public database project. So, your opinion matters. Please open an issue to provide feedback. + +- Note that rows flagged as `preferred` whose ISO language is `en` have been removed, because when no `preferred` value is present, the fallback value should default to English. +- Where multiple entries are made for the same language, we have chosen the first entry. There is obviously a better solution for this. The included notebook and databases can be used with a variety of IPinfo IP databases that include location information. 
diff --git a/geoname_alt_names.zip b/geoname_alt_names.zip index 208e379..1c404f0 100644 Binary files a/geoname_alt_names.zip and b/geoname_alt_names.zip differ diff --git a/processing_geonames_altnames.ipynb b/processing_geonames_altnames.ipynb index 61b83fc..4b5c8f8 100644 --- a/processing_geonames_altnames.ipynb +++ b/processing_geonames_altnames.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -23,20 +23,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "alternate_names_columns = ['alternateNameId', 'geonameid', 'isolanguage', 'alternate name', 'isPreferredName', 'isShortName', 'isColloquial', 'isHistoric', 'from', 'to']\n", "alternate_names_dtypes = {'alternateNameId': int, 'geonameid': int, 'isolanguage': str, 'alternate name': str, 'isPreferredName':str, 'isShortName':str, 'isColloquial':str, 'isHistoric':str, 'from':str, 'to': str}\n", - "filename = 'alternateNamesV2.txt'\n", + "filename = './alternateNamesV2.txt'\n", "alname_df = pd.read_csv(filename, sep='\\t', header=None, names=alternate_names_columns, dtype=alternate_names_dtypes)\n", "alname_df.fillna('', inplace=True)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -158,7 +158,7 @@ "4 " ] }, - "execution_count": 13, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -178,7 +178,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Step 1: Preparsing\n", + "Preparsing\n", "- ISO language code are 2 letters (Major filter)\n", "- We need to remove where ISO language values are empty\n", "- We need to remove where Geoname ID values are empty\n", @@ -187,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -211,81 +211,291 @@ " \n", " \n", " \n", + " alternateNameId\n", 
" geonameid\n", " isolanguage\n", - " alt_name\n", + " alternate name\n", + " isPreferredName\n", + " isShortName\n", + " isColloquial\n", + " isHistoric\n", + " from\n", + " to\n", " \n", " \n", " \n", " \n", " 0\n", + " 2181137\n", " 3038886\n", " ca\n", " Pic de Tristaina\n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " 1\n", + " 13898832\n", " 3038899\n", " ca\n", " Tossalet i Vinyals\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " 2\n", + " 13898833\n", " 3038914\n", " ca\n", " Canal de la Tosa\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " 3\n", + " 13898834\n", " 3038995\n", " ca\n", " Bosc del Solobre\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " 4\n", + " 13898836\n", " 3039037\n", " ca\n", " Solà d’Engordany\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "\n", "" ], "text/plain": [ - " geonameid isolanguage alt_name\n", - "0 3038886 ca Pic de Tristaina\n", - "1 3038899 ca Tossalet i Vinyals\n", - "2 3038914 ca Canal de la Tosa\n", - "3 3038995 ca Bosc del Solobre\n", - "4 3039037 ca Solà d’Engordany" + " alternateNameId geonameid isolanguage alternate name isPreferredName \\\n", + "0 2181137 3038886 ca Pic de Tristaina 1 \n", + "1 13898832 3038899 ca Tossalet i Vinyals \n", + "2 13898833 3038914 ca Canal de la Tosa \n", + "3 13898834 3038995 ca Bosc del Solobre \n", + "4 13898836 3039037 ca Solà d’Engordany \n", + "\n", + " isShortName isColloquial isHistoric from to \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " ] }, - "execution_count": 26, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "alname_df_all_values = db.sql(\"\"\"SELECT geonameid, isolanguage, \"alternate name\" as alt_name\n", + "alname_df_all_values_imported = db.sql(\"\"\"SELECT *\n", "FROM alname_df\n", "WHERE geonameid is not null\n", "AND isolanguage !=''\n", - "AND alt_name !='' \n", + "AND \"alternate name\" 
!='' \n", "AND LENGTH(isolanguage) <= 2\n", "\"\"\").df()\n", - "alname_df_all_values.head()" + "alname_df_all_values_imported.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bringing shortname and preferred name in the isolanguage column." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter rows where isPreferredName equals 1 (except for all values `en`)\n", + "preferred_row = alname_df_all_values_imported[(alname_df_all_values_imported['isPreferredName'] == \"1\") & (alname_df_all_values_imported['isolanguage'] != \"en\")].copy()\n", + "\n", + "# Update the isolanguage column to 'preferred'\n", + "preferred_row['isolanguage'] = 'preferred'\n", + "\n", + "# Filter rows where isShortName equals 1\n", + "shortname_row = alname_df_all_values_imported[alname_df_all_values_imported['isShortName'] == \"1\"].copy()\n", + "\n", + "# Update the isolanguage column to 'shortname'\n", + "shortname_row['isolanguage'] = 'shortname'\n", + "\n", + "# Concatenate the original DataFrame with the preferred row DataFrame\n", + "alname_df_all_values_imported_pref_shortname = pd.concat([alname_df_all_values_imported, preferred_row, shortname_row], ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geonameidisolanguagealt_name
03038886caPic de Tristaina
13038899caTossalet i Vinyals
23038914caCanal de la Tosa
33038995caBosc del Solobre
43039037caSolà d’Engordany
............
75513097729890shortnameNorth America
75513108354411shortnameSoviet Union
75513118354411shortnameСССР
75513128354411shortnameUSSR
75513138378500shortnameBelgica
\n", + "

7551314 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " geonameid isolanguage alt_name\n", + "0 3038886 ca Pic de Tristaina\n", + "1 3038899 ca Tossalet i Vinyals\n", + "2 3038914 ca Canal de la Tosa\n", + "3 3038995 ca Bosc del Solobre\n", + "4 3039037 ca Solà d’Engordany\n", + "... ... ... ...\n", + "7551309 7729890 shortname North America\n", + "7551310 8354411 shortname Soviet Union\n", + "7551311 8354411 shortname СССР\n", + "7551312 8354411 shortname USSR\n", + "7551313 8378500 shortname Belgica\n", + "\n", + "[7551314 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alname_df_all_values = db.sql(\"\"\"SELECT geonameid, isolanguage, \"alternate name\" as alt_name\n", + "FROM alname_df_all_values_imported_pref_shortname\n", + "\"\"\").df()\n", + "alname_df_all_values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Optional Step 1: Taking a look at the top 20 listed languages in the database (Disclaimer in step 3)" + "Optional Step: Taking a look at the top languages in the database (Disclaimer in step 3)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -366,54 +576,79 @@ " \n", " \n", " 10\n", + " preferred\n", + " 188012\n", + " \n", + " \n", + " 11\n", " fr\n", " 152656\n", " \n", " \n", - " 11\n", + " 12\n", " fi\n", " 139919\n", " \n", " \n", - " 12\n", + " 13\n", " ko\n", " 139003\n", " \n", " \n", - " 13\n", + " 14\n", " uk\n", " 126704\n", " \n", " \n", - " 14\n", + " 15\n", " pt\n", " 95756\n", " \n", " \n", - " 15\n", + " 16\n", " de\n", " 87443\n", " \n", " \n", - " 16\n", + " 17\n", + " shortname\n", + " 87329\n", + " \n", + " \n", + " 18\n", " hy\n", " 77986\n", " \n", " \n", - " 17\n", + " 19\n", " sr\n", " 64587\n", " \n", " \n", - " 18\n", + " 20\n", " el\n", " 56444\n", " \n", " \n", - " 19\n", + " 21\n", " mk\n", " 56342\n", " \n", + " \n", + " 22\n", + " tr\n", + " 54982\n", + " \n", + " \n", + " 23\n", + " 
kk\n", + " 52791\n", + " \n", + " \n", + " 24\n", + " it\n", + " 51753\n", + " \n", " \n", "\n", "" @@ -430,19 +665,24 @@ "7 ar 304521\n", "8 th 265978\n", "9 ja 196185\n", - "10 fr 152656\n", - "11 fi 139919\n", - "12 ko 139003\n", - "13 uk 126704\n", - "14 pt 95756\n", - "15 de 87443\n", - "16 hy 77986\n", - "17 sr 64587\n", - "18 el 56444\n", - "19 mk 56342" + "10 preferred 188012\n", + "11 fr 152656\n", + "12 fi 139919\n", + "13 ko 139003\n", + "14 uk 126704\n", + "15 pt 95756\n", + "16 de 87443\n", + "17 shortname 87329\n", + "18 hy 77986\n", + "19 sr 64587\n", + "20 el 56444\n", + "21 mk 56342\n", + "22 tr 54982\n", + "23 kk 52791\n", + "24 it 51753" ] }, - "execution_count": 27, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -453,7 +693,7 @@ "FROM alname_df_all_values\n", "GROUP BY isolanguage\n", "ORDER BY lng_count DESC\n", - "LIMIT 20\n", + "LIMIT 25\n", "''').df()" ] }, @@ -461,12 +701,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Step 2: Only picking out 10 languages" + "Only picking out 10 languages + shortname + preferred" ] }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -498,33 +738,33 @@ " \n", " \n", " 0\n", - " 8210545\n", - " en\n", - " Regans Lagoon\n", + " 9089920\n", + " zh\n", + " 拿司\n", " \n", " \n", " 1\n", - " 8210546\n", - " en\n", - " Regents Lake\n", + " 9089921\n", + " zh\n", + " 素地屋\n", " \n", " \n", " 2\n", - " 8210547\n", - " en\n", - " Round Lake\n", + " 9089923\n", + " zh\n", + " 日阿洞\n", " \n", " \n", " 3\n", - " 8210548\n", - " en\n", - " Lake Ricketson\n", + " 9089925\n", + " zh\n", + " 白扭洽色\n", " \n", " \n", " 4\n", - " 8210549\n", - " en\n", - " Ritchies Lagoon\n", + " 9089927\n", + " zh\n", + " 强波隔勒\n", " \n", " \n", " ...\n", @@ -533,58 +773,58 @@ " ...\n", " \n", " \n", - " 3815569\n", - " 7668943\n", - " en\n", - " Tiger Cave Temple\n", + " 4090910\n", + " 30521\n", + " ar\n", + " Āl ‘Īsá\n", " \n", " \n", - " 
3815570\n", - " 7730329\n", - " en\n", - " Khok Kathiam Airport\n", + " 4090911\n", + " 30521\n", + " ar\n", + " ال عيسى\n", " \n", " \n", - " 3815571\n", - " 7730329\n", - " en\n", - " Koke Kathiem Airport\n", + " 4090912\n", + " 30524\n", + " ar\n", + " الضحاكي\n", " \n", " \n", - " 3815572\n", - " 7778911\n", - " en\n", - " James Bond Island\n", + " 4090913\n", + " 30527\n", + " ar\n", + " Ḩijlān\n", " \n", " \n", - " 3815573\n", - " 7793027\n", - " en\n", - " Doi Tao Lake\n", + " 4090914\n", + " 30530\n", + " ar\n", + " الرومية\n", " \n", " \n", "\n", - "

3815574 rows × 3 columns

\n", + "

4090915 rows × 3 columns

\n", "" ], "text/plain": [ - " geonameid isolanguage alt_name\n", - "0 8210545 en Regans Lagoon\n", - "1 8210546 en Regents Lake\n", - "2 8210547 en Round Lake\n", - "3 8210548 en Lake Ricketson\n", - "4 8210549 en Ritchies Lagoon\n", - "... ... ... ...\n", - "3815569 7668943 en Tiger Cave Temple\n", - "3815570 7730329 en Khok Kathiam Airport\n", - "3815571 7730329 en Koke Kathiem Airport\n", - "3815572 7778911 en James Bond Island\n", - "3815573 7793027 en Doi Tao Lake\n", + " geonameid isolanguage alt_name\n", + "0 9089920 zh 拿司\n", + "1 9089921 zh 素地屋\n", + "2 9089923 zh 日阿洞\n", + "3 9089925 zh 白扭洽色\n", + "4 9089927 zh 强波隔勒\n", + "... ... ... ...\n", + "4090910 30521 ar Āl ‘Īsá\n", + "4090911 30521 ar ال عيسى\n", + "4090912 30524 ar الضحاكي\n", + "4090913 30527 ar Ḩijlān\n", + "4090914 30530 ar الرومية\n", "\n", - "[3815574 rows x 3 columns]" + "[4090915 rows x 3 columns]" ] }, - "execution_count": 114, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -594,7 +834,7 @@ "alname_df_all_values_selected = db.sql('''\n", "SELECT *\n", "FROM alname_df_all_values\n", - "WHERE isolanguage in ('zh', 'en', 'es', 'fr', 'ja', 'pt', 'ru', 'de', 'ar', 'ko')\n", + "WHERE isolanguage in ('zh', 'en', 'es', 'fr', 'ja', 'pt', 'ru', 'de', 'ar', 'ko', 'shortname', 'preferred')\n", "''').df()\n", "alname_df_all_values_selected" ] @@ -603,13 +843,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Step 3: Taking the first alt name by each language\n", + "Taking the first alt name by each language\n", "- Geonames have multiple entries for the same language and same location. There could be dozens of alternative names for the same location within the same language." 
] }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -641,33 +881,33 @@ " \n", " \n", " 0\n", - " 650\n", - " ru\n", - " Бастак\n", + " 1727\n", + " ar\n", + " معوبرة\n", " \n", " \n", " 1\n", - " 1816\n", + " 1766\n", " ar\n", - " حبات نزال\n", + " خان خيام الوليد\n", " \n", " \n", " 2\n", - " 2106\n", + " 1807\n", " ar\n", - " مطار رشت الدولي\n", + " خشام\n", " \n", " \n", " 3\n", - " 2254\n", + " 1813\n", " ar\n", - " وادي غدير الحمل\n", + " عبود الجدعان\n", " \n", " \n", " 4\n", - " 2287\n", + " 2028\n", " ar\n", - " الفياضية\n", + " درة بیداد العلیا\n", " \n", " \n", " ...\n", @@ -676,58 +916,58 @@ " ...\n", " \n", " \n", - " 3424813\n", - " 12746130\n", - " ko\n", - " 인천광역시역사자료관\n", + " 3652220\n", + " 12740426\n", + " ru\n", + " Bol’shoy Naergen\n", " \n", " \n", - " 3424814\n", - " 12746165\n", - " ko\n", - " 수산물품관리원인천지원\n", + " 3652221\n", + " 12746331\n", + " preferred\n", + " 桶屋町通り\n", " \n", " \n", - " 3424815\n", - " 12746177\n", - " ko\n", - " 송학동삼가\n", + " 3652222\n", + " 12746428\n", + " zh\n", + " 鄂城区\n", " \n", " \n", - " 3424816\n", - " 12746201\n", - " ko\n", - " 한국근대문학관\n", + " 3652223\n", + " 12746467\n", + " en\n", + " Masjid Mubarak\n", " \n", " \n", - " 3424817\n", - " 12746246\n", - " ja\n", - " しょうふくじちょくしもん\n", + " 3652224\n", + " 12746811\n", + " ar\n", + " ذا لاين\n", " \n", " \n", "\n", - "

3424818 rows × 3 columns

\n", + "

3652225 rows × 3 columns

\n", "" ], "text/plain": [ - " geonameid isolanguage alt_name\n", - "0 650 ru Бастак\n", - "1 1816 ar حبات نزال\n", - "2 2106 ar مطار رشت الدولي\n", - "3 2254 ar وادي غدير الحمل\n", - "4 2287 ar الفياضية\n", - "... ... ... ...\n", - "3424813 12746130 ko 인천광역시역사자료관\n", - "3424814 12746165 ko 수산물품관리원인천지원\n", - "3424815 12746177 ko 송학동삼가\n", - "3424816 12746201 ko 한국근대문학관\n", - "3424817 12746246 ja しょうふくじちょくしもん\n", + " geonameid isolanguage alt_name\n", + "0 1727 ar معوبرة\n", + "1 1766 ar خان خيام الوليد\n", + "2 1807 ar خشام\n", + "3 1813 ar عبود الجدعان\n", + "4 2028 ar درة بیداد العلیا\n", + "... ... ... ...\n", + "3652220 12740426 ru Bol’shoy Naergen\n", + "3652221 12746331 preferred 桶屋町通り\n", + "3652222 12746428 zh 鄂城区\n", + "3652223 12746467 en Masjid Mubarak\n", + "3652224 12746811 ar ذا لاين\n", "\n", - "[3424818 rows x 3 columns]" + "[3652225 rows x 3 columns]" ] }, - "execution_count": 115, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -749,13 +989,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Step 4: Choose English as one of the mandatory listed languages.\n", + "Choose English as one of the mandatory listed languages.\n", "- If English is not listed for a particular geoname entry, we will not select it." 
] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -787,33 +1027,33 @@ " \n", " \n", " 0\n", - " 11592844\n", - " en\n", - " Pybus\n", + " 12181875\n", + " ru\n", + " Станция Новгород-Северский\n", " \n", " \n", " 1\n", - " 11593903\n", + " 12182202\n", " en\n", - " Southwestern Washington Experiment Station\n", + " Kamkusa Msiska\n", " \n", " \n", " 2\n", - " 11594363\n", + " 12182211\n", " en\n", - " Piana Rocks Lighthouse\n", + " Mwafulirwa Mwambelo\n", " \n", " \n", " 3\n", - " 11608472\n", + " 12182323\n", " en\n", - " Santana do Livramento\n", + " Junju\n", " \n", " \n", " 4\n", - " 11608475\n", + " 12182457\n", " en\n", - " San Pedro Pochutla Airport\n", + " Kanyenda Masanja\n", " \n", " \n", " ...\n", @@ -822,58 +1062,58 @@ " ...\n", " \n", " \n", - " 944962\n", - " 8556013\n", - " en\n", - " Windarling Mine\n", + " 1046753\n", + " 6578827\n", + " zh\n", + " 第三號隧道\n", " \n", " \n", - " 944963\n", - " 8562582\n", - " en\n", - " Takaiwa Tunnel\n", + " 1046754\n", + " 6586550\n", + " zh\n", + " Ti-shih-hao Ch’iao\n", " \n", " \n", - " 944964\n", - " 8436037\n", - " es\n", - " Estación de Ronda\n", + " 1046755\n", + " 6589562\n", + " zh\n", + " 省立臺南醫院\n", " \n", " \n", - " 944965\n", - " 8538664\n", + " 1046756\n", + " 6607371\n", " en\n", - " Mount Kanigan\n", + " Pushkino\n", " \n", " \n", - " 944966\n", - " 8507553\n", - " zh\n", - " 黄花农场十一队\n", + " 1046757\n", + " 6461416\n", + " preferred\n", + " Козма презвитер\n", " \n", " \n", "\n", - "

944967 rows × 3 columns

\n", + "

1046758 rows × 3 columns

\n", "" ], "text/plain": [ - " geonameid isolanguage alt_name\n", - "0 11592844 en Pybus\n", - "1 11593903 en Southwestern Washington Experiment Station\n", - "2 11594363 en Piana Rocks Lighthouse\n", - "3 11608472 en Santana do Livramento\n", - "4 11608475 en San Pedro Pochutla Airport\n", - "... ... ... ...\n", - "944962 8556013 en Windarling Mine\n", - "944963 8562582 en Takaiwa Tunnel\n", - "944964 8436037 es Estación de Ronda\n", - "944965 8538664 en Mount Kanigan\n", - "944966 8507553 zh 黄花农场十一队\n", + " geonameid isolanguage alt_name\n", + "0 12181875 ru Станция Новгород-Северский\n", + "1 12182202 en Kamkusa Msiska\n", + "2 12182211 en Mwafulirwa Mwambelo\n", + "3 12182323 en Junju\n", + "4 12182457 en Kanyenda Masanja\n", + "... ... ... ...\n", + "1046753 6578827 zh 第三號隧道\n", + "1046754 6586550 zh Ti-shih-hao Ch’iao\n", + "1046755 6589562 zh 省立臺南醫院\n", + "1046756 6607371 en Pushkino\n", + "1046757 6461416 preferred Козма презвитер\n", "\n", - "[944967 rows x 3 columns]" + "[1046758 rows x 3 columns]" ] }, - "execution_count": 116, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -895,12 +1135,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Optional Step 2: Test how many languages are there for each geoname entry" + "Optional Step: Test how many languages are there for each geoname entry" ] }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -931,73 +1171,85 @@ " \n", " \n", " 0\n", - " 10\n", - " 4150\n", + " 12\n", + " 857\n", " \n", " \n", " 1\n", - " 9\n", - " 1650\n", + " 11\n", + " 1511\n", " \n", " \n", " 2\n", - " 8\n", - " 4334\n", + " 10\n", + " 2407\n", " \n", " \n", " 3\n", - " 7\n", - " 2545\n", + " 9\n", + " 4122\n", " \n", " \n", " 4\n", - " 6\n", - " 2666\n", + " 8\n", + " 2246\n", " \n", " \n", " 5\n", - " 5\n", - " 3839\n", + " 7\n", + " 2372\n", " \n", " \n", " 6\n", - " 4\n", - " 6798\n", + " 6\n", + " 2990\n", " \n", " \n", " 7\n", - " 
3\n", - " 12548\n", + " 5\n", + " 4238\n", " \n", " \n", " 8\n", - " 2\n", - " 91567\n", + " 4\n", + " 18787\n", " \n", " \n", " 9\n", + " 3\n", + " 55159\n", + " \n", + " \n", + " 10\n", + " 2\n", + " 55981\n", + " \n", + " \n", + " 11\n", " 1\n", - " 552969\n", + " 532396\n", " \n", " \n", "\n", "" ], "text/plain": [ - " count_lang lang_group_count\n", - "0 10 4150\n", - "1 9 1650\n", - "2 8 4334\n", - "3 7 2545\n", - "4 6 2666\n", - "5 5 3839\n", - "6 4 6798\n", - "7 3 12548\n", - "8 2 91567\n", - "9 1 552969" + " count_lang lang_group_count\n", + "0 12 857\n", + "1 11 1511\n", + "2 10 2407\n", + "3 9 4122\n", + "4 8 2246\n", + "5 7 2372\n", + "6 6 2990\n", + "7 5 4238\n", + "8 4 18787\n", + "9 3 55159\n", + "10 2 55981\n", + "11 1 532396" ] }, - "execution_count": 155, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1020,14 +1272,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Optional Step 3: Minimum number of required languages\n", + "Optional Step: Minimum number of required languages\n", "- If you want a tabular output select the max languges in the input language array (which is 10).\n", "- Here as an example we can select geonames with more than 5 alt names" ] }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1059,33 +1311,33 @@ " \n", " \n", " 0\n", - " 895949\n", - " es\n", - " Zambia\n", + " 4231976\n", + " shortname\n", + " Worth\n", " \n", " \n", " 1\n", - " 896140\n", - " pt\n", - " Província Ocidental\n", + " 4234477\n", + " ar\n", + " بروكبورت\n", " \n", " \n", " 2\n", - " 899274\n", - " fr\n", - " Samfya\n", + " 4235954\n", + " ar\n", + " مقاطعة كلارك\n", " \n", " \n", " 3\n", - " 912764\n", - " ja\n", - " カサマ\n", + " 4237730\n", + " fr\n", + " Comté d'Effingham\n", " \n", " \n", " 4\n", - " 921780\n", - " de\n", - " Mohéli\n", + " 4247802\n", + " shortname\n", + " Randolph\n", " \n", " \n", " ...\n", @@ -1094,58 +1346,58 @@ " ...\n", " \n", 
" \n", - " 144023\n", - " 97990\n", - " fr\n", - " Bakouba\n", + " 402395\n", + " 1835329\n", + " preferred\n", + " 대구광역시\n", " \n", " \n", - " 144024\n", - " 119505\n", + " 402396\n", + " 1839118\n", " en\n", - " Qazvin\n", + " Biyangdo Island\n", " \n", " \n", - " 144025\n", - " 119505\n", - " ko\n", - " 카즈빈\n", + " 402397\n", + " 1855274\n", + " ja\n", + " 西蒲原郡\n", " \n", " \n", - " 144026\n", - " 124763\n", - " es\n", - " Markazi\n", + " 402398\n", + " 1503219\n", + " en\n", + " Kizhi-Aksy\n", " \n", " \n", - " 144027\n", - " 128231\n", + " 402399\n", + " 2023427\n", " ru\n", - " Керман\n", + " Исток\n", " \n", " \n", "\n", - "

144028 rows × 3 columns

\n", + "

402400 rows × 3 columns

\n", "" ], "text/plain": [ - " geonameid isolanguage alt_name\n", - "0 895949 es Zambia\n", - "1 896140 pt Província Ocidental\n", - "2 899274 fr Samfya\n", - "3 912764 ja カサマ\n", - "4 921780 de Mohéli\n", - "... ... ... ...\n", - "144023 97990 fr Bakouba\n", - "144024 119505 en Qazvin\n", - "144025 119505 ko 카즈빈\n", - "144026 124763 es Markazi\n", - "144027 128231 ru Керман\n", + " geonameid isolanguage alt_name\n", + "0 4231976 shortname Worth\n", + "1 4234477 ar بروكبورت\n", + "2 4235954 ar مقاطعة كلارك\n", + "3 4237730 fr Comté d'Effingham\n", + "4 4247802 shortname Randolph\n", + "... ... ... ...\n", + "402395 1835329 preferred 대구광역시\n", + "402396 1839118 en Biyangdo Island\n", + "402397 1855274 ja 西蒲原郡\n", + "402398 1503219 en Kizhi-Aksy\n", + "402399 2023427 ru Исток\n", "\n", - "[144028 rows x 3 columns]" + "[402400 rows x 3 columns]" ] }, - "execution_count": 158, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1160,7 +1412,7 @@ "\tGROUP BY geonameid\n", ") t2\n", "ON t1.geonameid = t2.geonameid\n", - "WHERE t2.count_lang > 4\n", + "WHERE t2.count_lang > 2\n", "''').df()" ] }, @@ -1168,15 +1420,123 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Optional Step 4: JSONification of the language and alt_names\n", + "Optional Step: JSONification of the language and alt_names\n", "- The dictionary represented in a JSON like string" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geonameidiso_language_alt_names
014{\"en\": \"Takht Arreh Do\", \"ar\": \"تخت أرة دو\"}
1254{\"en\": \"Pol-e Bala Rud\", \"ar\": \"جسر بالارود\"}
2301{\"fr\": \"Shahr-e ziba\", \"en\": \"Shahr-e Ziba\"}
3470{\"fr\": \"place Azadi\", \"pt\": \"Praça Azadi\", \"zh...
4566{\"de\": \"Ghar-e-Roodafshan\", \"fr\": \"Ghar-e-Rood...
.........
68306112746714{\"en\": \"Bosiljevo interchange\"}
68306212746729{\"en\": \"Ringwood Magistrates Court\"}
68306312746744{\"en\": \"Appleton Dock\"}
68306412746795{\"ja\": \"葛城地蔵尊\", \"en\": \"Katsuragi Kṣitigarbha\",...
68306512746811{\"de\": \"The Line\", \"en\": \"The Line\", \"ru\": \"Ли...
\n", + "

683066 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " geonameid iso_language_alt_names\n", + "0 14 {\"en\": \"Takht Arreh Do\", \"ar\": \"تخت أرة دو\"}\n", + "1 254 {\"en\": \"Pol-e Bala Rud\", \"ar\": \"جسر بالارود\"}\n", + "2 301 {\"fr\": \"Shahr-e ziba\", \"en\": \"Shahr-e Ziba\"}\n", + "3 470 {\"fr\": \"place Azadi\", \"pt\": \"Praça Azadi\", \"zh...\n", + "4 566 {\"de\": \"Ghar-e-Roodafshan\", \"fr\": \"Ghar-e-Rood...\n", + "... ... ...\n", + "683061 12746714 {\"en\": \"Bosiljevo interchange\"}\n", + "683062 12746729 {\"en\": \"Ringwood Magistrates Court\"}\n", + "683063 12746744 {\"en\": \"Appleton Dock\"}\n", + "683064 12746795 {\"ja\": \"葛城地蔵尊\", \"en\": \"Katsuragi Kṣitigarbha\",...\n", + "683065 12746811 {\"de\": \"The Line\", \"en\": \"The Line\", \"ru\": \"Ли...\n", + "\n", + "[683066 rows x 2 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "alname_df_all_values_selected_only_first_loc_en_filtered_jsonified = alname_df_all_values_selected_only_first_loc_en_filtered.groupby('geonameid').apply(lambda x: json.dumps(dict(zip(x['isolanguage'], x['alt_name'])), ensure_ascii=False)).reset_index()\n", "alname_df_all_values_selected_only_first_loc_en_filtered_jsonified.columns = ['geonameid', 'iso_language_alt_names']\n", @@ -1187,20 +1547,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Step 6: Converting the dataset in to Python Dictionary for JSON output" + "Converting the dataset in to Python Dictionary for JSON output" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df = alname_df_all_values_selected_only_first_loc_en_filtered.groupby('geonameid').apply(lambda x: dict(zip(x['isolanguage'], x['alt_name']))).reset_index()\n", "alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df.columns = ['geonameid', 'iso_language_alt_names']\n", 
"\n", - "alname_df_all_values_selected_only_first_loc_en_filtered_dict = alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df.set_index('geonameid')['iso_language_alt_names'].to_dict()\n", - "alname_df_all_values_selected_only_first_loc_en_filtered_dict" + "alname_df_all_values_selected_only_first_loc_en_filtered_dict = alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df.set_index('geonameid')['iso_language_alt_names'].to_dict()" ] }, { @@ -1219,25 +1578,26 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | geonameid | isolanguage | alt_name |\n", - "|-------:|------------:|:--------------|:-----------|\n", - "| 63100 | 1269750 | en | India |\n", - "| 63101 | 1269750 | ko | 인도 |\n", - "| 177456 | 1269750 | ja | インド |\n", - "| 465842 | 1269750 | pt | Índia |\n", - "| 562186 | 1269750 | es | India |\n", - "| 601777 | 1269750 | zh | 印度 |\n", - "| 721371 | 1269750 | ar | الهند |\n", - "| 810115 | 1269750 | fr | Inde |\n", - "| 839386 | 1269750 | ru | Индия |\n", - "| 922130 | 1269750 | de | Indien |\n" + "| | geonameid | isolanguage | alt_name |\n", + "|-------:|------------:|:--------------|:------------|\n", + "| 62447 | 1269750 | fr | Inde |\n", + "| 111100 | 1269750 | ar | الهند |\n", + "| 245786 | 1269750 | ru | Индия |\n", + "| 278989 | 1269750 | es | Unión India |\n", + "| 314823 | 1269750 | zh | 印度 |\n", + "| 444853 | 1269750 | preferred | Ινδία |\n", + "| 581391 | 1269750 | ja | インド |\n", + "| 614529 | 1269750 | en | India |\n", + "| 659641 | 1269750 | ko | 인도 |\n", + "| 817618 | 1269750 | de | Indien |\n", + "| 993566 | 1269750 | pt | Índia |\n" ] } ], @@ -1254,7 +1614,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1264,11 +1624,11 @@ "│ geonameid │ iso_language_alt_names │\n", "│ int32 │ varchar │\n", 
"├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────┤\n", - "│ 1269750 │ {\"en\": \"India\", \"ko\": \"인도\", \"ja\": \"インド\", \"pt\": \"Índia\", \"es\": \"India\", \"zh\": \"印度\", \"ar\": \"الهند… │\n", + "│ 1269750 │ {\"fr\": \"Inde\", \"ar\": \"الهند\", \"ru\": \"Индия\", \"es\": \"Unión India\", \"zh\": \"印度\", \"preferred\": \"Ινδία\", … │\n", "└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────┘" ] }, - "execution_count": 122, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1290,25 +1650,26 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'en': 'India',\n", - " 'ko': '인도',\n", - " 'ja': 'インド',\n", - " 'pt': 'Índia',\n", - " 'es': 'India',\n", - " 'zh': '印度',\n", + "{'fr': 'Inde',\n", " 'ar': 'الهند',\n", - " 'fr': 'Inde',\n", " 'ru': 'Индия',\n", - " 'de': 'Indien'}" + " 'es': 'Unión India',\n", + " 'zh': '印度',\n", + " 'preferred': 'Ινδία',\n", + " 'ja': 'インド',\n", + " 'en': 'India',\n", + " 'ko': '인도',\n", + " 'de': 'Indien',\n", + " 'pt': 'Índia'}" ] }, - "execution_count": 150, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1328,7 +1689,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1337,7 +1698,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [