From 3ff764a7753e27623dc47a7f7e9bca3f70e61ccb Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Fri, 9 Dec 2022 18:18:24 +0530 Subject: [PATCH 01/11] Add fuzzy matching to standardized districts --- scraper/Gemfile | 2 + scraper/Gemfile.lock | 2 + scraper/scripts/districts.json | 625 +++++++++++++++++++++++++++++++++ scraper/scripts/methods.rb | 66 +++- 4 files changed, 693 insertions(+), 2 deletions(-) create mode 100644 scraper/scripts/districts.json diff --git a/scraper/Gemfile b/scraper/Gemfile index a4816b39..eb0aaca4 100644 --- a/scraper/Gemfile +++ b/scraper/Gemfile @@ -2,3 +2,5 @@ source "https://rubygems.org" gem 'nokogiri' gem 'httparty' + +gem "fuzzy_match", "~> 2.1" diff --git a/scraper/Gemfile.lock b/scraper/Gemfile.lock index fb656089..861a2a00 100644 --- a/scraper/Gemfile.lock +++ b/scraper/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + fuzzy_match (2.1.0) httparty (0.20.0) mime-types (~> 3.0) multi_xml (>= 0.5.2) @@ -18,6 +19,7 @@ PLATFORMS ruby DEPENDENCIES + fuzzy_match (~> 2.1) httparty nokogiri diff --git a/scraper/scripts/districts.json b/scraper/scripts/districts.json new file mode 100644 index 00000000..201c050b --- /dev/null +++ b/scraper/scripts/districts.json @@ -0,0 +1,625 @@ +{ + "districts": [ + "SOUTH ANDAMAN", + "PRAKASAM", + "KURNOOL", + "SRIKAKULAM", + "EAST GODAVARI", + "GUNTUR", + "ANANTAPUR", + "SRI POTTI SRIRAMULU NELLORE", + "Y.S.R.", + "WEST GODAVARI", + "VIZIANAGARAM", + "CHITTOOR", + "KRISHNA", + "VISAKHAPATNAM", + "WEST SIANG", + "DIBANG VALLEY", + "EAST SIANG", + "WEST KAMENG", + "CHANGLANG", + "UPPER SUBANSIRI", + "TIRAP", + "PAPUM PARE", + "ANJAW", + "KURUNG KUMEY", + "UPPER SIANG", + "LOHIT", + "LOWER SUBANSIRI", + "LOWER DIBANG VALLEY", + "EAST KAMENG", + "TAWANG", + "BONGAIGAON", + "SIVASAGAR", + "KARIMGANJ", + "GOLAGHAT", + "BARPETA", + "CHIRANG", + "LAKHIMPUR", + "DHUBRI", + "SONITPUR", + "KARBI ANGLONG", + "DIBRUGARH", + "TINSUKIA", + "DHEMAJI", + "NAGAON", + "GOALPARA", + "KOKRAJHAR", + "KAMRUP METROPOLITAN", + "DIMA HASAO", + "HAILAKANDI", + "JORHAT", + "DARRANG", + "CACHAR", + "MORIGAON", + "NALBARI", + "BAKSA", + "KAMRUP", + "UDALGURI", + "BANKA", + "ARARIA", + "PURBA CHAMPARAN", + "BHOJPUR", + "ARWAL", + "AURANGABAD", + "PASHCHIM CHAMPARAN", + "KISHANGANJ", + "SITAMARHI", + "BEGUSARAI", + "PATNA", + "PURNIA", + "LAKHISARAI", + "GOPALGANJ", + "SHEIKHPURA", + "DARBHANGA", + "KAIMUR (BHABUA)", + "BHAGALPUR", + "NALANDA", + "ROHTAS", + "SUPAUL", + "GAYA", + "BUXAR", + "SARAN", + "SAMASTIPUR", + "MADHUBANI", + "KHAGARIA", + "VAISHALI", + "NAWADA", + "MUNGER", + "JAMUI", + "JEHANABAD", + "MUZAFFARPUR", + "KATIHAR", + "MADHEPURA", + "SIWAN", + "SAHARSA", + "SHEOHAR", + "CHANDIGARH", + "DHAMTARI", + "RAIPUR", + "JANJGIR - CHAMPA", + "DURG", + "RAJNANDGAON", + "SURGUJA", + "UTTAR BASTAR KANKER", + "DAKSHIN BASTAR DANTEWADA", + "MAHASAMUND", + "JASHPUR", + "KORIYA", + "RAIGARH", + "BASTAR", + "BIJAPUR", + "BILASPUR", + "KABEERDHAM", + "KORBA", + "NARAYANPUR", + "D & N HAVELI", + "DAMAN", + "DIU", + "NORTH GOA", + "SOUTH GOA", + "AHMADABAD", + "BHARUCH", + "AMRELI", + "ANAND", + "KACHCHH", + "KHEDA", + "JUNAGADH", + "SURAT", + "SABAR KANTHA", + "BANAS KANTHA", + "JAMNAGAR", + "BHAVNAGAR", + "RAJKOT", + "NAVSARI", + "PATAN", + "PORBANDAR", + "VADODARA", + "SURENDRANAGAR", + "GANDHINAGAR", + "DOHAD", + "VALSAD", + "PANCH MAHALS", + "MAHESANA", + "NARMADA", + "THE DANGS", + "TAPI", + "AMBALA", + "KARNAL", + "MAHENDRAGARH", + "JHAJJAR", + "HISAR", + "REWARI", + "BHIWANI", + "FATEHABAD", + "KAITHAL", + "SIRSA", + "FARIDABAD", + "GURGAON", + "MEWAT", + "SONIPAT", + "PALWAL", + "JIND", + "ROHTAK", + "KURUKSHETRA", + "PANCHKULA", + "PANIPAT", + "YAMUNANAGAR", + "SOLAN", + "KANGRA", + "KULLU", + "HAMIRPUR", + "CHAMBA", + "SHIMLA", + "UNA", + "MANDI", + "SIRMAUR", + "ANANTNAG", + "JAMMU", + "PULWAMA", + "BADGAM", + "BANDIPORE", + "RAMBAN", + "BARAMULA", + "SAMBA", + "KATHUA", + "DODA", + "UDHAMPUR", + "KULGAM", + "GANDERBAL", + "KUPWARA", + "RAJOURI", + "KARGIL", + "REASI", + "KISHTWAR", + "LEH(LADAKH)", + "PUNCH", + "SHUPIYAN", + "SRINAGAR", + "PURBI SINGHBHUM", + "CHATRA", + "GIRIDIH", + "SAHIBGANJ", + "DUMKA", + "PALAMU", + "RANCHI", + "PASHCHIMI SINGHBHUM", + "BOKARO", + "DHANBAD", + "RAMGARH", + "DEOGHAR", + "KODARMA", + "GARHWA", + "GODDA", + "GUMLA", + "HAZARIBAGH", + "JAMTARA", + "KHUNTI", + "SARAIKELA-KHARSAWAN", + "LATEHAR", + "LOHARDAGA", + "PAKUR", + "SIMDEGA", + "GULBARGA", + "BELGAUM", + "DHARWAD", + "HASSAN", + "BAGALKOT", + "BANGALORE", + "UTTARA KANNADA", + "BIDAR", + "CHIKKABALLAPURA", + "RAICHUR", + "KOLAR", + "HAVERI", + "MYSORE", + "DAKSHINA KANNADA", + "BELLARY", + "SHIMOGA", + "YADGIR", + "RAMANAGARA", + "CHIKMAGALUR", + "CHITRADURGA", + "CHAMARAJANAGAR", + "DAVANAGERE", + "TUMKUR", + "BANGALORE RURAL", + "GADAG", + "KOPPAL", + "UDUPI", + "MANDYA", + "KODAGU", + "PATHANAMTHITTA", + "ALAPPUZHA", + "ERNAKULAM", + "KANNUR", + "THIRUVANANTHAPURAM", + "THRISSUR", + "KOTTAYAM", + "PALAKKAD", + "KOZHIKODE", + "WAYANAD", + "KASARAGOD", + "KOLLAM", + "IDUKKI", + "MALAPPURAM", + "SHAJAPUR", + "PANNA", + "BHIND", + "ALIRAJPUR", + "RATLAM", + "ANUPPUR", + "SATNA", + "CHHINDWARA", + "MORENA", + "BETUL", + "BARWANI", + "GWALIOR", + "GUNA", + "ASHOKNAGAR", + "SEHORE", + "NEEMUCH", + "HOSHANGABAD", + "CHHATARPUR", + "TIKAMGARH", + "SHIVPURI", + "RAISEN", + "UJJAIN", + "DHAR", + "SHEOPUR", + "DATIA", + "DEWAS", + "BALAGHAT", + "REWA", + "MANDLA", + "SAGAR", + "JABALPUR", + "SEONI", + "KATNI", + "KHARGONE (WEST NIMAR)", + "VIDISHA", + "SHAHDOL", + "BHOPAL", + "INDORE", + "MANDSAUR", + "RAJGARH", + "BURHANPUR", + "UMARIA", + "KHANDWA (EAST NIMAR)", + "NARSIMHAPUR", + "SIDHI", + "DAMOH", + "DINDORI", + "HARDA", + "JHABUA", + "SINGRAULI", + "WARDHA", + "AMRAVATI", + "GADCHIROLI", + "AHMADNAGAR", + "LATUR", + "SOLAPUR", + "AKOLA", + "PUNE", + "JALGAON", + "JALNA", + "THANE", + "BID", + "NANDED", + "GONDIYA", + "BULDANA", + "SANGLI", + "HINGOLI", + "CHANDRAPUR", + "YAVATMAL", + "NASHIK", + "BHANDARA", + "NAGPUR", + "OSMANABAD", + "RATNAGIRI", + "SATARA", + "SINDHUDURG", + "NANDURBAR", + "DHULE", + "KOLHAPUR", + "PARBHANI", + "MUMBAI", + "WASHIM", + "IMPHAL EAST", + "BISHNUPUR", + "THOUBAL", + "IMPHAL WEST", + "CHANDEL", + "SOUTH GARO HILLS", + "JAINTIA HILLS", + "WEST KHASI HILLS", + "RIBHOI", + "EAST GARO HILLS", + "EAST KHASI HILLS", + "WEST GARO HILLS", + "AIZAWL", + "KOLASIB", + "CHAMPHAI", + "LUNGLEI", + "LAWNGTLAI", + "MAMIT", + "SERCHHIP", + "SAIHA", + "MON", + "WOKHA", + "MOKOKCHUNG", + "PHEK", + "DIMAPUR", + "PEREN", + "KIPHIRE", + "KOHIMA", + "TUENSANG", + "LONGLENG", + "ZUNHEBOTO", + "SOUTH WEST", + "EAST", + "NEW DELHI", + "NORTH", + "SOUTH", + "KENDUJHAR", + "ANUGUL", + "GANJAM", + "CUTTACK", + "BARGARH", + "BALANGIR", + "BALESHWAR", + "KANDHAMAL", + "MALKANGIRI", + "KHORDHA", + "MAYURBHANJ", + "BHADRAK", + "BAUDH", + "JHARSUGUDA", + "KALAHANDI", + "DHENKANAL", + "SUBARNAPUR", + "SUNDARGARH", + "SAMBALPUR", + "JAJAPUR", + "NAYAGARH", + "DEBAGARH", + "RAYAGADA", + "JAGATSINGHAPUR", + "KORAPUT", + "GAJAPATI", + "KENDRAPARA", + "NUAPADA", + "PURI", + "NABARANGAPUR", + "KARAIKAL", + "PUDUCHERRY", + "MAHE", + "YANAM", + "FIROZPUR", + "JALANDHAR", + "SANGRUR", + "AMRISTAR", + "FATHEGARH SAHIB", + "RUPNAGAR", + "MOGA", + "SHAHID BHAGAT SINGH NAGAR", + "TARN TARAN", + "SAHIBZADA AJIT SINGH NAGAR", + "MANSA", + "MUKSTAR", + "BARNALA", + "GURDASPUR", + "BATHINDA", + "KAPURTHALA", + "PATIALA", + "HOSHIARPUR", + "LUDHIANA", + "FARIDKOT", + "SIROHI", + "AJMER", + "JHALAWAR", + "ALWAR", + "RAJSAMAND", + "BARAN", + "GANGANAGAR", + "BHILWARA", + "JHUNJHUNUN", + "JAIPUR", + "PALI", + "BARMER", + "DAUSA", + "BANSWARA", + "DHAULPUR", + "CHITTAURGARH", + "BHARATPUR", + "HANUMANGARH", + "UDAIPUR", + "JALOR", + "CHURU", + "BIKANER", + "JODHPUR", + "BUNDI", + "PRATAPGARH", + "NAGAUR", + "TONK", + "DUNGARPUR", + "SIKAR", + "SAWAI MADHOPUR", + "KARAULI", + "KOTA", + "JAISALMER", + "EAST DISTRICT", + "WEST DISTRICT", + "SOUTH DISTRICT", + "NORTH DISTRICT", + "MADURAI", + "RAMANATHAPURAM", + "TIRUNELVELI", + "KANCHEEPURAM", + "THE NILGIRIS", + "THANJAVUR", + "DINDIGUL", + "KANNIYAKUMARI", + "NAMAKKAL", + "VELLORE", + "PUDUKKOTTAI", + "COIMBATORE", + "THOOTHUKKUDI", + "ERODE", + "VILUPPURAM", + "THENI", + "CUDDALORE", + "TIRUVANNAMALAI", + "THIRUVALLUR", + "SALEM", + "KARUR", + "ARIYALUR", + "PERAMBALUR", + "VIRUDHUNAGAR", + "TIRUPPUR", + "DHARMAPURI", + "TIRUCHIRAPPALLI", + "KRISHNAGIRI", + "CHENNAI", + "SIVAGANGA", + "NAGAPATTINAM", + "THIRUVARUR", + "MAHBUBNAGAR", + "ADILABAD", + "MEDAK", + "NIZAMABAD", + "RANGAREDDY", + "NALGONDA", + "WARANGAL", + "HYDERABAD", + "KARIMNAGAR", + "KHAMMAM", + "WEST TRIPURA", + "SOUTH TRIPURA", + "DHALAI", + "NORTH TRIPURA", + "WEST TRIPURA\nWEST TRIPURA", + "AURAIYA", + "AGRA", + "MAU", + "BIJNOR", + "BAGHPAT", + "MORADABAD", + "MIRZAPUR", + "MUZZAFFARNAGAR", + "KAUSHAMBI", + "KANPUR DEHAT", + "AMBEDKAR NAGAR", + "ETAH", + "ALIGARH", + "ALLAHABAD", + "SHAHJAHANPUR", + "BUDAUN", + "KANSHIRAM NAGAR", + "SAHARANPUR", + "SULTANPUR", + "LUCKNOW", + "AMROHA", + "MAHARAJGANJ", + "BULANDSHAHR", + "BAREILLY", + "BANDA", + "AZAMGARH", + "UNNAO", + "FAIZABAD", + "GHAZIABAD", + "RAE BARELI", + "JAUNPUR", + "GHAZIPUR", + "BAHRAICH", + "MEERUT", + "FATEHPUR", + "MATHURA", + "ETAWAH", + "BALLIA", + "BALRAMPUR", + "BASTI", + "BARA BANKI", + "GORAKHPUR", + "SIDDHARTHNAGAR", + "JHANSI", + "DEORIA", + "PILIBHIT", + "LAKHIMPUR KHERI", + "HARDOI", + "MAINPURI", + "BHADOHI", + "SHRAWASTI", + "GAUTAM BUDDHA NAGAR", + "RAMPUR", + "KANPUR NAGAR", + "SITAPUR", + "CHANDAULI", + "MAHOBA", + "KANNAUJ", + "CHITRAKOOT", + "SONBHADRA", + "GONDA", + "FIROZABAD", + "FARRUKHABAD", + "VARANASI", + "SANT KABIR NAGAR", + "MAHAMAYA NAGAR", + "KUSHINAGAR", + "JALAUN", + "LALITPUR", + "ALMORA", + "RUDRAPRAYAG", + "CHAMOLI", + "BAGESHWAR", + "UDHAM SINGH NAGAR", + "CHAMPAWAT", + "UTTARKASHI", + "PITHORAGARH", + "HARDWAR", + "NAINITAL", + "TEHRI GARHWAL", + "DEHRADUN", + "GARHWAL", + "JALPAIGURI", + "HUGLI", + "BARDDHAMAN", + "NORTH TWENTY FOUR PARGANAS", + "DAKSHIN DINAJPUR", + "BANKURA", + "SOUTH TWENTY FOUR PARGANAS", + "MURSHIDABAD", + "NADIA", + "BIRBHUM", + "PASCHIM MEDINIPUR", + "PURBA MEDINIPUR", + "UTTAR DINAJPUR", + "DARJILING", + "KOCH BIHAR", + "MALDAH", + "HAORA", + "PURULIYA", + "KOLKATA" + ] +} \ No newline at end of file diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index c69bc652..0d5354a9 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -5,6 +5,7 @@ require 'fileutils' require 'nokogiri' require 'open-uri' +require 'fuzzy_match' require './methods_nach' require './utils' require './iso3166' @@ -182,9 +183,48 @@ def parse_contact(std_code, phone) end end +def get_unmatched_district(district, row, matcher) + + if district.nil? + return district + end + + # Here the CITY2 field contains the district which can be matched + if /\(?\s*U\s*\.?\s*P\s*\.?\s*\)?/.match?(district) || /\(?\s*U\s*\.?\s*T\s*\.?\s*\)?/.match?(district) || /\(?\s*M\s*\.?\s*P\s*\.?\s*\)?/.match?(district) + return matcher.find(sanitize(row['CITY2'])) + elsif district === "KGF" + return "KOLAR" + elsif district === "M.P.K.V." + return "AHMADNAGAR" + elsif district === "PCMC" + return "PUNE" + elsif district === "GMC" + return "SRINAGAR" + elsif district === "110027" + return "NEW DELHI" + elsif district === "612 103" + return "THANJAVUR" + elsif district === "273005" + return "GORAKHPUR" + elsif district === "2 M" + return "GANGANAGAR" + elsif row["IFSC"] === "PUNB0667000" + return "HAMIRPUR" + elsif row["IFSC"] === "PUNB0667000" + return "SHIMLA" + else + return district + end +end + def parse_csv(files, banks, additional_attributes = {}) data = {} + districts = JSON.parse(File.read('districts.json')) + matcher = FuzzyMatch.new(districts["districts"]) + + district_map = Hash.new() + files.each do |file| row_index = 0 headings = [] @@ -260,7 +300,29 @@ def parse_csv(files, banks, additional_attributes = {}) # which have the flipped values for CITY1 and CITY2 row['CITY'] = sanitize(row['CITY2']) row['CENTRE'] = sanitize(row['CITY1']) - row['DISTRICT'] = sanitize(row['CITY1']) + district = sanitize(row['CITY1']) + row['DISTRICT'] = district + + if district_map.has_key?(district) + row['DISTRICT CLEANED'] = district_map[district] + else + matched = matcher.find(district) + + # Single match + if matched.kind_of?(Array) + row['DISTRICT CLEANED'] = matched[0] + district_map[district] = matched[0] + # Return the most matched when there are multiple matches + elsif matched.kind_of?(String) + row['DISTRICT CLEANED'] = matched + district_map[district] = matched + # Edge cases where it is impossible to match using fuzzy logic + else + fixed_district = get_unmatched_district(district,row,matcher) + row['DISTRICT CLEANED'] = fixed_district + district_map[district] = fixed_district + end + end # Delete rows we don't want in output # Merged into CONTACRT @@ -276,7 +338,7 @@ def parse_csv(files, banks, additional_attributes = {}) def export_csv(data) CSV.open('data/IFSC.csv', 'wb') do |csv| - keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','ISO3166','NEFT','MICR','UPI','SWIFT'] + keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','DISTRICT CLEANED','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','ISO3166','NEFT','MICR','UPI','SWIFT'] csv << keys data.each do |code, ifsc_data| sorted_data = [] From dba10243b82495fc0c8e45e732e09d5a39b55713 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Sat, 10 Dec 2022 20:49:08 +0530 Subject: [PATCH 02/11] Update districts.json --- scraper/scripts/districts.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/scripts/districts.json b/scraper/scripts/districts.json index 201c050b..5391fb72 100644 --- a/scraper/scripts/districts.json +++ b/scraper/scripts/districts.json @@ -519,7 +519,7 @@ "SOUTH TRIPURA", "DHALAI", "NORTH TRIPURA", - "WEST TRIPURA\nWEST TRIPURA", + "WEST TRIPURA", "AURAIYA", "AGRA", "MAU", From 98a969067390780e29d493c09ee8d5317bd69233 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Mon, 12 Dec 2022 11:59:46 +0530 Subject: [PATCH 03/11] Update tests --- tests/php/DatasetTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/php/DatasetTest.php b/tests/php/DatasetTest.php index 65c5f38d..ef6e1e78 100644 --- a/tests/php/DatasetTest.php +++ b/tests/php/DatasetTest.php @@ -18,6 +18,7 @@ class DatasetTest extends TestCase 'BRANCH', 'CENTRE', 'DISTRICT', + 'DISTRICT CLEANED', 'STATE', 'ADDRESS', 'CONTACT', From 480d6dd48eb69989dbe109ea09b08a275d65a9fb Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Mon, 12 Dec 2022 13:10:16 +0530 Subject: [PATCH 04/11] Add field to imps --- scraper/scripts/methods.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index 0d5354a9..318e52d7 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -34,6 +34,7 @@ def parse_imps(banks) 'BRANCH' => "#{banknames[code]} IMPS", 'CENTRE' => 'NA', 'DISTRICT' => 'NA', + 'DISTRICT CLEANED' => 'NA', 'STATE' => 'MAHARASHTRA', 'ADDRESS' => 'NA', 'CONTACT' => nil, From 8bcbd8c7a9849d335d1c8946584349d5702bce2f Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Mon, 12 Dec 2022 16:25:08 +0530 Subject: [PATCH 05/11] Update patch to add field --- src/patches/ifsc/iccl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/patches/ifsc/iccl.yml b/src/patches/ifsc/iccl.yml index 3f47db73..2a9c9359 100644 --- a/src/patches/ifsc/iccl.yml +++ b/src/patches/ifsc/iccl.yml @@ -16,6 +16,7 @@ ifsc: CITY: MUMBAI CONTACT: null DISTRICT: Mumbai + DISTRICT CLEANED: MUMBAI IMPS: false MICR: null NEFT: false From 29c270896e62e7b046ea845fc3db08bc7d7436d2 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Tue, 13 Dec 2022 18:53:06 +0530 Subject: [PATCH 06/11] Overwrite DISTRICT field --- scraper/scripts/methods.rb | 12 +++++------- src/patches/ifsc/iccl.yml | 3 +-- tests/php/DatasetTest.php | 1 - 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index 318e52d7..17c134f2 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -34,7 +34,6 @@ def parse_imps(banks) 'BRANCH' => "#{banknames[code]} IMPS", 'CENTRE' => 'NA', 'DISTRICT' => 'NA', - 'DISTRICT CLEANED' => 'NA', 'STATE' => 'MAHARASHTRA', 'ADDRESS' => 'NA', 'CONTACT' => nil, @@ -302,25 +301,24 @@ def parse_csv(files, banks, additional_attributes = {}) row['CITY'] = sanitize(row['CITY2']) row['CENTRE'] = sanitize(row['CITY1']) district = sanitize(row['CITY1']) - row['DISTRICT'] = district if district_map.has_key?(district) - row['DISTRICT CLEANED'] = district_map[district] + row['DISTRICT'] = district_map[district] else matched = matcher.find(district) # Single match if matched.kind_of?(Array) - row['DISTRICT CLEANED'] = matched[0] + row['DISTRICT'] = matched[0] district_map[district] = matched[0] # Return the most matched when there are multiple matches elsif matched.kind_of?(String) - row['DISTRICT CLEANED'] = matched + row['DISTRICT'] = matched district_map[district] = matched # Edge cases where it is impossible to match using fuzzy logic else fixed_district = get_unmatched_district(district,row,matcher) - row['DISTRICT CLEANED'] = fixed_district + row['DISTRICT'] = fixed_district district_map[district] = fixed_district end end @@ -339,7 +337,7 @@ def parse_csv(files, banks, additional_attributes = {}) def export_csv(data) CSV.open('data/IFSC.csv', 'wb') do |csv| - keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','DISTRICT CLEANED','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','ISO3166','NEFT','MICR','UPI','SWIFT'] + keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','ISO3166','NEFT','MICR','UPI','SWIFT'] csv << keys data.each do |code, ifsc_data| sorted_data = [] diff --git a/src/patches/ifsc/iccl.yml b/src/patches/ifsc/iccl.yml index 2a9c9359..bbfd6f4e 100644 --- a/src/patches/ifsc/iccl.yml +++ b/src/patches/ifsc/iccl.yml @@ -15,8 +15,7 @@ ifsc: CENTRE: Mumbai CITY: MUMBAI CONTACT: null - DISTRICT: Mumbai - DISTRICT CLEANED: MUMBAI + DISTRICT: Mumbai IMPS: false MICR: null NEFT: false diff --git a/tests/php/DatasetTest.php b/tests/php/DatasetTest.php index ef6e1e78..65c5f38d 100644 --- a/tests/php/DatasetTest.php +++ b/tests/php/DatasetTest.php @@ -18,7 +18,6 @@ class DatasetTest extends TestCase 'BRANCH', 'CENTRE', 'DISTRICT', - 'DISTRICT CLEANED', 'STATE', 'ADDRESS', 'CONTACT', From f9dcf44f2c1ec47c5b7ca375500ded8b5e7553d5 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Tue, 13 Dec 2022 19:38:07 +0530 Subject: [PATCH 07/11] Convert districts to title case --- scraper/scripts/districts.json | 1240 ++++++++++++++++---------------- 1 file changed, 620 insertions(+), 620 deletions(-) diff --git a/scraper/scripts/districts.json b/scraper/scripts/districts.json index 5391fb72..b0f4c382 100644 --- a/scraper/scripts/districts.json +++ b/scraper/scripts/districts.json @@ -1,625 +1,625 @@ { "districts": [ - "SOUTH ANDAMAN", - "PRAKASAM", - "KURNOOL", - "SRIKAKULAM", - "EAST GODAVARI", - "GUNTUR", - "ANANTAPUR", - "SRI POTTI SRIRAMULU NELLORE", + "South Andaman", + "Prakasam", + "Kurnool", + "Srikakulam", + "East Godavari", + "Guntur", + "Anantapur", + "Sri Potti Sriramulu Nellore", "Y.S.R.", - "WEST GODAVARI", - "VIZIANAGARAM", - "CHITTOOR", - "KRISHNA", - "VISAKHAPATNAM", - "WEST SIANG", - "DIBANG VALLEY", - "EAST SIANG", - "WEST KAMENG", - "CHANGLANG", - "UPPER SUBANSIRI", - "TIRAP", - "PAPUM PARE", - "ANJAW", - "KURUNG KUMEY", - "UPPER SIANG", - "LOHIT", - "LOWER SUBANSIRI", - "LOWER DIBANG VALLEY", - "EAST KAMENG", - "TAWANG", - "BONGAIGAON", - "SIVASAGAR", - "KARIMGANJ", - "GOLAGHAT", - "BARPETA", - "CHIRANG", - "LAKHIMPUR", - "DHUBRI", - "SONITPUR", - "KARBI ANGLONG", - "DIBRUGARH", - "TINSUKIA", - "DHEMAJI", - "NAGAON", - "GOALPARA", - "KOKRAJHAR", - "KAMRUP METROPOLITAN", - "DIMA HASAO", - "HAILAKANDI", - "JORHAT", - "DARRANG", - "CACHAR", - "MORIGAON", - "NALBARI", - "BAKSA", - "KAMRUP", - "UDALGURI", - "BANKA", - "ARARIA", - "PURBA CHAMPARAN", - "BHOJPUR", - "ARWAL", - "AURANGABAD", - "PASHCHIM CHAMPARAN", - "KISHANGANJ", - "SITAMARHI", - "BEGUSARAI", - "PATNA", - "PURNIA", - "LAKHISARAI", - "GOPALGANJ", - "SHEIKHPURA", - "DARBHANGA", - "KAIMUR (BHABUA)", - "BHAGALPUR", - "NALANDA", - "ROHTAS", - "SUPAUL", - "GAYA", - "BUXAR", - "SARAN", - "SAMASTIPUR", - "MADHUBANI", - "KHAGARIA", - "VAISHALI", - "NAWADA", - "MUNGER", - "JAMUI", - "JEHANABAD", - "MUZAFFARPUR", - "KATIHAR", - "MADHEPURA", - "SIWAN", - "SAHARSA", - "SHEOHAR", - "CHANDIGARH", - "DHAMTARI", - "RAIPUR", - "JANJGIR - CHAMPA", - "DURG", - "RAJNANDGAON", - "SURGUJA", - "UTTAR BASTAR KANKER", - "DAKSHIN BASTAR DANTEWADA", - "MAHASAMUND", - "JASHPUR", - "KORIYA", - "RAIGARH", - "BASTAR", - "BIJAPUR", - "BILASPUR", - "KABEERDHAM", - "KORBA", - "NARAYANPUR", - "D & N HAVELI", - "DAMAN", - "DIU", - "NORTH GOA", - "SOUTH GOA", - "AHMADABAD", - "BHARUCH", - "AMRELI", - "ANAND", - "KACHCHH", - "KHEDA", - "JUNAGADH", - "SURAT", - "SABAR KANTHA", - "BANAS KANTHA", - "JAMNAGAR", - "BHAVNAGAR", - "RAJKOT", - "NAVSARI", - "PATAN", - "PORBANDAR", - "VADODARA", - "SURENDRANAGAR", - "GANDHINAGAR", - "DOHAD", - "VALSAD", - "PANCH MAHALS", - "MAHESANA", - "NARMADA", - "THE DANGS", - "TAPI", - "AMBALA", - "KARNAL", - "MAHENDRAGARH", - "JHAJJAR", - "HISAR", - "REWARI", - "BHIWANI", - "FATEHABAD", - "KAITHAL", - "SIRSA", - "FARIDABAD", - "GURGAON", - "MEWAT", - "SONIPAT", - "PALWAL", - "JIND", - "ROHTAK", - "KURUKSHETRA", - "PANCHKULA", - "PANIPAT", - "YAMUNANAGAR", - "SOLAN", - "KANGRA", - "KULLU", - "HAMIRPUR", - "CHAMBA", - "SHIMLA", - "UNA", - "MANDI", - "SIRMAUR", - "ANANTNAG", - "JAMMU", - "PULWAMA", - "BADGAM", - "BANDIPORE", - "RAMBAN", - "BARAMULA", - "SAMBA", - "KATHUA", - "DODA", - "UDHAMPUR", - "KULGAM", - "GANDERBAL", - "KUPWARA", - "RAJOURI", - "KARGIL", - "REASI", - "KISHTWAR", - "LEH(LADAKH)", - "PUNCH", - "SHUPIYAN", - "SRINAGAR", - "PURBI SINGHBHUM", - "CHATRA", - "GIRIDIH", - "SAHIBGANJ", - "DUMKA", - "PALAMU", - "RANCHI", - "PASHCHIMI SINGHBHUM", - "BOKARO", - "DHANBAD", - "RAMGARH", - "DEOGHAR", - "KODARMA", - "GARHWA", - "GODDA", - "GUMLA", - "HAZARIBAGH", - "JAMTARA", - "KHUNTI", - "SARAIKELA-KHARSAWAN", - "LATEHAR", - "LOHARDAGA", - "PAKUR", - "SIMDEGA", - "GULBARGA", - "BELGAUM", - "DHARWAD", - "HASSAN", - "BAGALKOT", - "BANGALORE", - "UTTARA KANNADA", - "BIDAR", - "CHIKKABALLAPURA", - "RAICHUR", - "KOLAR", - "HAVERI", - "MYSORE", - "DAKSHINA KANNADA", - "BELLARY", - "SHIMOGA", - "YADGIR", - "RAMANAGARA", - "CHIKMAGALUR", - "CHITRADURGA", - "CHAMARAJANAGAR", - "DAVANAGERE", - "TUMKUR", - "BANGALORE RURAL", - "GADAG", - "KOPPAL", - "UDUPI", - "MANDYA", - "KODAGU", - "PATHANAMTHITTA", - "ALAPPUZHA", - "ERNAKULAM", - "KANNUR", - "THIRUVANANTHAPURAM", - "THRISSUR", - "KOTTAYAM", - "PALAKKAD", - "KOZHIKODE", - "WAYANAD", - "KASARAGOD", - "KOLLAM", - "IDUKKI", - "MALAPPURAM", - "SHAJAPUR", - "PANNA", - "BHIND", - "ALIRAJPUR", - "RATLAM", - "ANUPPUR", - "SATNA", - "CHHINDWARA", - "MORENA", - "BETUL", - "BARWANI", - "GWALIOR", - "GUNA", - "ASHOKNAGAR", - "SEHORE", - "NEEMUCH", - "HOSHANGABAD", - "CHHATARPUR", - "TIKAMGARH", - "SHIVPURI", - "RAISEN", - "UJJAIN", - "DHAR", - "SHEOPUR", - "DATIA", - "DEWAS", - "BALAGHAT", - "REWA", - "MANDLA", - "SAGAR", - "JABALPUR", - "SEONI", - "KATNI", - "KHARGONE (WEST NIMAR)", - "VIDISHA", - "SHAHDOL", - "BHOPAL", - "INDORE", - "MANDSAUR", - "RAJGARH", - "BURHANPUR", - "UMARIA", - "KHANDWA (EAST NIMAR)", - "NARSIMHAPUR", - "SIDHI", - "DAMOH", - "DINDORI", - "HARDA", - "JHABUA", - "SINGRAULI", - "WARDHA", - "AMRAVATI", - "GADCHIROLI", - "AHMADNAGAR", - "LATUR", - "SOLAPUR", - "AKOLA", - "PUNE", - "JALGAON", - "JALNA", - "THANE", - "BID", - "NANDED", - "GONDIYA", - "BULDANA", - "SANGLI", - "HINGOLI", - "CHANDRAPUR", - "YAVATMAL", - "NASHIK", - "BHANDARA", - "NAGPUR", - "OSMANABAD", - "RATNAGIRI", - "SATARA", - "SINDHUDURG", - "NANDURBAR", - "DHULE", - "KOLHAPUR", - "PARBHANI", - "MUMBAI", - "WASHIM", - "IMPHAL EAST", - "BISHNUPUR", - "THOUBAL", - "IMPHAL WEST", - "CHANDEL", - "SOUTH GARO HILLS", - "JAINTIA HILLS", - "WEST KHASI HILLS", - "RIBHOI", - "EAST GARO HILLS", - "EAST KHASI HILLS", - "WEST GARO HILLS", - "AIZAWL", - "KOLASIB", - "CHAMPHAI", - "LUNGLEI", - "LAWNGTLAI", - "MAMIT", - "SERCHHIP", - "SAIHA", - "MON", - "WOKHA", - "MOKOKCHUNG", - "PHEK", - "DIMAPUR", - "PEREN", - "KIPHIRE", - "KOHIMA", - "TUENSANG", - "LONGLENG", - "ZUNHEBOTO", - "SOUTH WEST", - "EAST", - "NEW DELHI", - "NORTH", - "SOUTH", - "KENDUJHAR", - "ANUGUL", - "GANJAM", - "CUTTACK", - "BARGARH", - "BALANGIR", - "BALESHWAR", - "KANDHAMAL", - "MALKANGIRI", - "KHORDHA", - "MAYURBHANJ", - "BHADRAK", - "BAUDH", - "JHARSUGUDA", - "KALAHANDI", - "DHENKANAL", - "SUBARNAPUR", - "SUNDARGARH", - "SAMBALPUR", - "JAJAPUR", - "NAYAGARH", - "DEBAGARH", - "RAYAGADA", - "JAGATSINGHAPUR", - "KORAPUT", - "GAJAPATI", - "KENDRAPARA", - "NUAPADA", - "PURI", - "NABARANGAPUR", - "KARAIKAL", - "PUDUCHERRY", - "MAHE", - "YANAM", - "FIROZPUR", - "JALANDHAR", - "SANGRUR", - "AMRISTAR", - "FATHEGARH SAHIB", - "RUPNAGAR", - "MOGA", - "SHAHID BHAGAT SINGH NAGAR", - "TARN TARAN", - "SAHIBZADA AJIT SINGH NAGAR", - "MANSA", - "MUKSTAR", - "BARNALA", - "GURDASPUR", - "BATHINDA", - "KAPURTHALA", - "PATIALA", - "HOSHIARPUR", - "LUDHIANA", - "FARIDKOT", - "SIROHI", - "AJMER", - "JHALAWAR", - "ALWAR", - "RAJSAMAND", - "BARAN", - "GANGANAGAR", - "BHILWARA", - "JHUNJHUNUN", - "JAIPUR", - "PALI", - "BARMER", - "DAUSA", - "BANSWARA", - "DHAULPUR", - "CHITTAURGARH", - "BHARATPUR", - "HANUMANGARH", - "UDAIPUR", - "JALOR", - "CHURU", - "BIKANER", - "JODHPUR", - "BUNDI", - "PRATAPGARH", - "NAGAUR", - "TONK", - "DUNGARPUR", - "SIKAR", - "SAWAI MADHOPUR", - "KARAULI", - "KOTA", - "JAISALMER", - "EAST DISTRICT", - "WEST DISTRICT", - "SOUTH DISTRICT", - "NORTH DISTRICT", - "MADURAI", - "RAMANATHAPURAM", - "TIRUNELVELI", - "KANCHEEPURAM", - "THE NILGIRIS", - "THANJAVUR", - "DINDIGUL", - "KANNIYAKUMARI", - "NAMAKKAL", - "VELLORE", - "PUDUKKOTTAI", - "COIMBATORE", - "THOOTHUKKUDI", - "ERODE", - "VILUPPURAM", - "THENI", - "CUDDALORE", - "TIRUVANNAMALAI", - "THIRUVALLUR", - "SALEM", - "KARUR", - "ARIYALUR", - "PERAMBALUR", - "VIRUDHUNAGAR", - "TIRUPPUR", - "DHARMAPURI", - "TIRUCHIRAPPALLI", - "KRISHNAGIRI", - "CHENNAI", - "SIVAGANGA", - "NAGAPATTINAM", - "THIRUVARUR", - "MAHBUBNAGAR", - "ADILABAD", - "MEDAK", - "NIZAMABAD", - "RANGAREDDY", - "NALGONDA", - "WARANGAL", - "HYDERABAD", - "KARIMNAGAR", - "KHAMMAM", - "WEST TRIPURA", - "SOUTH TRIPURA", - "DHALAI", - "NORTH TRIPURA", - "WEST TRIPURA", - "AURAIYA", - "AGRA", - "MAU", - "BIJNOR", - "BAGHPAT", - "MORADABAD", - "MIRZAPUR", - "MUZZAFFARNAGAR", - "KAUSHAMBI", - "KANPUR DEHAT", - "AMBEDKAR NAGAR", - "ETAH", - "ALIGARH", - "ALLAHABAD", - "SHAHJAHANPUR", - "BUDAUN", - "KANSHIRAM NAGAR", - "SAHARANPUR", - "SULTANPUR", - "LUCKNOW", - "AMROHA", - "MAHARAJGANJ", - "BULANDSHAHR", - "BAREILLY", - "BANDA", - "AZAMGARH", - "UNNAO", - "FAIZABAD", - "GHAZIABAD", - "RAE BARELI", - "JAUNPUR", - "GHAZIPUR", - "BAHRAICH", - "MEERUT", - "FATEHPUR", - "MATHURA", - "ETAWAH", - "BALLIA", - "BALRAMPUR", - "BASTI", - "BARA BANKI", - "GORAKHPUR", - "SIDDHARTHNAGAR", - "JHANSI", - "DEORIA", - "PILIBHIT", - "LAKHIMPUR KHERI", - "HARDOI", - "MAINPURI", - "BHADOHI", - "SHRAWASTI", - "GAUTAM BUDDHA NAGAR", - "RAMPUR", - "KANPUR NAGAR", - "SITAPUR", - "CHANDAULI", - "MAHOBA", - "KANNAUJ", - "CHITRAKOOT", - "SONBHADRA", - "GONDA", - "FIROZABAD", - "FARRUKHABAD", - "VARANASI", - "SANT KABIR NAGAR", - "MAHAMAYA NAGAR", - "KUSHINAGAR", - "JALAUN", - "LALITPUR", - "ALMORA", - "RUDRAPRAYAG", - "CHAMOLI", - "BAGESHWAR", - "UDHAM SINGH NAGAR", - "CHAMPAWAT", - "UTTARKASHI", - "PITHORAGARH", - "HARDWAR", - "NAINITAL", - "TEHRI GARHWAL", - "DEHRADUN", - "GARHWAL", - "JALPAIGURI", - "HUGLI", - "BARDDHAMAN", - "NORTH TWENTY FOUR PARGANAS", - "DAKSHIN DINAJPUR", - "BANKURA", - "SOUTH TWENTY FOUR PARGANAS", - "MURSHIDABAD", - "NADIA", - "BIRBHUM", - "PASCHIM MEDINIPUR", - "PURBA MEDINIPUR", - "UTTAR DINAJPUR", - "DARJILING", - "KOCH BIHAR", - "MALDAH", - "HAORA", - "PURULIYA", - "KOLKATA" + "West Godavari", + "Vizianagaram", + "Chittoor", + "Krishna", + "Visakhapatnam", + "West Siang", + "Dibang Valley", + "East Siang", + "West Kameng", + "Changlang", + "Upper Subansiri", + "Tirap", + "Papum Pare", + "Anjaw", + "Kurung Kumey", + "Upper Siang", + "Lohit", + "Lower Subansiri", + "Lower Dibang Valley", + "East Kameng", + "Tawang", + "Bongaigaon", + "Sivasagar", + "Karimganj", + "Golaghat", + "Barpeta", + "Chirang", + "Lakhimpur", + "Dhubri", + "Sonitpur", + "Karbi Anglong", + "Dibrugarh", + "Tinsukia", + "Dhemaji", + "Nagaon", + "Goalpara", + "Kokrajhar", + "Kamrup Metropolitan", + "Dima Hasao", + "Hailakandi", + "Jorhat", + "Darrang", + "Cachar", + "Morigaon", + "Nalbari", + "Baksa", + "Kamrup", + "Udalguri", + "Banka", + "Araria", + "Purba Champaran", + "Bhojpur", + "Arwal", + "Aurangabad", + "Pashchim Champaran", + "Kishanganj", + "Sitamarhi", + "Begusarai", + "Patna", + "Purnia", + "Lakhisarai", + "Gopalganj", + "Sheikhpura", + "Darbhanga", + "Kaimur (Bhabua)", + "Bhagalpur", + "Nalanda", + "Rohtas", + "Supaul", + "Gaya", + "Buxar", + "Saran", + "Samastipur", + "Madhubani", + "Khagaria", + "Vaishali", + "Nawada", + "Munger", + "Jamui", + "Jehanabad", + "Muzaffarpur", + "Katihar", + "Madhepura", + "Siwan", + "Saharsa", + "Sheohar", + "Chandigarh", + "Dhamtari", + "Raipur", + "Janjgir - Champa", + "Durg", + "Rajnandgaon", + "Surguja", + "Uttar Bastar Kanker", + "Dakshin Bastar Dantewada", + "Mahasamund", + "Jashpur", + "Koriya", + "Raigarh", + "Bastar", + "Bijapur", + "Bilaspur", + "Kabeerdham", + "Korba", + "Narayanpur", + "D & N Haveli", + "Daman", + "Diu", + "North Goa", + "South Goa", + "Ahmadabad", + "Bharuch", + "Amreli", + "Anand", + "Kachchh", + "Kheda", + "Junagadh", + "Surat", + "Sabar Kantha", + "Banas Kantha", + "Jamnagar", + "Bhavnagar", + "Rajkot", + "Navsari", + "Patan", + "Porbandar", + "Vadodara", + "Surendranagar", + "Gandhinagar", + "Dohad", + "Valsad", + "Panch Mahals", + "Mahesana", + "Narmada", + "The Dangs", + "Tapi", + "Ambala", + "Karnal", + "Mahendragarh", + "Jhajjar", + "Hisar", + "Rewari", + "Bhiwani", + "Fatehabad", + "Kaithal", + "Sirsa", + "Faridabad", + "Gurgaon", + "Mewat", + "Sonipat", + "Palwal", + "Jind", + "Rohtak", + "Kurukshetra", + "Panchkula", + "Panipat", + "Yamunanagar", + "Solan", + "Kangra", + "Kullu", + "Hamirpur", + "Chamba", + "Shimla", + "Una", + "Mandi", + "Sirmaur", + "Anantnag", + "Jammu", + "Pulwama", + "Badgam", + "Bandipore", + "Ramban", + "Baramula", + "Samba", + "Kathua", + "Doda", + "Udhampur", + "Kulgam", + "Ganderbal", + "Kupwara", + "Rajouri", + "Kargil", + "Reasi", + "Kishtwar", + "Leh(Ladakh)", + "Punch", + "Shupiyan", + "Srinagar", + "Purbi Singhbhum", + "Chatra", + "Giridih", + "Sahibganj", + "Dumka", + "Palamu", + "Ranchi", + "Pashchimi Singhbhum", + "Bokaro", + "Dhanbad", + "Ramgarh", + "Deoghar", + "Kodarma", + "Garhwa", + "Godda", + "Gumla", + "Hazaribagh", + "Jamtara", + "Khunti", + "Saraikela-Kharsawan", + "Latehar", + "Lohardaga", + "Pakur", + "Simdega", + "Gulbarga", + "Belgaum", + "Dharwad", + "Hassan", + "Bagalkot", + "Bangalore", + "Uttara Kannada", + "Bidar", + "Chikkaballapura", + "Raichur", + "Kolar", + "Haveri", + "Mysore", + "Dakshina Kannada", + "Bellary", + "Shimoga", + "Yadgir", + "Ramanagara", + "Chikmagalur", + "Chitradurga", + "Chamarajanagar", + "Davanagere", + "Tumkur", + "Bangalore Rural", + "Gadag", + "Koppal", + "Udupi", + "Mandya", + "Kodagu", + "Pathanamthitta", + "Alappuzha", + "Ernakulam", + "Kannur", + "Thiruvananthapuram", + "Thrissur", + "Kottayam", + "Palakkad", + "Kozhikode", + "Wayanad", + "Kasaragod", + "Kollam", + "Idukki", + "Malappuram", + "Shajapur", + "Panna", + "Bhind", + "Alirajpur", + "Ratlam", + "Anuppur", + "Satna", + "Chhindwara", + "Morena", + "Betul", + "Barwani", + "Gwalior", + "Guna", + "Ashoknagar", + "Sehore", + "Neemuch", + "Hoshangabad", + "Chhatarpur", + "Tikamgarh", + "Shivpuri", + "Raisen", + "Ujjain", + "Dhar", + "Sheopur", + "Datia", + "Dewas", + "Balaghat", + "Rewa", + "Mandla", + "Sagar", + "Jabalpur", + "Seoni", + "Katni", + "Khargone (West Nimar)", + "Vidisha", + "Shahdol", + "Bhopal", + "Indore", + "Mandsaur", + "Rajgarh", + "Burhanpur", + "Umaria", + "Khandwa (East Nimar)", + "Narsimhapur", + "Sidhi", + "Damoh", + "Dindori", + "Harda", + "Jhabua", + "Singrauli", + "Wardha", + "Amravati", + "Gadchiroli", + "Ahmadnagar", + "Latur", + "Solapur", + "Akola", + "Pune", + "Jalgaon", + "Jalna", + "Thane", + "Bid", + "Nanded", + "Gondiya", + "Buldana", + "Sangli", + "Hingoli", + "Chandrapur", + "Yavatmal", + "Nashik", + "Bhandara", + "Nagpur", + "Osmanabad", + "Ratnagiri", + "Satara", + "Sindhudurg", + "Nandurbar", + "Dhule", + "Kolhapur", + "Parbhani", + "Mumbai", + "Washim", + "Imphal East", + "Bishnupur", + "Thoubal", + "Imphal West", + "Chandel", + "South Garo Hills", + "Jaintia Hills", + "West Khasi Hills", + "Ribhoi", + "East Garo Hills", + "East Khasi Hills", + "West Garo Hills", + "Aizawl", + "Kolasib", + "Champhai", + "Lunglei", + "Lawngtlai", + "Mamit", + "Serchhip", + "Saiha", + "Mon", + "Wokha", + "Mokokchung", + "Phek", + "Dimapur", + "Peren", + "Kiphire", + "Kohima", + "Tuensang", + "Longleng", + "Zunheboto", + "South West", + "East", + "New Delhi", + "North", + "South", + "Kendujhar", + "Anugul", + "Ganjam", + "Cuttack", + "Bargarh", + "Balangir", + "Baleshwar", + "Kandhamal", + "Malkangiri", + "Khordha", + "Mayurbhanj", + "Bhadrak", + "Baudh", + "Jharsuguda", + "Kalahandi", + "Dhenkanal", + "Subarnapur", + "Sundargarh", + "Sambalpur", + "Jajapur", + "Nayagarh", + "Debagarh", + "Rayagada", + "Jagatsinghapur", + "Koraput", + "Gajapati", + "Kendrapara", + "Nuapada", + "Puri", + "Nabarangapur", + "Karaikal", + "Puducherry", + "Mahe", + "Yanam", + "Firozpur", + "Jalandhar", + "Sangrur", + "Amristar", + "Fathegarh Sahib", + "Rupnagar", + "Moga", + "Shahid Bhagat Singh Nagar", + "Tarn Taran", + "Sahibzada Ajit Singh Nagar", + "Mansa", + "Mukstar", + "Barnala", + "Gurdaspur", + "Bathinda", + "Kapurthala", + "Patiala", + "Hoshiarpur", + "Ludhiana", + "Faridkot", + "Sirohi", + "Ajmer", + "Jhalawar", + "Alwar", + "Rajsamand", + "Baran", + "Ganganagar", + "Bhilwara", + "Jhunjhunun", + "Jaipur", + "Pali", + "Barmer", + "Dausa", + "Banswara", + "Dhaulpur", + "Chittaurgarh", + "Bharatpur", + "Hanumangarh", + "Udaipur", + "Jalor", + "Churu", + "Bikaner", + "Jodhpur", + "Bundi", + "Pratapgarh", + "Nagaur", + "Tonk", + "Dungarpur", + "Sikar", + "Sawai Madhopur", + "Karauli", + "Kota", + "Jaisalmer", + "East District", + "West District", + "South District", + "North District", + "Madurai", + "Ramanathapuram", + "Tirunelveli", + "Kancheepuram", + "The Nilgiris", + "Thanjavur", + "Dindigul", + "Kanniyakumari", + "Namakkal", + "Vellore", + "Pudukkottai", + "Coimbatore", + "Thoothukkudi", + "Erode", + "Viluppuram", + "Theni", + "Cuddalore", + "Tiruvannamalai", + "Thiruvallur", + "Salem", + "Karur", + "Ariyalur", + "Perambalur", + "Virudhunagar", + "Tiruppur", + "Dharmapuri", + "Tiruchirappalli", + "Krishnagiri", + "Chennai", + "Sivaganga", + "Nagapattinam", + "Thiruvarur", + "Mahbubnagar", + "Adilabad", + "Medak", + "Nizamabad", + "Rangareddy", + "Nalgonda", + "Warangal", + "Hyderabad", + "Karimnagar", + "Khammam", + "West Tripura", + "South Tripura", + "Dhalai", + "North Tripura", + "West Tripura", + "Auraiya", + "Agra", + "Mau", + "Bijnor", + "Baghpat", + "Moradabad", + "Mirzapur", + "Muzzaffarnagar", + "Kaushambi", + "Kanpur Dehat", + "Ambedkar Nagar", + "Etah", + "Aligarh", + "Allahabad", + "Shahjahanpur", + "Budaun", + "Kanshiram Nagar", + "Saharanpur", + "Sultanpur", + "Lucknow", + "Amroha", + "Maharajganj", + "Bulandshahr", + "Bareilly", + "Banda", + "Azamgarh", + "Unnao", + "Faizabad", + "Ghaziabad", + "Rae Bareli", + "Jaunpur", + "Ghazipur", + "Bahraich", + "Meerut", + "Fatehpur", + "Mathura", + "Etawah", + "Ballia", + "Balrampur", + "Basti", + "Bara Banki", + "Gorakhpur", + "Siddharthnagar", + "Jhansi", + "Deoria", + "Pilibhit", + "Lakhimpur Kheri", + "Hardoi", + "Mainpuri", + "Bhadohi", + "Shrawasti", + "Gautam Buddha Nagar", + "Rampur", + "Kanpur Nagar", + "Sitapur", + "Chandauli", + "Mahoba", + "Kannauj", + "Chitrakoot", + "Sonbhadra", + "Gonda", + "Firozabad", + "Farrukhabad", + "Varanasi", + "Sant Kabir Nagar", + "Mahamaya Nagar", + "Kushinagar", + "Jalaun", + "Lalitpur", + "Almora", + "Rudraprayag", + "Chamoli", + "Bageshwar", + "Udham Singh Nagar", + "Champawat", + "Uttarkashi", + "Pithoragarh", + "Hardwar", + "Nainital", + "Tehri Garhwal", + "Dehradun", + "Garhwal", + "Jalpaiguri", + "Hugli", + "Barddhaman", + "North Twenty Four Parganas", + "Dakshin Dinajpur", + "Bankura", + "South Twenty Four Parganas", + "Murshidabad", + "Nadia", + "Birbhum", + "Paschim Medinipur", + "Purba Medinipur", + "Uttar Dinajpur", + "Darjiling", + "Koch Bihar", + "Maldah", + "Haora", + "Puruliya", + "Kolkata" ] } \ No newline at end of file From 587118dd89df6d80e8c0e11d3117609b9d981983 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Tue, 13 Dec 2022 20:31:00 +0530 Subject: [PATCH 08/11] Add patches to seperate files --- scraper/scripts/districts.rb | 29 ++++++++++++++++++++++++++ scraper/scripts/methods.rb | 35 +------------------------------- src/patches/ifsc/PUNB0113200.yml | 8 ++++++++ src/patches/ifsc/PUNB0445400.yml | 8 ++++++++ src/patches/ifsc/PUNB0667000.yml | 8 ++++++++ 5 files changed, 54 insertions(+), 34 deletions(-) create mode 100644 scraper/scripts/districts.rb create mode 100644 src/patches/ifsc/PUNB0113200.yml create mode 100644 src/patches/ifsc/PUNB0445400.yml create mode 100644 src/patches/ifsc/PUNB0667000.yml diff --git a/scraper/scripts/districts.rb b/scraper/scripts/districts.rb new file mode 100644 index 00000000..45ecb297 --- /dev/null +++ b/scraper/scripts/districts.rb @@ -0,0 +1,29 @@ +def get_unmatched_district(district, row, matcher) + + if district.nil? + return district + end + + # Here the CITY2 field contains the district which can be matched + if /\(?\s*U\s*\.?\s*P\s*\.?\s*\)?/.match?(district) || /\(?\s*U\s*\.?\s*T\s*\.?\s*\)?/.match?(district) || /\(?\s*M\s*\.?\s*P\s*\.?\s*\)?/.match?(district) + return matcher.find(sanitize(row['CITY2'])) + elsif district === "KGF" + return "KOLAR" + elsif district === "M.P.K.V." + return "AHMADNAGAR" + elsif district === "PCMC" + return "PUNE" + elsif district === "GMC" + return "SRINAGAR" + elsif district === "110027" + return "NEW DELHI" + elsif district === "612 103" + return "THANJAVUR" + elsif district === "273005" + return "GORAKHPUR" + elsif district === "2 M" + return "GANGANAGAR" + else + return district + end +end \ No newline at end of file diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index 17c134f2..eac0a69d 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -9,6 +9,7 @@ require './methods_nach' require './utils' require './iso3166' +require './districts' HEADINGS_INSERT = %w[ BANK @@ -183,40 +184,6 @@ def parse_contact(std_code, phone) end end -def get_unmatched_district(district, row, matcher) - - if district.nil? - return district - end - - # Here the CITY2 field contains the district which can be matched - if /\(?\s*U\s*\.?\s*P\s*\.?\s*\)?/.match?(district) || /\(?\s*U\s*\.?\s*T\s*\.?\s*\)?/.match?(district) || /\(?\s*M\s*\.?\s*P\s*\.?\s*\)?/.match?(district) - return matcher.find(sanitize(row['CITY2'])) - elsif district === "KGF" - return "KOLAR" - elsif district === "M.P.K.V." - return "AHMADNAGAR" - elsif district === "PCMC" - return "PUNE" - elsif district === "GMC" - return "SRINAGAR" - elsif district === "110027" - return "NEW DELHI" - elsif district === "612 103" - return "THANJAVUR" - elsif district === "273005" - return "GORAKHPUR" - elsif district === "2 M" - return "GANGANAGAR" - elsif row["IFSC"] === "PUNB0667000" - return "HAMIRPUR" - elsif row["IFSC"] === "PUNB0667000" - return "SHIMLA" - else - return district - end -end - def parse_csv(files, banks, additional_attributes = {}) data = {} diff --git a/src/patches/ifsc/PUNB0113200.yml b/src/patches/ifsc/PUNB0113200.yml new file mode 100644 index 00000000..35f9f89a --- /dev/null +++ b/src/patches/ifsc/PUNB0113200.yml @@ -0,0 +1,8 @@ +# Replaces (H.P.) in district field +--- +action: patch +patch: + DISTRICT: Shimla +ifsc: + - PUNB0113200 + \ No newline at end of file diff --git a/src/patches/ifsc/PUNB0445400.yml b/src/patches/ifsc/PUNB0445400.yml new file mode 100644 index 00000000..8348485c --- /dev/null +++ b/src/patches/ifsc/PUNB0445400.yml @@ -0,0 +1,8 @@ +# Replaces (HP) in district field +--- +action: patch +patch: + DISTRICT: Kullu +ifsc: + - PUNB0445400 + \ No newline at end of file diff --git a/src/patches/ifsc/PUNB0667000.yml b/src/patches/ifsc/PUNB0667000.yml new file mode 100644 index 00000000..ebf92aa2 --- /dev/null +++ b/src/patches/ifsc/PUNB0667000.yml @@ -0,0 +1,8 @@ +# Replaces H.P in district field +--- +action: patch +patch: + DISTRICT: Hamirpur +ifsc: + - PUNB0667000 + \ No newline at end of file From 67d5b6d7d43af6761c9720b397e80f07cf62c7a0 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Wed, 14 Dec 2022 12:42:08 +0530 Subject: [PATCH 09/11] Remove extra space --- src/patches/ifsc/iccl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/patches/ifsc/iccl.yml b/src/patches/ifsc/iccl.yml index bbfd6f4e..3f47db73 100644 --- a/src/patches/ifsc/iccl.yml +++ b/src/patches/ifsc/iccl.yml @@ -15,7 +15,7 @@ ifsc: CENTRE: Mumbai CITY: MUMBAI CONTACT: null - DISTRICT: Mumbai + DISTRICT: Mumbai IMPS: false MICR: null NEFT: false From f13dcd56528bcc030389b369df3dd5f54a75f5f7 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Wed, 14 Dec 2022 16:26:08 +0530 Subject: [PATCH 10/11] Add Dice coefficient threshold --- scraper/Gemfile | 2 ++ scraper/Gemfile.lock | 14 ++++++++++++++ scraper/scripts/methods.rb | 27 +++++++++++++++++++-------- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/scraper/Gemfile b/scraper/Gemfile index eb0aaca4..c8def2f2 100644 --- a/scraper/Gemfile +++ b/scraper/Gemfile @@ -4,3 +4,5 @@ gem 'nokogiri' gem 'httparty' gem "fuzzy_match", "~> 2.1" + +gem "amatch", "~> 0.4.1" diff --git a/scraper/Gemfile.lock b/scraper/Gemfile.lock index 861a2a00..4d042491 100644 --- a/scraper/Gemfile.lock +++ b/scraper/Gemfile.lock @@ -1,6 +1,9 @@ GEM remote: https://rubygems.org/ specs: + amatch (0.4.1) + mize + tins (~> 1.0) fuzzy_match (2.1.0) httparty (0.20.0) mime-types (~> 3.0) @@ -9,16 +12,27 @@ GEM mime-types-data (~> 3.2015) mime-types-data (3.2022.0105) mini_portile2 (2.8.0) + mize (0.4.1) + protocol (~> 2.0) multi_xml (0.6.0) nokogiri (1.13.9) mini_portile2 (~> 2.8.0) racc (~> 1.4) + protocol (2.0.0) + ruby_parser (~> 3.0) racc (1.6.0) + ruby_parser (3.19.2) + sexp_processor (~> 4.16) + sexp_processor (4.16.1) + sync (0.5.0) + tins (1.32.1) + sync PLATFORMS ruby DEPENDENCIES + amatch (~> 0.4.1) fuzzy_match (~> 2.1) httparty nokogiri diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index eac0a69d..0b2d6481 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -272,22 +272,33 @@ def parse_csv(files, banks, additional_attributes = {}) if district_map.has_key?(district) row['DISTRICT'] = district_map[district] else + # Fuzzy matching is done here matched = matcher.find(district) - # Single match + # Multiple matches if matched.kind_of?(Array) - row['DISTRICT'] = matched[0] - district_map[district] = matched[0] - # Return the most matched when there are multiple matches + matched_district = matched[0] + score = FuzzyMatch.score_class.new(district,matched_district).dices_coefficient_similar + + # Single match elsif matched.kind_of?(String) - row['DISTRICT'] = matched - district_map[district] = matched - # Edge cases where it is impossible to match using fuzzy logic + matched_district = matched + score = FuzzyMatch.score_class.new(district,matched_district).dices_coefficient_similar + # No match else + score = 0 + end + + if score >= 0.5 + row['DISTRICT'] = matched_district + district_map[district] = matched_district + else + # When the dice's coefficent is not enough + # Either manual patches are done if possible, or the existing value is added fixed_district = get_unmatched_district(district,row,matcher) row['DISTRICT'] = fixed_district district_map[district] = fixed_district - end + end end # Delete rows we don't want in output From b442050ffaa9b8d5aed199e16df91e0dd97e12e7 Mon Sep 17 00:00:00 2001 From: yashy3nugu Date: Wed, 14 Dec 2022 18:02:47 +0530 Subject: [PATCH 11/11] Add title case for unmatched districts --- scraper/scripts/districts.rb | 16 ++++++++-------- scraper/scripts/methods.rb | 6 +++--- scraper/scripts/utils.rb | 8 ++++++++ src/patches/ifsc/SBIN0005181.yml | 2 +- src/patches/ifsc/state-mh.yml | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/scraper/scripts/districts.rb b/scraper/scripts/districts.rb index 45ecb297..b252221e 100644 --- a/scraper/scripts/districts.rb +++ b/scraper/scripts/districts.rb @@ -8,21 +8,21 @@ def get_unmatched_district(district, row, matcher) if /\(?\s*U\s*\.?\s*P\s*\.?\s*\)?/.match?(district) || /\(?\s*U\s*\.?\s*T\s*\.?\s*\)?/.match?(district) || /\(?\s*M\s*\.?\s*P\s*\.?\s*\)?/.match?(district) return matcher.find(sanitize(row['CITY2'])) elsif district === "KGF" - return "KOLAR" + return "Kolar" elsif district === "M.P.K.V." - return "AHMADNAGAR" + return "Ahmadnagar" elsif district === "PCMC" - return "PUNE" + return "Pune" elsif district === "GMC" - return "SRINAGAR" + return "Srinagar" elsif district === "110027" - return "NEW DELHI" + return "New Delhi" elsif district === "612 103" - return "THANJAVUR" + return "Thanjavur" elsif district === "273005" - return "GORAKHPUR" + return "Gorakhpur" elsif district === "2 M" - return "GANGANAGAR" + return "Ganganagar" else return district end diff --git a/scraper/scripts/methods.rb b/scraper/scripts/methods.rb index 0b2d6481..96eef29e 100644 --- a/scraper/scripts/methods.rb +++ b/scraper/scripts/methods.rb @@ -277,12 +277,12 @@ def parse_csv(files, banks, additional_attributes = {}) # Multiple matches if matched.kind_of?(Array) - matched_district = matched[0] + matched_district = title_case(matched[0]) score = FuzzyMatch.score_class.new(district,matched_district).dices_coefficient_similar # Single match elsif matched.kind_of?(String) - matched_district = matched + matched_district = title_case(matched) score = FuzzyMatch.score_class.new(district,matched_district).dices_coefficient_similar # No match else @@ -295,7 +295,7 @@ def parse_csv(files, banks, additional_attributes = {}) else # When the dice's coefficent is not enough # Either manual patches are done if possible, or the existing value is added - fixed_district = get_unmatched_district(district,row,matcher) + fixed_district = title_case(get_unmatched_district(district,row,matcher)) row['DISTRICT'] = fixed_district district_map[district] = fixed_district end diff --git a/scraper/scripts/utils.rb b/scraper/scripts/utils.rb index e5638dd1..2ddd4d63 100644 --- a/scraper/scripts/utils.rb +++ b/scraper/scripts/utils.rb @@ -41,3 +41,11 @@ def fix_pipe_delimited_address!(row) def bank_name_from_code(code) Razorpay::IFSC::IFSC.bank_name_for(code) end + +def title_case(district) + return nil if district.nil? + + title_case_district = district.gsub(/\w+/) do |word| + word.capitalize + end +end \ No newline at end of file diff --git a/src/patches/ifsc/SBIN0005181.yml b/src/patches/ifsc/SBIN0005181.yml index 7c611965..955dd494 100644 --- a/src/patches/ifsc/SBIN0005181.yml +++ b/src/patches/ifsc/SBIN0005181.yml @@ -6,7 +6,7 @@ action: patch patch: CITY: AHMEDABAD - DISTRICT: AHMEDABAD + DISTRICT: Ahmedabad CENTRE: AHMEDABAD STATE: GUJARAT ADDRESS: A/24, SWASTIK SOCIETY, 1ST FLOOR NEAR STADIUM PANCH RASTA C.G. ROAD diff --git a/src/patches/ifsc/state-mh.yml b/src/patches/ifsc/state-mh.yml index 24beba1e..ebf0f0de 100644 --- a/src/patches/ifsc/state-mh.yml +++ b/src/patches/ifsc/state-mh.yml @@ -4,7 +4,7 @@ action: patch patch: CITY: MUMBAI - DISTRICT: MUMBAI + DISTRICT: Mumbai CENTRE: MUMBAI STATE: MAHARASHTRA ISO3166: IN-MH