diff --git a/scraper/Gemfile b/scraper/Gemfile index a4816b39..c8def2f2 100644 --- a/scraper/Gemfile +++ b/scraper/Gemfile @@ -2,3 +2,7 @@ source "https://rubygems.org" gem 'nokogiri' gem 'httparty' + +gem "fuzzy_match", "~> 2.1" + +gem "amatch", "~> 0.4.1" diff --git a/scraper/Gemfile.lock b/scraper/Gemfile.lock index fb656089..4d042491 100644 --- a/scraper/Gemfile.lock +++ b/scraper/Gemfile.lock @@ -1,6 +1,10 @@ GEM remote: https://rubygems.org/ specs: + amatch (0.4.1) + mize + tins (~> 1.0) + fuzzy_match (2.1.0) httparty (0.20.0) mime-types (~> 3.0) multi_xml (>= 0.5.2) @@ -8,16 +12,28 @@ GEM mime-types-data (~> 3.2015) mime-types-data (3.2022.0105) mini_portile2 (2.8.0) + mize (0.4.1) + protocol (~> 2.0) multi_xml (0.6.0) nokogiri (1.13.9) mini_portile2 (~> 2.8.0) racc (~> 1.4) + protocol (2.0.0) + ruby_parser (~> 3.0) racc (1.6.0) + ruby_parser (3.19.2) + sexp_processor (~> 4.16) + sexp_processor (4.16.1) + sync (0.5.0) + tins (1.32.1) + sync PLATFORMS ruby DEPENDENCIES + amatch (~> 0.4.1) + fuzzy_match (~> 2.1) httparty nokogiri diff --git a/scraper/scripts/districts.json b/scraper/scripts/districts.json new file mode 100644 index 00000000..b0f4c382 --- /dev/null +++ b/scraper/scripts/districts.json @@ -0,0 +1,625 @@ +{ + "districts": [ + "South Andaman", + "Prakasam", + "Kurnool", + "Srikakulam", + "East Godavari", + "Guntur", + "Anantapur", + "Sri Potti Sriramulu Nellore", + "Y.S.R.", + "West Godavari", + "Vizianagaram", + "Chittoor", + "Krishna", + "Visakhapatnam", + "West Siang", + "Dibang Valley", + "East Siang", + "West Kameng", + "Changlang", + "Upper Subansiri", + "Tirap", + "Papum Pare", + "Anjaw", + "Kurung Kumey", + "Upper Siang", + "Lohit", + "Lower Subansiri", + "Lower Dibang Valley", + "East Kameng", + "Tawang", + "Bongaigaon", + "Sivasagar", + "Karimganj", + "Golaghat", + "Barpeta", + "Chirang", + "Lakhimpur", + "Dhubri", + "Sonitpur", + "Karbi Anglong", + "Dibrugarh", + "Tinsukia", + "Dhemaji", + "Nagaon", + "Goalpara", + "Kokrajhar", + 
"Kamrup Metropolitan", + "Dima Hasao", + "Hailakandi", + "Jorhat", + "Darrang", + "Cachar", + "Morigaon", + "Nalbari", + "Baksa", + "Kamrup", + "Udalguri", + "Banka", + "Araria", + "Purba Champaran", + "Bhojpur", + "Arwal", + "Aurangabad", + "Pashchim Champaran", + "Kishanganj", + "Sitamarhi", + "Begusarai", + "Patna", + "Purnia", + "Lakhisarai", + "Gopalganj", + "Sheikhpura", + "Darbhanga", + "Kaimur (Bhabua)", + "Bhagalpur", + "Nalanda", + "Rohtas", + "Supaul", + "Gaya", + "Buxar", + "Saran", + "Samastipur", + "Madhubani", + "Khagaria", + "Vaishali", + "Nawada", + "Munger", + "Jamui", + "Jehanabad", + "Muzaffarpur", + "Katihar", + "Madhepura", + "Siwan", + "Saharsa", + "Sheohar", + "Chandigarh", + "Dhamtari", + "Raipur", + "Janjgir - Champa", + "Durg", + "Rajnandgaon", + "Surguja", + "Uttar Bastar Kanker", + "Dakshin Bastar Dantewada", + "Mahasamund", + "Jashpur", + "Koriya", + "Raigarh", + "Bastar", + "Bijapur", + "Bilaspur", + "Kabeerdham", + "Korba", + "Narayanpur", + "D & N Haveli", + "Daman", + "Diu", + "North Goa", + "South Goa", + "Ahmadabad", + "Bharuch", + "Amreli", + "Anand", + "Kachchh", + "Kheda", + "Junagadh", + "Surat", + "Sabar Kantha", + "Banas Kantha", + "Jamnagar", + "Bhavnagar", + "Rajkot", + "Navsari", + "Patan", + "Porbandar", + "Vadodara", + "Surendranagar", + "Gandhinagar", + "Dohad", + "Valsad", + "Panch Mahals", + "Mahesana", + "Narmada", + "The Dangs", + "Tapi", + "Ambala", + "Karnal", + "Mahendragarh", + "Jhajjar", + "Hisar", + "Rewari", + "Bhiwani", + "Fatehabad", + "Kaithal", + "Sirsa", + "Faridabad", + "Gurgaon", + "Mewat", + "Sonipat", + "Palwal", + "Jind", + "Rohtak", + "Kurukshetra", + "Panchkula", + "Panipat", + "Yamunanagar", + "Solan", + "Kangra", + "Kullu", + "Hamirpur", + "Chamba", + "Shimla", + "Una", + "Mandi", + "Sirmaur", + "Anantnag", + "Jammu", + "Pulwama", + "Badgam", + "Bandipore", + "Ramban", + "Baramula", + "Samba", + "Kathua", + "Doda", + "Udhampur", + "Kulgam", + "Ganderbal", + "Kupwara", + "Rajouri", + "Kargil", 
+ "Reasi", + "Kishtwar", + "Leh(Ladakh)", + "Punch", + "Shupiyan", + "Srinagar", + "Purbi Singhbhum", + "Chatra", + "Giridih", + "Sahibganj", + "Dumka", + "Palamu", + "Ranchi", + "Pashchimi Singhbhum", + "Bokaro", + "Dhanbad", + "Ramgarh", + "Deoghar", + "Kodarma", + "Garhwa", + "Godda", + "Gumla", + "Hazaribagh", + "Jamtara", + "Khunti", + "Saraikela-Kharsawan", + "Latehar", + "Lohardaga", + "Pakur", + "Simdega", + "Gulbarga", + "Belgaum", + "Dharwad", + "Hassan", + "Bagalkot", + "Bangalore", + "Uttara Kannada", + "Bidar", + "Chikkaballapura", + "Raichur", + "Kolar", + "Haveri", + "Mysore", + "Dakshina Kannada", + "Bellary", + "Shimoga", + "Yadgir", + "Ramanagara", + "Chikmagalur", + "Chitradurga", + "Chamarajanagar", + "Davanagere", + "Tumkur", + "Bangalore Rural", + "Gadag", + "Koppal", + "Udupi", + "Mandya", + "Kodagu", + "Pathanamthitta", + "Alappuzha", + "Ernakulam", + "Kannur", + "Thiruvananthapuram", + "Thrissur", + "Kottayam", + "Palakkad", + "Kozhikode", + "Wayanad", + "Kasaragod", + "Kollam", + "Idukki", + "Malappuram", + "Shajapur", + "Panna", + "Bhind", + "Alirajpur", + "Ratlam", + "Anuppur", + "Satna", + "Chhindwara", + "Morena", + "Betul", + "Barwani", + "Gwalior", + "Guna", + "Ashoknagar", + "Sehore", + "Neemuch", + "Hoshangabad", + "Chhatarpur", + "Tikamgarh", + "Shivpuri", + "Raisen", + "Ujjain", + "Dhar", + "Sheopur", + "Datia", + "Dewas", + "Balaghat", + "Rewa", + "Mandla", + "Sagar", + "Jabalpur", + "Seoni", + "Katni", + "Khargone (West Nimar)", + "Vidisha", + "Shahdol", + "Bhopal", + "Indore", + "Mandsaur", + "Rajgarh", + "Burhanpur", + "Umaria", + "Khandwa (East Nimar)", + "Narsimhapur", + "Sidhi", + "Damoh", + "Dindori", + "Harda", + "Jhabua", + "Singrauli", + "Wardha", + "Amravati", + "Gadchiroli", + "Ahmadnagar", + "Latur", + "Solapur", + "Akola", + "Pune", + "Jalgaon", + "Jalna", + "Thane", + "Bid", + "Nanded", + "Gondiya", + "Buldana", + "Sangli", + "Hingoli", + "Chandrapur", + "Yavatmal", + "Nashik", + "Bhandara", + "Nagpur", + 
"Osmanabad", + "Ratnagiri", + "Satara", + "Sindhudurg", + "Nandurbar", + "Dhule", + "Kolhapur", + "Parbhani", + "Mumbai", + "Washim", + "Imphal East", + "Bishnupur", + "Thoubal", + "Imphal West", + "Chandel", + "South Garo Hills", + "Jaintia Hills", + "West Khasi Hills", + "Ribhoi", + "East Garo Hills", + "East Khasi Hills", + "West Garo Hills", + "Aizawl", + "Kolasib", + "Champhai", + "Lunglei", + "Lawngtlai", + "Mamit", + "Serchhip", + "Saiha", + "Mon", + "Wokha", + "Mokokchung", + "Phek", + "Dimapur", + "Peren", + "Kiphire", + "Kohima", + "Tuensang", + "Longleng", + "Zunheboto", + "South West", + "East", + "New Delhi", + "North", + "South", + "Kendujhar", + "Anugul", + "Ganjam", + "Cuttack", + "Bargarh", + "Balangir", + "Baleshwar", + "Kandhamal", + "Malkangiri", + "Khordha", + "Mayurbhanj", + "Bhadrak", + "Baudh", + "Jharsuguda", + "Kalahandi", + "Dhenkanal", + "Subarnapur", + "Sundargarh", + "Sambalpur", + "Jajapur", + "Nayagarh", + "Debagarh", + "Rayagada", + "Jagatsinghapur", + "Koraput", + "Gajapati", + "Kendrapara", + "Nuapada", + "Puri", + "Nabarangapur", + "Karaikal", + "Puducherry", + "Mahe", + "Yanam", + "Firozpur", + "Jalandhar", + "Sangrur", + "Amristar", + "Fathegarh Sahib", + "Rupnagar", + "Moga", + "Shahid Bhagat Singh Nagar", + "Tarn Taran", + "Sahibzada Ajit Singh Nagar", + "Mansa", + "Mukstar", + "Barnala", + "Gurdaspur", + "Bathinda", + "Kapurthala", + "Patiala", + "Hoshiarpur", + "Ludhiana", + "Faridkot", + "Sirohi", + "Ajmer", + "Jhalawar", + "Alwar", + "Rajsamand", + "Baran", + "Ganganagar", + "Bhilwara", + "Jhunjhunun", + "Jaipur", + "Pali", + "Barmer", + "Dausa", + "Banswara", + "Dhaulpur", + "Chittaurgarh", + "Bharatpur", + "Hanumangarh", + "Udaipur", + "Jalor", + "Churu", + "Bikaner", + "Jodhpur", + "Bundi", + "Pratapgarh", + "Nagaur", + "Tonk", + "Dungarpur", + "Sikar", + "Sawai Madhopur", + "Karauli", + "Kota", + "Jaisalmer", + "East District", + "West District", + "South District", + "North District", + "Madurai", + "Ramanathapuram", 
+ "Tirunelveli", + "Kancheepuram", + "The Nilgiris", + "Thanjavur", + "Dindigul", + "Kanniyakumari", + "Namakkal", + "Vellore", + "Pudukkottai", + "Coimbatore", + "Thoothukkudi", + "Erode", + "Viluppuram", + "Theni", + "Cuddalore", + "Tiruvannamalai", + "Thiruvallur", + "Salem", + "Karur", + "Ariyalur", + "Perambalur", + "Virudhunagar", + "Tiruppur", + "Dharmapuri", + "Tiruchirappalli", + "Krishnagiri", + "Chennai", + "Sivaganga", + "Nagapattinam", + "Thiruvarur", + "Mahbubnagar", + "Adilabad", + "Medak", + "Nizamabad", + "Rangareddy", + "Nalgonda", + "Warangal", + "Hyderabad", + "Karimnagar", + "Khammam", + "West Tripura", + "South Tripura", + "Dhalai", + "North Tripura", + "West Tripura", + "Auraiya", + "Agra", + "Mau", + "Bijnor", + "Baghpat", + "Moradabad", + "Mirzapur", + "Muzzaffarnagar", + "Kaushambi", + "Kanpur Dehat", + "Ambedkar Nagar", + "Etah", + "Aligarh", + "Allahabad", + "Shahjahanpur", + "Budaun", + "Kanshiram Nagar", + "Saharanpur", + "Sultanpur", + "Lucknow", + "Amroha", + "Maharajganj", + "Bulandshahr", + "Bareilly", + "Banda", + "Azamgarh", + "Unnao", + "Faizabad", + "Ghaziabad", + "Rae Bareli", + "Jaunpur", + "Ghazipur", + "Bahraich", + "Meerut", + "Fatehpur", + "Mathura", + "Etawah", + "Ballia", + "Balrampur", + "Basti", + "Bara Banki", + "Gorakhpur", + "Siddharthnagar", + "Jhansi", + "Deoria", + "Pilibhit", + "Lakhimpur Kheri", + "Hardoi", + "Mainpuri", + "Bhadohi", + "Shrawasti", + "Gautam Buddha Nagar", + "Rampur", + "Kanpur Nagar", + "Sitapur", + "Chandauli", + "Mahoba", + "Kannauj", + "Chitrakoot", + "Sonbhadra", + "Gonda", + "Firozabad", + "Farrukhabad", + "Varanasi", + "Sant Kabir Nagar", + "Mahamaya Nagar", + "Kushinagar", + "Jalaun", + "Lalitpur", + "Almora", + "Rudraprayag", + "Chamoli", + "Bageshwar", + "Udham Singh Nagar", + "Champawat", + "Uttarkashi", + "Pithoragarh", + "Hardwar", + "Nainital", + "Tehri Garhwal", + "Dehradun", + "Garhwal", + "Jalpaiguri", + "Hugli", + "Barddhaman", + "North Twenty Four Parganas", + "Dakshin 
# Raw DISTRICT values that cannot be fuzzy-matched (acronyms, pin codes, ...)
# mapped to the district each one is manually resolved to.
UNMATCHED_DISTRICT_FIXES = {
  'KGF' => 'Kolar',
  'M.P.K.V.' => 'Ahmadnagar',
  'PCMC' => 'Pune',
  'GMC' => 'Srinagar',
  '110027' => 'New Delhi',
  '612 103' => 'Thanjavur',
  '273005' => 'Gorakhpur',
  '2 M' => 'Ganganagar'
}.freeze

# Matches the abbreviations U.P. / U.T. / M.P. (dots and inner whitespace
# optional) only when they stand alone as a token. The word boundaries keep
# names that merely contain those letters ("RAMPURHAT", "CUTTACK",
# "KUPWARA") from being mistaken for an abbreviation.
STATE_ABBREVIATION_IN_DISTRICT = /\b(?:U\s*\.?\s*[PT]|M\s*\.?\s*P)\b/

# Resolves a DISTRICT value for which fuzzy matching did not score high enough.
#
# Resolution order:
#   1. nil is returned untouched.
#   2. Exact manual overrides from UNMATCHED_DISTRICT_FIXES. These are checked
#      first so that "M.P.K.V." is not swallowed by the "M.P." abbreviation
#      rule below.
#   3. If the value holds a U.P. / U.T. / M.P. abbreviation, the district is
#      looked up from the row's CITY2 field through the matcher instead.
#   4. Otherwise the value is returned unchanged.
#
# @param district [String, nil] raw district value taken from the row
# @param row [Hash] the CSV row; only row['CITY2'] is read
# @param matcher [#find] object whose +find+ returns the best district match
# @return [String, nil] the resolved district, or whatever +matcher.find+
#   returned (possibly nil) for the abbreviation case
def get_unmatched_district(district, row, matcher)
  return district if district.nil?

  manual_fix = UNMATCHED_DISTRICT_FIXES[district]
  return manual_fix if manual_fix

  # Here the CITY2 field contains the district which can be matched
  return matcher.find(sanitize(row['CITY2'])) if STATE_ABBREVIATION_IN_DISTRICT.match?(district)

  district
end
# Title-cases a district name: every run of word characters (\w+) is
# capitalized (first character upper-cased, the rest lower-cased), while
# punctuation and whitespace between the runs are left untouched.
#
# @param district [String, nil] the name to convert
# @return [String, nil] a new title-cased string, or nil when given nil
def title_case(district)
  district&.gsub(/\w+/, &:capitalize)
end
in district field +--- +action: patch +patch: + DISTRICT: Shimla +ifsc: + - PUNB0113200 + \ No newline at end of file diff --git a/src/patches/ifsc/PUNB0445400.yml b/src/patches/ifsc/PUNB0445400.yml new file mode 100644 index 00000000..8348485c --- /dev/null +++ b/src/patches/ifsc/PUNB0445400.yml @@ -0,0 +1,8 @@ +# Replaces (HP) in district field +--- +action: patch +patch: + DISTRICT: Kullu +ifsc: + - PUNB0445400 + \ No newline at end of file diff --git a/src/patches/ifsc/PUNB0667000.yml b/src/patches/ifsc/PUNB0667000.yml new file mode 100644 index 00000000..ebf92aa2 --- /dev/null +++ b/src/patches/ifsc/PUNB0667000.yml @@ -0,0 +1,8 @@ +# Replaces H.P in district field +--- +action: patch +patch: + DISTRICT: Hamirpur +ifsc: + - PUNB0667000 + \ No newline at end of file diff --git a/src/patches/ifsc/SBIN0005181.yml b/src/patches/ifsc/SBIN0005181.yml index 7c611965..955dd494 100644 --- a/src/patches/ifsc/SBIN0005181.yml +++ b/src/patches/ifsc/SBIN0005181.yml @@ -6,7 +6,7 @@ action: patch patch: CITY: AHMEDABAD - DISTRICT: AHMEDABAD + DISTRICT: Ahmedabad CENTRE: AHMEDABAD STATE: GUJARAT ADDRESS: A/24, SWASTIK SOCIETY, 1ST FLOOR NEAR STADIUM PANCH RASTA C.G. ROAD diff --git a/src/patches/ifsc/state-mh.yml b/src/patches/ifsc/state-mh.yml index 24beba1e..ebf0f0de 100644 --- a/src/patches/ifsc/state-mh.yml +++ b/src/patches/ifsc/state-mh.yml @@ -4,7 +4,7 @@ action: patch patch: CITY: MUMBAI - DISTRICT: MUMBAI + DISTRICT: Mumbai CENTRE: MUMBAI STATE: MAHARASHTRA ISO3166: IN-MH