First commit

Chetan11-dev · Nov 19, 2021 · 56c039f · 56c039f
commit 56c039f
Show file tree

Hide file tree

Showing 10 changed files with 403 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.vscode/*
+__pycache__/*
+imagenes-icono.ico
diff --git a/README.md b/README.md
@@ -0,0 +1,63 @@
+# Google Maps / Google My Business Scraper 🌎📊
+
+This is a scraping script written in Python with the Selenium browser-automation library. **It reads a list of keywords, searches each one on Google Maps, and collects the data and cover image of the resulting place**.
+
+The script searches for the keywords one by one, storing the data of each place in a list, and finally exports everything to an Excel file located in the folder specified by the user when running the script.
+
+In the presentation video I show the script running without hiding the Chrome window, so the process that the script follows can be seen quite clearly.
+
+However, although I have added [that version](build/maps_scraper_juaristech_windowed_demo.exe) to the build folder, the final version (the one in the source code) runs without showing the Chrome window, because it uses 5 simultaneous threads to increase the speed and obtain the results faster.
+
+For now the script works only in Spanish and English; however, I may add more languages in the future.
+
+[![Google Maps Scraper](https://juaristech.com/wp-content/uploads/2021/11/google-maps-scraper-result.jpg)](https://juaristech.com/google-maps-scraper "JuarisTech")
+
+## How to Run It
+
+To execute this script you need to run it in the command prompt.
+
+```bash
+maps_scraper_juaristech.exe
+```
+
+Then, some questions will appear, which are necessary to run the script:
+
+1. You will need to type "ES" for Spanish or "EN" for English.
+
+    ```bash
+    [1] Introduce the language, (ES o EN): 
+    ```
+2. You will need to specify the folder to save the output Excel and images. For example: *D:\Projects\Spain\Madrid\output\\*
+
+    ```bash
+    [2] Introduce the path to save the images:
+    ```
+
+3. To finish, you need to specify where the *.txt* file with the keywords to search is located. For example: *D:\Projects\Spain\Madrid\places.txt*
+
+    ```bash
+    [3] Introduce the path of the keywords txt file:
+    ```
+
+Then the script starts working, and when it finishes, the Excel file will appear in the output folder.
+
+---
+
+If you have any questions about how to use the program, you can read the article on our website or watch the demo video.
+
+- Explanatory article: https://juaristech.com/google-maps-data-scraper
+- Demo video: https://www.youtube.com/channel/UCAUKSLj_OR1PfguW2ODUD3Q
+
+## Requirements
+
+The requirements are listed in the requirements.txt file. If you want to run the *.py* script with Python, you can install the dependencies with the following command:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Contact
+
+- Website: [JuarisTech](https://juaristech.com/)
+- Email: [email protected]
+
diff --git a/build/maps_scraper_juaristech.exe b/build/maps_scraper_juaristech.exe
diff --git a/build/maps_scraper_juaristech_windowed_demo.exe b/build/maps_scraper_juaristech_windowed_demo.exe
diff --git a/exportarDatos.py b/exportarDatos.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+import xlwt
+
+class ExportarDatosMaps:
+    """Write a list of scraped places to an Excel workbook using xlwt."""
+
+    # (header text, attribute read from each place) pairs, in sheet column
+    # order. Keeping both in one table prevents the header row and the data
+    # rows from drifting apart. 'DIRECTION' is the header of the address
+    # column; it is kept as-is so existing output files stay comparable.
+    COLUMNAS = (
+        ('KEYWORD', 'keyword'),
+        ('NAME', 'nombre'),
+        ('CATEGORY', 'categoria'),
+        ('DIRECTION', 'direccion'),
+        ('PHONE', 'telefono'),
+        ('WEB', 'web'),
+        ('PLUS CODE', 'pluscode'),
+        ('OPEN HOURS', 'horario'),
+        ('STARS', 'estrellas'),
+        ('REVIEWS', 'resenas'),
+    )
+
+    def __init__(self, nombreFichero, ruta, listaLugares):
+        """Remember what to export and where.
+
+        nombreFichero: file name of the workbook.
+        ruta: prefix concatenated in front of nombreFichero (no separator
+            is inserted, so it must already end with one when not empty).
+        listaLugares: iterable of objects exposing the attributes named in
+            COLUMNAS.
+        """
+        self.nombreFichero = nombreFichero
+        self.ruta = ruta
+        self.listaLugares = listaLugares
+
+    def exportarExcel(self):
+        """Create the workbook: one header row, then one row per place."""
+        writeBook = xlwt.Workbook(encoding='utf-8')
+        sheet = writeBook.add_sheet("document", cell_overwrite_ok=True)
+
+        for columna, (titulo, _) in enumerate(self.COLUMNAS):
+            sheet.write(0, columna, titulo)
+
+        # Row 0 holds the headers, so data rows start at 1.
+        for fila, lugar in enumerate(self.listaLugares, start=1):
+            for columna, (_, atributo) in enumerate(self.COLUMNAS):
+                sheet.write(fila, columna, getattr(lugar, atributo))
+
+        writeBook.save(self.ruta + self.nombreFichero)
diff --git a/lugar_maps.py b/lugar_maps.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+class LugarMaps:
+    """Plain record holding the fields collected for one place."""
+
+    def __init__(self):
+        """Create the record with every field set to an empty string."""
+        campos = (
+            'keyword', 'nombre', 'categoria', 'direccion', 'telefono',
+            'web', 'pluscode', 'estrellas', 'resenas', 'horario',
+        )
+        for campo in campos:
+            setattr(self, campo, '')
diff --git a/main_datos_maps.py b/main_datos_maps.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+
+
+from exportarDatos import ExportarDatosMaps
+from maps_data_scraper import GoogleMapsDataScraper
+from threading import Thread
+import sys
+import os
+
+def split_list(a, n):
+    """Cut *a* into *n* consecutive slices whose lengths differ by at most one.
+
+    The first ``len(a) % n`` slices receive the extra element. Slices are
+    empty when there are more slices than elements.
+    """
+    base, sobrantes = divmod(len(a), n)
+    trozos = []
+    inicio = 0
+    for indice in range(n):
+        longitud = base + (1 if indice < sobrantes else 0)
+        trozos.append(a[inicio:inicio + longitud])
+        inicio += longitud
+    return trozos
+
+def scrapearMaps(idioma, lista, outputFolder, resultados, hilo):
+    """Worker body: scrape every keyword in *lista* with its own browser.
+
+    idioma: language code passed to GoogleMapsDataScraper.
+    lista: keywords handled by this worker.
+    outputFolder: prefix passed to the scraper for the downloaded images.
+    resultados: shared list; slot *hilo* receives this worker's places.
+    hilo: index of this worker, used for the result slot and the log lines.
+    """
+    listaLugares = []
+    # Publish the list before doing any work: it is filled in place, so the
+    # caller never finds None in this slot and keeps the partial results
+    # even if this worker dies half way through.
+    resultados[hilo] = listaLugares
+
+    scraper = GoogleMapsDataScraper(idioma, outputFolder)
+    try:
+        scraper.initDriver()
+        total = len(lista)
+        for cont, kw in enumerate(lista, start=1):
+            lugar = scraper.scrapearDatos(kw)
+            estado = 'OK' if lugar is not None else 'ERROR'
+            print('Hilo nº {} {}/{} - {} - {}'.format(hilo, cont, total, estado, kw))
+            if lugar is not None:
+                listaLugares.append(lugar)
+    finally:
+        # Always close the browser, otherwise every run leaves a Chrome
+        # process behind. The driver is None when it could not be created.
+        if scraper.driver is not None:
+            scraper.endDriver()
+def mainGoogleMaps(idioma, ficheroKw, outputFolder, hilos=5):
+    """Scrape every keyword listed in *ficheroKw* and export the results.
+
+    idioma: language code handed to each worker ('ES' or 'EN').
+    ficheroKw: path of a UTF-8 text file with one keyword per line.
+    outputFolder: prefix (a folder ending in a path separator) used for the
+        downloaded images and for the '00_output.xls' workbook.
+    hilos: maximum number of worker threads; each one runs scrapearMaps.
+    """
+    # 'utf-8-sig' reads plain UTF-8 too and drops a leading BOM, which would
+    # otherwise stay glued to the first keyword.
+    with open(ficheroKw, 'r', encoding='utf-8-sig') as archivo:
+        # Blank lines are not searchable keywords, so they are skipped.
+        listaF = [kw.strip() for kw in archivo.read().splitlines() if kw.strip()]
+
+    # Never start more workers than there are keywords to process.
+    numHilos = max(1, min(hilos, len(listaF)))
+    listaResultados = [None] * numHilos
+    listaHilos = []
+
+    if listaF:
+        for i, parte in enumerate(split_list(listaF, numHilos)):
+            hilo = Thread(target=scrapearMaps, args=(idioma, parte, outputFolder, listaResultados, i))
+            hilo.start()
+            listaHilos.append(hilo)
+
+    for hilo in listaHilos:
+        hilo.join()
+
+    listaFinal = []
+    for parcial in listaResultados:
+        # A slot is still None when its worker never stored a result.
+        if parcial:
+            listaFinal.extend(parcial)
+
+    exportar = ExportarDatosMaps(outputFolder + '00_output.xls', '', listaFinal)
+    exportar.exportarExcel()
+
+if __name__ == "__main__":
+    # Each question is repeated until a valid answer is given.
+
+    while True:
+        # Surrounding blanks and lower case are accepted ('es' -> 'ES').
+        idioma = input('----------\n[1] Introduce the language, (ES o EN): ').strip().upper()
+        if idioma in ('ES', 'EN'):
+            break
+        print("----------\n** Error ** That is not a valid language. Enter a valid language\n")
+
+    while True:
+        fichero = input('----------\n[2] Introduce the path to save the images: ').strip()
+        if os.path.isdir(fichero):
+            # Joining with '' appends the platform separator only when the
+            # path does not already end with one, so file names can be
+            # concatenated directly after it.
+            fichero = os.path.join(fichero, '')
+            break
+        print("----------\n** Error ** That is not a valid folder. Enter a valid folder\n")
+
+    while True:
+        kwLugares = input('----------\n[3] Introduce the path of the keywords txt file: ').strip()
+        if os.path.isfile(kwLugares):
+            break
+        print("----------\n** Error ** That is not a valid txt file. Enter a valid file\n")
+
+    mainGoogleMaps(idioma, kwLugares, fichero)
diff --git a/maps_data_scraper.py b/maps_data_scraper.py
@@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+
+import random
+import time
+import urllib.request
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+from lugar_maps import LugarMaps
+
+class GoogleMapsDataScraper:
+    """Read place details from Google Maps through a headless Chrome driver.
+
+    Usage: create the scraper, call ``initDriver()``, call
+    ``scrapearDatos(keyword)`` once per keyword and finally ``endDriver()``.
+    """
+
+    # Translation table that removes the acute accent from Spanish vowels.
+    _TILDES = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')
+
+    def __init__(self, idioma, imgOutput):
+        """Store the settings; no browser is started until ``initDriver()``.
+
+        idioma: 'EN' selects the English page texts, anything else Spanish.
+        imgOutput: prefix (normally a folder ending in a separator) put in
+            front of the file name of every downloaded image.
+        """
+        self.driver = None
+        # Number of searches that raised since the counter was last reset.
+        self.errorCont = 0
+        self.imgOutput = imgOutput
+        self.configuracion = self.setConfiguracion(idioma)
+
+    def setConfiguracion(self, idioma):
+        """Return the browser language flag and the page texts to look for.
+
+        The texts are matched against ``aria-label`` attributes, so they
+        must be the exact wording shown for the selected language.
+        """
+        if idioma == 'EN':
+            return {
+                'idioma': '--lang=en-GB',
+                'textoEstrellas': 'stars',
+                'textoReviews': 'reviews',
+                'textoDireccion': 'Address: ',
+                'textoWeb': 'Website: ',
+                'textoTelefono': 'Phone: ',
+                'textoPlusCode': 'Plus code: ',
+                'textoHorario': 'Hide open hours for the week',
+                'remplazarHorario': ['. Hide open hours for the week', 'Hours might differ', '; '],
+            }
+        return {
+            'idioma': '--lang=es-ES',
+            'textoEstrellas': 'estrellas',
+            'textoReviews': 'reseñas',
+            'textoDireccion': 'Dirección: ',
+            'textoWeb': 'Sitio web: ',
+            'textoTelefono': 'Teléfono: ',
+            'textoPlusCode': 'Plus Code: ',
+            'textoHorario': 'Ocultar el horario de la semana',
+            'remplazarHorario': [' Ocultar el horario de la semana', 'El horario podría cambiar', '; '],
+        }
+
+    def initDriver(self):
+        """Start headless Chrome and open Google Maps.
+
+        Returns True on success. On failure the error is printed and False
+        is returned; ``self.driver`` may already be set in that case.
+        """
+        try:
+            chrome_options = webdriver.ChromeOptions()
+            chrome_options.add_argument('--headless')
+            chrome_options.add_argument('--no-sandbox')
+            chrome_options.add_argument('--disable-dev-shm-usage')
+            chrome_options.add_argument('--log-level=3')
+            chrome_options.add_argument(self.configuracion['idioma'])
+            s = Service(ChromeDriverManager().install())
+            self.driver = webdriver.Chrome(service=s, options=chrome_options)
+            self.driver.get('https://www.google.com/')
+            try:
+                # Element L2AGLb is presumably the "accept" button of the
+                # consent dialog. It is clicked when present; its absence
+                # must not prevent opening Maps.
+                self.driver.find_element(By.XPATH, '//*[@id="L2AGLb"]').click()
+                time.sleep(2)
+            except Exception:
+                pass
+            self.driver.get('https://www.google.com/maps/')
+            return True
+        except Exception as e:
+            print(e)
+            print('Error with the Chrome Driver')
+            return False
+
+    def quitarTildes(self, s):
+        """Return *s* with accented vowels (á é í ó ú, either case) unaccented."""
+        return s.translate(self._TILDES)
+
+    @staticmethod
+    def _xpathLiteral(texto):
+        """Return *texto* as an XPath string literal.
+
+        XPath 1.0 has no escape character, so a text containing both quote
+        kinds is rebuilt with concat().
+        """
+        if '"' not in texto:
+            return '"' + texto + '"'
+        if "'" not in texto:
+            return "'" + texto + "'"
+        partes = ['"' + parte + '"' for parte in texto.split('"')]
+        return 'concat(' + ', \'"\', '.join(partes) + ')'
+
+    def _xpathAriaLabel(self, texto, etiqueta='*'):
+        """Return the XPath of elements whose aria-label contains *texto*."""
+        return '//' + etiqueta + '[contains(@aria-label, ' + self._xpathLiteral(texto) + ')]'
+
+    def _rutaImagen(self, kw):
+        """Return the destination path of the cover image for keyword *kw*."""
+        nombre = kw.replace('º', '').replace('.', '').replace(' ', '-').replace('/', '-')
+        return self.imgOutput + self.quitarTildes(nombre).lower() + '.jpg'
+
+    def _leerValoraciones(self, lugar):
+        """Fill ``lugar.estrellas`` and ``lugar.resenas`` when both are found.
+
+        Best effort: any failure is printed and both fields are left as is.
+        """
+        conf = self.configuracion
+        try:
+            val = WebDriverWait(self.driver, 5).until(
+                EC.presence_of_element_located((By.XPATH, self._xpathAriaLabel(conf['textoEstrellas']))))
+            estrellas = val.get_attribute("aria-label").replace(conf['textoEstrellas'], '').replace(' ', '')
+
+            val = self.driver.find_element(By.XPATH, self._xpathAriaLabel(conf['textoReviews']))
+            numResenas = val.get_attribute("aria-label").replace(conf['textoReviews'], '').replace(' ', '')
+
+            lugar.estrellas = self.checkValoraciones(estrellas)
+            lugar.resenas = self.checkValoraciones(numResenas)
+        except Exception as e:
+            print(e)
+
+    def _guardarImagen(self, divImg, kw):
+        """Download the first image found inside *divImg*.
+
+        Best effort: any failure is printed and otherwise ignored.
+        """
+        try:
+            imgSrc = divImg.find_element(By.TAG_NAME, 'img').get_attribute("src")
+            # Images whose URL contains 'gstatic' are skipped (presumably
+            # generic placeholders rather than a photo of the place).
+            if 'gstatic' not in imgSrc:
+                urllib.request.urlretrieve(imgSrc, self._rutaImagen(kw))
+        except Exception as e:
+            print(e)
+            print('No se ha podido obtener la imagen')
+
+    def scrapearDatos(self, kw):
+        """Search *kw* in Google Maps and return the place found.
+
+        Returns a LugarMaps, or None when no place page could be opened or
+        when any step raised (the error is printed and counted).
+        """
+        try:
+            lugar = LugarMaps()
+            lugar.keyword = kw
+            if self.errorCont == 5:
+                # After five failed searches, reload Maps before going on.
+                self.errorCont = 0
+                time.sleep(1)
+                self.driver.get('https://www.google.com/maps/')
+                time.sleep(2)
+            time.sleep(random.randint(1, 3))
+            inputBox = WebDriverWait(self.driver, 20).until(
+                EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
+            inputBox.click()
+            inputBox.clear()
+            inputBox.click()
+            time.sleep(1)
+            inputBox.send_keys(kw)
+            time.sleep(1)
+            inputBox.send_keys(Keys.ENTER)
+            time.sleep(4)
+
+            if not self.isLoaded(kw):
+                return None
+
+            divImg = self.driver.find_element(By.ID, 'pane')
+            lugar.nombre = divImg.find_element(By.TAG_NAME, 'h1').text
+            time.sleep(1)
+
+            self._leerValoraciones(lugar)
+            self._guardarImagen(divImg, kw)
+
+            conf = self.configuracion
+            lugar.categoria = self.buscar_xpath('//*[@jsaction="pane.rating.category"]')
+            lugar.direccion = self.buscar_xpath(self._xpathAriaLabel(conf['textoDireccion']))
+            lugar.web = self.buscar_xpath(self._xpathAriaLabel(conf['textoWeb']))
+            lugar.telefono = self.buscar_xpath(self._xpathAriaLabel(conf['textoTelefono']))
+            lugar.pluscode = self.buscar_xpath(self._xpathAriaLabel(conf['textoPlusCode']))
+
+            lugar.horario = self.getHorario()
+
+            return lugar
+        except Exception as e:
+            print(e)
+            self.errorCont += 1
+            return None
+
+    def buscar_xpath(self, xpath):
+        """Return the text of the first element matching *xpath*, or ''."""
+        try:
+            return self.driver.find_element(By.XPATH, xpath).text
+        except Exception:
+            return ''
+
+    def getHorario(self):
+        """Return the opening hours, one entry per line, or '' if not found."""
+        try:
+            horario = self.driver.find_element(
+                By.XPATH, self._xpathAriaLabel(self.configuracion['textoHorario'])).get_attribute('aria-label')
+            sobrante, aviso, separador = self.configuracion['remplazarHorario']
+            return horario.replace(sobrante, '').replace(aviso, '').replace(separador, '\n')
+        except Exception:
+            return ''
+
+    def isLoaded(self, kw):
+        """Return True when a place page is showing for *kw*.
+
+        When the pane has no non-empty ``h1``, the first link inside a
+        ``div`` whose aria-label contains *kw* is clicked and True is
+        returned if that click succeeds.
+        """
+        divImg = self.driver.find_element(By.ID, 'pane')
+        if any(h1.text != '' for h1 in divImg.find_elements(By.TAG_NAME, 'h1')):
+            return True
+        try:
+            # The keyword is free text, so it is quoted as an XPath literal.
+            resultados = self.driver.find_element(By.XPATH, self._xpathAriaLabel(kw, 'div'))
+            resultados.find_element(By.TAG_NAME, 'a').click()
+            time.sleep(3)
+            return True
+        except Exception:
+            return False
+
+    def checkValoraciones(self, val):
+        """Return *val*, or '' when it still contains a rating label text."""
+        conf = self.configuracion
+        if conf['textoEstrellas'] in val or conf['textoReviews'] in val:
+            return ''
+        return val
+
+    def endDriver(self):
+        """Close the browser if one was started; safe to call more than once."""
+        if self.driver is not None:
+            self.driver.quit()
+            self.driver = None