diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bccc06 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.vscode/* +__pycache__/* +imagenes-icono.ico \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..29a7f19 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +# Google Maps / Google My Business Scraper 🌎📊 + +This is a scraping script developed with Python and its automation library Selenium. **It reads a list of keywords, searches for each one in Google Maps, and collects the data and cover image of the place found**. + +The script searches for the keywords one by one and stores the data in a list, which is finally exported to an Excel file located in the folder specified by the user when running the script. + +In the presentation video I show the script running without hiding the Chrome window, so the process that the script follows can be seen quite clearly. + +However, although I have added [that version](build/maps_scraper_juaristech_windowed_demo.exe) in the build folder, the final version, which is the one in the source code, works without showing the Chrome window, because it runs 5 simultaneous threads to increase the speed and obtain the results faster. + +For now the script only works for the Spanish and English languages; however, I can add more languages in the future. + +[![Google Maps Scraper](https://juaristech.com/wp-content/uploads/2021/11/google-maps-scraper-result.jpg)](https://juaristech.com/google-maps-scraper "JuarisTech") + +## How to Run It + +To execute this script you need to run it in the command prompt. + +```bash +google_maps_scraper_juaristech.exe +``` + +Then the script asks a few questions whose answers it needs in order to run: + +1. You will need to type "ES" for Spanish or "EN" for English. + + ```bash + [1] Introduce the language, (ES o EN): + ``` +2. You will need to specify the folder to save the output Excel and images. 
For example: *D:\Projects\Spain\Madrid\output\\* + + ```bash + [2] Introduce the path to save the images: + ``` + +3. To finish, you need to specify where the *.txt* file with the keywords to search is located. For example: *D:\Projects\Spain\Madrid\places.txt* + + ```bash + [3] Introduce the path of the keywords txt file: + ``` + +Then the script starts to work, and when it finishes, the Excel file will appear in the output folder. + +--- + +If you have any questions about how to use the program, you can read the article on our website or watch the demo video. + +- Explanatory article: https://juaristech.com/google-maps-data-scraper +- Demo video: https://www.youtube.com/channel/UCAUKSLj_OR1PfguW2ODUD3Q + +## Requirements + +The dependencies are listed in the requirements.txt file. If you want to run the *.py* script with Python, you can install them with the following command: + +```bash +pip install -r requirements.txt +``` + +## Contact + +- Website: [JuarisTech](https://juaristech.com/) +- Email: admin@juaristech.com + diff --git a/build/maps_scraper_juaristech.exe b/build/maps_scraper_juaristech.exe new file mode 100644 index 0000000..03d9297 Binary files /dev/null and b/build/maps_scraper_juaristech.exe differ diff --git a/build/maps_scraper_juaristech_windowed_demo.exe b/build/maps_scraper_juaristech_windowed_demo.exe new file mode 100644 index 0000000..9324126 Binary files /dev/null and b/build/maps_scraper_juaristech_windowed_demo.exe differ diff --git a/exportarDatos.py b/exportarDatos.py new file mode 100644 index 0000000..ade4e22 --- /dev/null +++ b/exportarDatos.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +import xlwt + +class ExportarDatosMaps: + + def __init__(self, nombreFichero, ruta, listaLugares): + self.nombreFichero = nombreFichero + self.ruta = ruta + self.listaLugares = listaLugares + + def exportarExcel(self): + writeBook= xlwt.Workbook(encoding='utf-8') + sheet = writeBook.add_sheet("document",cell_overwrite_ok=True) + style = 
def split_list(a, n):
    """Split ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices receive one extra element, so slice
    lengths differ by at most one. When ``a`` has fewer than ``n`` items the
    trailing slices are empty.

    Args:
        a: Any sliceable sequence with a length (list, tuple, str, ...).
        n: Number of slices to produce.

    Returns:
        A list with ``n`` slices of ``a``, in order.
    """
    base, extra = divmod(len(a), n)
    chunks = []
    start = 0
    for idx in range(n):
        # The first ``extra`` chunks absorb the remainder, one item each.
        stop = start + base + (1 if idx < extra else 0)
        chunks.append(a[start:stop])
        start = stop
    return chunks
def mainGoogleMaps(idioma, ficheroKw, outputFolder, hilos=5):
    """Scrape every keyword listed in ``ficheroKw`` and export the results.

    The keywords are divided with ``split_list`` and each part is processed
    by ``scrapearMaps`` in its own thread. Every worker stores its list in a
    shared slot indexed by its thread number; the slots are merged in thread
    order and written to ``outputFolder + '00_output.xls'``.

    Args:
        idioma: Language code forwarded unchanged to ``scrapearMaps``.
        ficheroKw: Path of a UTF-8 text file with one keyword per line.
            Blank lines are ignored and surrounding whitespace is stripped.
        outputFolder: Output path prefix. The Excel file name is appended by
            plain concatenation, so it must end with a path separator.
        hilos: Maximum number of worker threads. Defaults to 5, the value
            that used to be hard-coded.

    Raises:
        ValueError: If ``hilos`` is smaller than 1.
    """
    if hilos < 1:
        raise ValueError('hilos must be at least 1')

    # 'utf-8-sig' reads plain UTF-8 too, but drops the BOM some editors write
    # at the start of the file; otherwise it would end up in the first keyword.
    with open(ficheroKw, 'r', encoding='utf-8-sig') as archivo:
        lineas = archivo.read().splitlines()
    keywords = [linea.strip() for linea in lineas if linea.strip()]

    # Never start more workers than there are keywords to process.
    numHilos = min(hilos, len(keywords))
    listaResultados = [None] * numHilos
    listaHilos = []

    if numHilos:
        divididos = split_list(keywords, numHilos)
        for i, trozo in enumerate(divididos):
            hilo = Thread(
                target=scrapearMaps,
                args=(idioma, trozo, outputFolder, listaResultados, i),
            )
            hilo.start()
            listaHilos.append(hilo)

    for hilo in listaHilos:
        hilo.join()

    listaFinal = []
    for i, parcial in enumerate(listaResultados):
        if parcial is None:
            # The worker ended before storing its list (it raised); keep the
            # results of the other workers instead of failing on None.
            print('Thread ' + str(i) + ' finished without results')
            continue
        listaFinal.extend(parcial)

    # ExportarDatosMaps saves to ruta + nombreFichero, so this is the same
    # target file as before, with the arguments in their declared order.
    exportar = ExportarDatosMaps('00_output.xls', outputFolder, listaFinal)
    exportar.exportarExcel()
class GoogleMapsDataScraper:
    """Scrape place details from Google Maps through a headless Chrome driver.

    One instance owns one Selenium driver: call ``initDriver()`` once, then
    ``scrapearDatos(kw)`` for each keyword, and ``endDriver()`` when done.
    """

    # Lower- and upper-case acute-accented vowels mapped to the plain vowel.
    _TILDES = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')

    def __init__(self, idioma, imgOutput):
        """Store the settings; the browser itself is started by initDriver().

        Args:
            idioma: 'EN' selects the English texts; any other value selects
                the Spanish ones.
            imgOutput: Path prefix for downloaded cover images. The file name
                is appended by plain concatenation, so it must end with a
                path separator.
        """
        self.driver = None  # set by initDriver()
        self.errorCont = 0  # failed scrapes counted by scrapearDatos()
        self.imgOutput = imgOutput
        self.configuracion = self.setConfiguracion(idioma)

    def setConfiguracion(self, idioma):
        """Return the language-dependent settings used to locate page data.

        Args:
            idioma: 'EN' for English; anything else yields Spanish.

        Returns:
            A new dict holding the Chrome ``--lang`` argument, the text
            fragments searched for inside ``aria-label`` attributes, and
            ``remplazarHorario``: two fragments removed from the opening-hours
            label plus the separator that is turned into a newline.
        """
        if idioma == 'EN':
            return {
                'idioma': '--lang=en-GB',
                'textoEstrellas': 'stars',
                'textoReviews': 'reviews',
                'textoDireccion': 'Address: ',
                'textoWeb': 'Website: ',
                'textoTelefono': 'Phone: ',
                'textoPlusCode': 'Plus code: ',
                'textoHorario': 'Hide open hours for the week',
                'remplazarHorario': ['. Hide open hours for the week', 'Hours might differ', '; '],
            }
        return {
            'idioma': '--lang=es-ES',
            'textoEstrellas': 'estrellas',
            'textoReviews': 'reseñas',
            'textoDireccion': 'Dirección: ',
            'textoWeb': 'Sitio web: ',
            'textoTelefono': 'Teléfono: ',
            'textoPlusCode': 'Plus Code: ',
            'textoHorario': 'Ocultar el horario de la semana',
            'remplazarHorario': [' Ocultar el horario de la semana', 'El horario podría cambiar', '; '],
        }

    def initDriver(self):
        """Start headless Chrome and leave it on the Google Maps page.

        Returns:
            True when Maps was opened, False when any step raised (the error
            is printed).
        """
        try:
            chrome_options = webdriver.ChromeOptions()
            for argumento in ('--headless', '--no-sandbox', '--disable-dev-shm-usage',
                              '--log-level=3', self.configuracion['idioma']):
                chrome_options.add_argument(argumento)
            servicio = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=servicio, options=chrome_options)
            self.driver.get('https://www.google.com/')
            # NOTE(review): "L2AGLb" is presumably the accept button of the
            # cookie-consent dialog - confirm. The dialog is not always shown,
            # so a missing button must not abort the start-up.
            botones = self.driver.find_elements(By.XPATH, '//*[@id="L2AGLb"]')
            if botones:
                botones[0].click()
                time.sleep(2)
            self.driver.get('https://www.google.com/maps/')
            return True
        except Exception as e:
            print('Error with the Chrome Driver')
            print(e)
            return False

    def quitarTildes(self, s):
        """Return ``s`` with acute-accented vowels replaced by plain vowels."""
        return s.translate(self._TILDES)

    def scrapearDatos(self, kw):
        """Search ``kw`` in Google Maps and collect the data of the place.

        Args:
            kw: Text typed into the Maps search box.

        Returns:
            A filled ``LugarMaps``, or None when no place page could be
            opened or any step raised. Raised errors are printed and counted
            in ``errorCont``; after five of them Maps is reloaded before the
            next search.
        """
        try:
            lugar = LugarMaps()
            lugar.keyword = kw
            if self.errorCont == 5:
                self.errorCont = 0
                time.sleep(1)
                self.driver.get('https://www.google.com/maps/')
                time.sleep(2)
            # Random 1-3 s pause before every search.
            time.sleep(random.randint(1, 3))
            inputBox = WebDriverWait(self.driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
            inputBox.click()
            inputBox.clear()
            inputBox.click()
            time.sleep(1)
            inputBox.send_keys(kw)
            time.sleep(1)
            inputBox.send_keys(Keys.ENTER)
            time.sleep(4)

            if not self.isLoaded(kw):
                return None

            panel = self.driver.find_element(By.ID, 'pane')
            lugar.nombre = panel.find_element(By.TAG_NAME, 'h1').text
            time.sleep(1)

            self._leerValoraciones(lugar)
            self._descargarImagen(panel, kw)

            conf = self.configuracion
            lugar.categoria = self.buscar_xpath('//*[@jsaction="pane.rating.category"]')
            lugar.direccion = self.buscar_xpath('//*[contains(@aria-label, "' + conf['textoDireccion'] + '")]')
            lugar.web = self.buscar_xpath('//*[contains(@aria-label, "' + conf['textoWeb'] + '")]')
            lugar.telefono = self.buscar_xpath('//*[contains(@aria-label, "' + conf['textoTelefono'] + '")]')
            lugar.pluscode = self.buscar_xpath('//*[contains(@aria-label, "' + conf['textoPlusCode'] + '")]')
            lugar.horario = self.getHorario()

            return lugar
        except Exception as e:
            print(e)
            self.errorCont += 1
            return None

    def _leerValoraciones(self, lugar):
        """Fill ``lugar.estrellas`` and ``lugar.resenas`` from aria-labels.

        Both fields are left untouched when either element is missing; the
        error is printed.
        """
        textoEstrellas = self.configuracion['textoEstrellas']
        textoReviews = self.configuracion['textoReviews']
        try:
            val = WebDriverWait(self.driver, 5).until(EC.presence_of_element_located(
                (By.XPATH, '//*[contains(@aria-label, "' + textoEstrellas + '")]')))
            estrellas = val.get_attribute("aria-label").replace(textoEstrellas, '').replace(' ', '')

            val = self.driver.find_element(By.XPATH, '//*[contains(@aria-label, "' + textoReviews + '")]')
            numResenas = val.get_attribute("aria-label").replace(textoReviews, '').replace(' ', '')

            lugar.estrellas = self.checkValoraciones(estrellas)
            lugar.resenas = self.checkValoraciones(numResenas)
        except Exception as e:
            print(e)

    def _descargarImagen(self, panel, kw):
        """Download the first image of ``panel`` to the path for ``kw``.

        Best effort: failures are printed and otherwise ignored.
        """
        try:
            imgSrc = panel.find_element(By.TAG_NAME, 'img').get_attribute("src")
            # URLs containing 'gstatic' are skipped.
            # NOTE(review): presumably generic placeholder images - confirm.
            if imgSrc and 'gstatic' not in imgSrc:
                urllib.request.urlretrieve(imgSrc, self._rutaImagen(kw))
        except Exception as e:
            print(e)
            print('No se ha podido obtener la imagen')

    def _rutaImagen(self, kw):
        """Return the .jpg path for ``kw``: lower-case, dashes, no accents."""
        nombre = kw.replace('º', '').replace('.', '').replace(' ', '-').replace('/', '-')
        return self.imgOutput + self.quitarTildes(nombre).lower() + '.jpg'

    @staticmethod
    def _xpathLiteral(texto):
        """Quote ``texto`` as an XPath 1.0 string literal.

        XPath has no escape character, so text containing both quote kinds is
        assembled with ``concat()``.
        """
        if '"' not in texto:
            return '"' + texto + '"'
        if "'" not in texto:
            return "'" + texto + "'"
        partes = texto.split('"')
        return 'concat(' + ', \'"\', '.join('"' + p + '"' for p in partes) + ')'

    def buscar_xpath(self, xpath):
        """Return the text of the first element matching ``xpath``, or ''."""
        try:
            return self.driver.find_element(By.XPATH, xpath).text
        except Exception:
            return ''

    def getHorario(self):
        """Return the opening hours, one entry per line, or '' if not found.

        The text comes from the ``aria-label`` that contains
        ``textoHorario``; the first two ``remplazarHorario`` fragments are
        removed and the third (the separator) becomes a newline.
        """
        try:
            elemento = self.driver.find_element(
                By.XPATH, '//*[contains(@aria-label, "' + self.configuracion['textoHorario'] + '")]')
            horario = elemento.get_attribute('aria-label')
            ruido1, ruido2, separador = self.configuracion['remplazarHorario']
            return horario.replace(ruido1, '').replace(ruido2, '').replace(separador, '\n')
        except Exception:
            return ''

    def isLoaded(self, kw):
        """Report whether a place page is showing after searching ``kw``.

        Returns True when the ``pane`` element has an ``h1`` with text.
        Otherwise it clicks the first link inside the ``div`` whose
        ``aria-label`` contains ``kw``, waits three seconds and returns True;
        it returns False when that link cannot be found or clicked.
        """
        panel = self.driver.find_element(By.ID, 'pane')
        if any(h1.text != '' for h1 in panel.find_elements(By.TAG_NAME, 'h1')):
            return True
        try:
            # kw is free text: quote it safely so a '"' cannot break the XPath.
            resultados = self.driver.find_element(
                By.XPATH, '//div[contains(@aria-label, ' + self._xpathLiteral(kw) + ')]')
            resultados.find_element(By.TAG_NAME, 'a').click()
            time.sleep(3)
            return True
        except Exception:
            return False

    def checkValoraciones(self, val):
        """Return ``val``, or '' when it still contains a rating label text."""
        if self.configuracion['textoEstrellas'] in val or self.configuracion['textoReviews'] in val:
            return ''
        return val

    def endDriver(self):
        """Quit the browser if one was started; safe to call repeatedly."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None