Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
patxijuaristi committed Nov 19, 2021
0 parents commit 56c039f
Show file tree
Hide file tree
Showing 10 changed files with 403 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.vscode/*
__pycache__/*
imagenes-icono.ico
63 changes: 63 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Google Maps / Google My Business Scraper 🌎📊

This is a scraping script developed with Python and its automation library Selenium. **It reads a list of keywords, searches each of them in Google Maps, and collects the data and cover image of the resulting place**.

The script goes one by one searching for the keyword, and storing the data in a list, to finally export it to an Excel file located in the folder specified by the user when running the script.

In the presentation video I show the script running without hiding the Chrome window, so the process that the script follows can be seen quite clearly.

However, although I have added [that version](build/maps_scraper_juaristech_windowed_demo.exe) to the build folder, the final version (the one in the source code) runs without showing the Chrome window, because it works with 5 simultaneous threads to increase the speed and obtain the results faster.

For now the script works only for Spanish and English languages, however, I can add more languages in the future.

[![Google Maps Scraper](https://juaristech.com/wp-content/uploads/2021/11/google-maps-scraper-result.jpg)](https://juaristech.com/google-maps-scraper "JuarisTech")

## How to Run It

To execute this script you need to run it in the command prompt.

```bash
maps_scraper_juaristech.exe
```

Then, some questions will appear, which are necessary to run the script:

1. You will need to type "ES" for Spanish or "EN" for English.

```bash
[1] Introduce the language, (ES o EN):
```
2. You will need to specify the folder to save the output Excel and images. For example: *D:\Projects\Spain\Madrid\output\\*

```bash
[2] Introduce the path to save the images:
```

3. To finish, you need to specify where the *.txt* file with the keywords to search is located. For example: *D:\Projects\Spain\Madrid\places.txt*

```bash
[3] Introduce the path of the keywords txt file:
```

Then the script starts working, and when it finishes, the Excel file will appear in the output folder.

---

If you have any doubts about how to use the program, you can read the article on our website or watch the demo video.

- Explanatory article: https://juaristech.com/google-maps-data-scraper
- Demo video: https://www.youtube.com/channel/UCAUKSLj_OR1PfguW2ODUD3Q

## Requirements

The requirements are specified in the requirements.txt file. If you want to run the *.py* script with Python, you can install the dependencies with the following command:

```bash
pip install -r requirements.txt
```

## Contact

- Website: [JuarisTech](https://juaristech.com/)
- Email: [email protected]

Binary file added build/maps_scraper_juaristech.exe
Binary file not shown.
Binary file added build/maps_scraper_juaristech_windowed_demo.exe
Binary file not shown.
42 changes: 42 additions & 0 deletions exportarDatos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

import xlwt

class ExportarDatosMaps:
    """Write a list of scraped places to an ``.xls`` workbook via ``xlwt``.

    The constructor only stores its arguments; the workbook is built and
    saved by :meth:`exportarExcel`.
    """

    # (header text, attribute read from each place), in column order.
    # Keeping both in one table guarantees header and data stay aligned.
    COLUMNAS = (
        ('KEYWORD', 'keyword'),
        ('NAME', 'nombre'),
        ('CATEGORY', 'categoria'),
        ('DIRECTION', 'direccion'),
        ('PHONE', 'telefono'),
        ('WEB', 'web'),
        ('PLUS CODE', 'pluscode'),
        ('OPEN HOURS', 'horario'),
        ('STARS', 'estrellas'),
        ('REVIEWS', 'resenas'),
    )

    def __init__(self, nombreFichero, ruta, listaLugares):
        """Store the export parameters.

        Args:
            nombreFichero: Name of the workbook file to create.
            ruta: Prefix concatenated verbatim in front of ``nombreFichero``;
                no separator is inserted, so it must be empty or already end
                with one.
            listaLugares: Iterable of objects exposing every attribute named
                in ``COLUMNAS``.
        """
        self.nombreFichero = nombreFichero
        self.ruta = ruta
        self.listaLugares = listaLugares

    def exportarExcel(self):
        """Create a one-sheet workbook (header row + one row per place) and save it."""
        writeBook = xlwt.Workbook(encoding='utf-8')
        sheet = writeBook.add_sheet("document", cell_overwrite_ok=True)

        for columna, (cabecera, _) in enumerate(self.COLUMNAS):
            sheet.write(0, columna, cabecera)

        # Row 0 holds the headers, so data rows start at 1.
        for fila, lugar in enumerate(self.listaLugares, start=1):
            for columna, (_, atributo) in enumerate(self.COLUMNAS):
                sheet.write(fila, columna, getattr(lugar, atributo))

        writeBook.save(self.ruta + self.nombreFichero)
15 changes: 15 additions & 0 deletions lugar_maps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

class LugarMaps:
    """Plain record for the data scraped about one place.

    Every field is created as an empty string; callers fill in whatever
    they manage to obtain.
    """

    def __init__(self):
        # Same fields, in the same creation order, all starting empty.
        for campo in (
            'keyword',
            'nombre',
            'categoria',
            'direccion',
            'telefono',
            'web',
            'pluscode',
            'estrellas',
            'resenas',
            'horario',
        ):
            setattr(self, campo, '')
84 changes: 84 additions & 0 deletions main_datos_maps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-


from exportarDatos import ExportarDatosMaps
from maps_data_scraper import GoogleMapsDataScraper
from threading import Thread
import sys
import os

def split_list(a, n):
    """Split ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices receive one extra element. Returns a
    list with the ``n`` slices (some may be empty when ``a`` is short).
    """
    base, sobrantes = divmod(len(a), n)
    trozos = []
    inicio = 0
    for indice in range(n):
        # Leading chunks absorb the remainder, one element each.
        fin = inicio + base + (1 if indice < sobrantes else 0)
        trozos.append(a[inicio:fin])
        inicio = fin
    return trozos

def scrapearMaps(idioma, lista, outputFolder, resultados, hilo):
    """Thread worker: scrape every keyword in ``lista`` and publish the results.

    Args:
        idioma: Language code handed to ``GoogleMapsDataScraper``.
        lista: Keywords assigned to this worker.
        outputFolder: Folder handed to the scraper for downloaded images.
        resultados: Shared list; slot ``hilo`` receives this worker's places.
        hilo: Index of this worker, used for the result slot and log lines.
    """
    listaLugares = []
    # Publish the list object up front so the slot is never left as None,
    # even if something below raises; appended places show up through it.
    resultados[hilo] = listaLugares

    if not lista:
        # Nothing to do: avoid starting a browser for an empty share.
        return

    scraper = GoogleMapsDataScraper(idioma, outputFolder)
    try:
        # A failed init is deliberately not fatal: scrapearDatos reports the
        # failure per keyword, as before.
        scraper.initDriver()
        total = len(lista)

        for cont, kw in enumerate(lista, start=1):
            lugar = scraper.scrapearDatos(kw)
            estado = 'OK' if lugar is not None else 'ERROR'
            print(f'Hilo nº {hilo} {cont}/{total} - {estado} - {kw}')
            if lugar is not None:
                listaLugares.append(lugar)
    finally:
        # Always release the browser; previously it was left running.
        if scraper.driver is not None:
            scraper.endDriver()
def mainGoogleMaps(idioma, ficheroKw, outputFolder):
    """Scrape every keyword of ``ficheroKw`` with 5 threads and export to Excel.

    Args:
        idioma: Language code forwarded to each worker.
        ficheroKw: Path of a UTF-8 text file with one keyword per line.
        outputFolder: Output folder prefix (must end with a path separator);
            the workbook is written to ``outputFolder + '00_output.xls'``.
    """
    # 'utf-8-sig' also reads plain UTF-8 but drops a leading BOM, which would
    # otherwise become part of the first keyword.
    with open(ficheroKw, 'r', encoding='utf-8-sig') as archivo:
        # Blank lines are not keywords; searching for them only wastes time.
        listaF = [linea for linea in archivo.read().splitlines() if linea.strip()]

    hilos = 5
    listaResultados = [None] * hilos
    divididos = split_list(listaF, hilos)

    listaHilos = [
        Thread(target=scrapearMaps, args=(idioma, divididos[i], outputFolder, listaResultados, i))
        for i in range(hilos)
    ]
    for hilo in listaHilos:
        hilo.start()
    for hilo in listaHilos:
        hilo.join()

    listaFinal = []
    for parcial in listaResultados:
        # A slot is still None if its worker died before publishing results.
        if parcial is not None:
            listaFinal.extend(parcial)

    exportar = ExportarDatosMaps(outputFolder+'00_output.xls', '', listaFinal)
    exportar.exportarExcel()

def normalizarCarpeta(carpeta):
    """Return ``carpeta`` guaranteed to end with a path separator.

    A path that already ends with a separator is returned unchanged. The
    original check (``c != '/' or c != '\\'``) was always true, so a
    backslash was appended unconditionally and every ``/`` was rewritten.
    """
    return os.path.join(carpeta, '')


if __name__ == "__main__":
    # [1] Language: accepted case-insensitively, passed on in upper case.
    while True:
        idioma = input('----------\n[1] Introduce the language, (ES o EN): ').strip().upper()
        if idioma in ('ES', 'EN'):
            break
        print("----------\n** Error ** That is not a valid language. Enter a valid language\n")

    # [2] Output folder: must exist; downstream code concatenates file names
    # directly onto it, hence the trailing separator.
    while True:
        fichero = input('----------\n[2] Introduce the path to save the images: ')
        if os.path.isdir(fichero):
            fichero = normalizarCarpeta(fichero)
            break
        print("----------\n** Error ** That is not a valid folder. Enter a valid folder\n")

    # [3] Keywords file: must be an existing file.
    while True:
        kwLugares = input('----------\n[3] Introduce the path of the keywords txt file: ')
        if os.path.isfile(kwLugares):
            break
        print("----------\n** Error ** That is not a valid txt file. Enter a valid file\n")

    mainGoogleMaps(idioma, kwLugares, fichero)
182 changes: 182 additions & 0 deletions maps_data_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-

import random
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from lugar_maps import LugarMaps

class GoogleMapsDataScraper:
    """Look up keywords in Google Maps with a headless Chrome session.

    Typical use: ``initDriver()`` once, ``scrapearDatos(kw)`` per keyword,
    ``endDriver()`` at the end.
    """

    # Translation table for quitarTildes: acute-accented vowels -> plain vowels.
    _TILDES = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')

    def __init__(self, idioma, imgOutput):
        """Store settings; no browser is started until ``initDriver``.

        Args:
            idioma: ``'EN'`` for English; anything else selects Spanish.
            imgOutput: Prefix prepended verbatim to each image file name.
        """
        self.driver = None
        # Failed lookups since Maps was last reloaded.
        self.errorCont = 0
        self.imgOutput = imgOutput
        self.configuracion = self.setConfiguracion(idioma)

    def setConfiguracion(self, idioma):
        """Return the Chrome language flag and the UI texts used as locators.

        The texts are matched against ``aria-label`` attributes, so they must
        be in the same language the browser is started with.
        """
        if idioma == 'EN':
            return {
                'idioma': '--lang=en-GB',
                'textoEstrellas': 'stars',
                'textoReviews': 'reviews',
                'textoDireccion': 'Address: ',
                'textoWeb': 'Website: ',
                'textoTelefono': 'Phone: ',
                'textoPlusCode': 'Plus code: ',
                'textoHorario': 'Hide open hours for the week',
                'remplazarHorario': ['. Hide open hours for the week', 'Hours might differ', '; '],
            }
        # Spanish is the fallback for every other value.
        return {
            'idioma': '--lang=es-ES',
            'textoEstrellas': 'estrellas',
            'textoReviews': 'reseñas',
            'textoDireccion': 'Dirección: ',
            'textoWeb': 'Sitio web: ',
            'textoTelefono': 'Teléfono: ',
            'textoPlusCode': 'Plus Code: ',
            'textoHorario': 'Ocultar el horario de la semana',
            'remplazarHorario': [' Ocultar el horario de la semana', 'El horario podría cambiar', '; '],
        }

    def initDriver(self):
        """Start headless Chrome and open Google Maps.

        Returns:
            True on success, False if anything failed (the error is printed).
        """
        try:
            chrome_options = webdriver.ChromeOptions()
            for argumento in (
                '--headless',
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--log-level=3',
                self.configuracion['idioma'],
            ):
                chrome_options.add_argument(argumento)
            s = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=s, options=chrome_options)
            self.driver.get('https://www.google.com/')
            # NOTE(review): 'L2AGLb' looks like the accept button of Google's
            # consent dialog - confirm. It is clicked only when present, so a
            # session without the dialog no longer aborts the initialisation.
            botones = self.driver.find_elements(By.ID, 'L2AGLb')
            if botones:
                botones[0].click()
                time.sleep(2)
            self.driver.get('https://www.google.com/maps/')
            return True
        except Exception as e:
            print('Error with the Chrome Driver')
            print(e)
            return False

    def quitarTildes(self, s):
        """Return ``s`` with acute-accented vowels (either case) unaccented."""
        return s.translate(self._TILDES)

    def scrapearDatos(self, kw):
        """Search ``kw`` in Maps and return a filled ``LugarMaps``.

        Returns:
            The place, or None when no result page loaded or an error
            occurred (errors are printed and counted in ``errorCont``).
        """
        try:
            lugar = LugarMaps()
            lugar.keyword = kw

            self._buscar(kw)
            if not self.isLoaded(kw):
                return None

            divImg = self.driver.find_element(By.ID, 'pane')
            lugar.nombre = divImg.find_element(By.TAG_NAME, 'h1').text
            time.sleep(1)

            self._leerValoraciones(lugar)
            self._descargarImagen(divImg, kw)

            lugar.categoria = self.buscar_xpath('//*[@jsaction="pane.rating.category"]')
            lugar.direccion = self._buscarPorEtiqueta('textoDireccion')
            lugar.web = self._buscarPorEtiqueta('textoWeb')
            lugar.telefono = self._buscarPorEtiqueta('textoTelefono')
            lugar.pluscode = self._buscarPorEtiqueta('textoPlusCode')

            lugar.horario = self.getHorario()

            return lugar
        except Exception as e:
            print(e)
            self.errorCont += 1
            return None

    def _buscar(self, kw):
        """Type ``kw`` into the Maps search box and submit it."""
        if self.errorCont >= 5:
            # After five failures reload Maps to get back to a clean page.
            self.errorCont = 0
            time.sleep(1)
            self.driver.get('https://www.google.com/maps/')
            time.sleep(2)
        time.sleep(random.randint(1, 3))
        inputBox = WebDriverWait(self.driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
        inputBox.click()
        inputBox.clear()
        inputBox.click()
        time.sleep(1)
        inputBox.send_keys(kw)
        time.sleep(1)
        inputBox.send_keys(Keys.ENTER)
        time.sleep(4)

    def _leerValoraciones(self, lugar):
        """Best-effort: fill ``lugar.estrellas`` and ``lugar.resenas``.

        Both stay untouched if either value cannot be read.
        """
        textoEstrellas = self.configuracion['textoEstrellas']
        textoReviews = self.configuracion['textoReviews']
        try:
            val = WebDriverWait(self.driver, 5).until(EC.presence_of_element_located(
                (By.XPATH, '//*[contains(@aria-label, "' + textoEstrellas + '")]')))
            estrellas = val.get_attribute("aria-label").replace(textoEstrellas, '').replace(' ', '')

            val = self.driver.find_element(
                By.XPATH, '//*[contains(@aria-label, "' + textoReviews + '")]')
            numResenas = val.get_attribute("aria-label").replace(textoReviews, '').replace(' ', '')

            lugar.estrellas = self.checkValoraciones(estrellas)
            lugar.resenas = self.checkValoraciones(numResenas)
        except Exception as e:
            print(e)

    def _nombreImagen(self, kw):
        """Return the output path of the image for ``kw``."""
        limpio = kw.replace('º', '').replace('.', '').replace(' ', '-').replace('/', '-')
        return self.imgOutput + self.quitarTildes(limpio).lower() + '.jpg'

    def _descargarImagen(self, divImg, kw):
        """Best-effort: download the first image found inside ``divImg``."""
        try:
            imgSrc = divImg.find_element(By.TAG_NAME, 'img').get_attribute("src")
            # Images whose URL contains 'gstatic' are skipped.
            if 'gstatic' not in imgSrc:
                urllib.request.urlretrieve(imgSrc, self._nombreImagen(kw))
        except Exception as e:
            print(e)
            print('No se ha podido obtener la imagen')

    def _buscarPorEtiqueta(self, clave):
        """Return the text of the element whose aria-label contains the configured text ``clave``."""
        return self.buscar_xpath(
            '//*[contains(@aria-label, "' + self.configuracion[clave] + '")]')

    def buscar_xpath(self, xpath):
        """Return the text of the first element matching ``xpath``, or '' on any failure."""
        try:
            return self.driver.find_element(By.XPATH, xpath).text
        except Exception:
            return ''

    def getHorario(self):
        """Return the opening hours, one entry per line, or '' if unavailable."""
        try:
            horario = self.driver.find_element(
                By.XPATH,
                '//*[contains(@aria-label, "' + self.configuracion['textoHorario'] + '")]'
            ).get_attribute('aria-label')
            quitar1, quitar2, separador = self.configuracion['remplazarHorario']
            return horario.replace(quitar1, '').replace(quitar2, '').replace(separador, '\n')
        except Exception:
            return ''

    def isLoaded(self, kw):
        """Report whether a place page is showing, opening a listed result if needed.

        Returns True when the pane has a non-empty ``h1``. Otherwise it tries
        to click the link inside a ``div`` whose aria-label contains ``kw``
        and returns True if that click succeeded, False if not.
        """
        divImg = self.driver.find_element(By.ID, 'pane')
        if any(a.text != '' for a in divImg.find_elements(By.TAG_NAME, 'h1')):
            return True
        try:
            resultados = self.driver.find_element(
                By.XPATH, '//div[contains(@aria-label, "' + kw + '")]')
            resultados.find_element(By.TAG_NAME, 'a').click()
            time.sleep(3)
            return True
        except Exception:
            return False

    def checkValoraciones(self, val):
        """Return ``val``, or '' if it still contains the stars/reviews label text."""
        if self.configuracion['textoEstrellas'] in val or self.configuracion['textoReviews'] in val:
            return ''
        return val

    def endDriver(self):
        """Quit the browser if one was started; safe to call more than once."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None
Loading

0 comments on commit 56c039f

Please sign in to comment.