forked from patxijuaristi/google_maps_scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 56c039f
Showing
10 changed files
with
403 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
.vscode/* | ||
__pycache__/* | ||
imagenes-icono.ico |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Google Maps / Google My Business Scraper 🌎📊 | ||
|
||
This is a scraping script developed with Python and the automation library Selenium. **It consists of reading a list of keywords, searching for them in Google Maps, and getting their data and cover image**. | ||
|
||
The script goes one by one searching for the keyword, and storing the data in a list, to finally export it to an Excel file located in the folder specified by the user when running the script. | ||
|
||
In the presentation video I show the script running without hiding the Chrome window, so the process that the script follows can be seen quite clearly. | ||
|
||
However, although I have added [that version](build/maps_scraper_juaristech_windowed_demo.exe) in the build folder, the final version, which is the one in the source code, works without showing the Chrome window, because it runs 5 simultaneous threads to increase the speed and obtain the results faster. | ||
|
||
For now the script works only for Spanish and English languages, however, I can add more languages in the future. | ||
|
||
[](https://juaristech.com/google-maps-scraper "JuarisTech") | ||
|
||
## How to Run It | ||
|
||
To execute this script you need to run it in the command prompt. | ||
|
||
```bash | ||
google_maps_scraper_juaristech.exe | ||
``` | ||
|
||
Then, some questions will appear, which are necessary to run the script: | ||
|
||
1. You will need to type "ES" for Spanish or "EN" for English. | ||
|
||
```bash | ||
[1] Introduce the language, (ES o EN): | ||
``` | ||
2. You will need to specify the folder to save the output Excel and images. For example: *D:\Projects\Spain\Madrid\output\\* | ||
|
||
```bash | ||
[2] Introduce the path to save the images: | ||
``` | ||
|
||
3. To finish, you need to specify where the *.txt* file with the keywords to search is located. For example: *D:\Projects\Spain\Madrid\places.txt* | ||
|
||
```bash | ||
[3] Introduce the path of the keywords txt file: | ||
``` | ||
|
||
Then the script starts working, and when it finishes, the Excel file will appear in the output folder. | ||
|
||
--- | ||
|
||
If you have any questions about how to use the program, you can read the article on our website or watch the demo video. | ||
|
||
- Explanatory article: https://juaristech.com/google-maps-data-scraper | ||
- Demo video: https://www.youtube.com/channel/UCAUKSLj_OR1PfguW2ODUD3Q | ||
|
||
## Requirements | ||
|
||
The dependencies are listed in the requirements.txt file. If you want to run the *.py* script with Python, you can install them with the following command: | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Contact | ||
|
||
- Website: [JuarisTech](https://juaristech.com/) | ||
- Email: [email protected] | ||
|
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import xlwt | ||
|
||
class ExportarDatosMaps:
    """Write a list of scraped places to a legacy Excel (.xls) workbook.

    Attributes:
        nombreFichero: Name (or full path) of the workbook file to create.
        ruta: Prefix that is prepended verbatim to ``nombreFichero`` when
            the workbook is saved; it may be an empty string.
        listaLugares: Iterable of place objects exposing the attributes
            listed in ``_COLUMNAS``.
    """

    # (header text, attribute read from every place), in column order.
    # Keeping both in one table guarantees headers and values stay aligned.
    _COLUMNAS = (
        ('KEYWORD', 'keyword'),
        ('NAME', 'nombre'),
        ('CATEGORY', 'categoria'),
        ('DIRECTION', 'direccion'),
        ('PHONE', 'telefono'),
        ('WEB', 'web'),
        ('PLUS CODE', 'pluscode'),
        ('OPEN HOURS', 'horario'),
        ('STARS', 'estrellas'),
        ('REVIEWS', 'resenas'),
    )

    def __init__(self, nombreFichero, ruta, listaLugares):
        self.nombreFichero = nombreFichero
        self.ruta = ruta
        self.listaLugares = listaLugares

    def exportarExcel(self):
        """Create the workbook (header row plus one row per place) and save it.

        The file is written to ``self.ruta + self.nombreFichero``.
        """
        libro = xlwt.Workbook(encoding='utf-8')
        hoja = libro.add_sheet("document", cell_overwrite_ok=True)

        for columna, (cabecera, _) in enumerate(self._COLUMNAS):
            hoja.write(0, columna, cabecera)

        # Row 0 holds the headers, so data rows start at 1.
        for fila, lugar in enumerate(self.listaLugares, start=1):
            for columna, (_, atributo) in enumerate(self._COLUMNAS):
                hoja.write(fila, columna, getattr(lugar, atributo))

        libro.save(self.ruta + self.nombreFichero)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
class LugarMaps:
    """Record holding the data collected for a single place.

    Every field is created as an empty string:

    keyword, nombre, categoria, direccion, telefono, web, pluscode,
    estrellas, resenas, horario.
    """

    def __init__(self):
        # All fields share the same initial value, so they are created in
        # one pass (strings are immutable, sharing '' is harmless).
        campos = (
            'keyword',
            'nombre',
            'categoria',
            'direccion',
            'telefono',
            'web',
            'pluscode',
            'estrellas',
            'resenas',
            'horario',
        )
        for campo in campos:
            setattr(self, campo, '')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
from exportarDatos import ExportarDatosMaps | ||
from maps_data_scraper import GoogleMapsDataScraper | ||
from threading import Thread | ||
import sys | ||
import os | ||
|
||
def split_list(a, n):
    """Split ``a`` into ``n`` consecutive chunks of nearly equal length.

    The first ``len(a) % n`` chunks receive one extra element, so chunk
    lengths differ by at most one. Chunks may be empty when ``a`` has
    fewer than ``n`` elements.

    Args:
        a: Sequence supporting ``len`` and slicing.
        n: Number of chunks to produce; must be at least 1.

    Returns:
        A list with exactly ``n`` slices of ``a``, in order.

    Raises:
        ValueError: If ``n`` is less than 1 (a negative ``n`` would
            otherwise silently return an empty list and drop every item).
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got %r' % (n,))
    k, m = divmod(len(a), n)
    return [a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
|
||
def scrapearMaps(idioma, lista, outputFolder, resultados, hilo):
    """Thread worker: scrape every keyword in ``lista`` with its own browser.

    Args:
        idioma: Language code handed to ``GoogleMapsDataScraper``.
        lista: Keywords (strings) this worker is responsible for.
        outputFolder: Folder prefix handed to the scraper for images.
        resultados: Shared list; slot ``hilo`` receives the list of places
            scraped by this worker.
        hilo: Index of this worker, used for the result slot and in the
            progress messages.
    """
    scraper = GoogleMapsDataScraper(idioma, outputFolder)
    listaLugares = []
    total = len(lista)

    try:
        # The return value is deliberately not used to abort: scrapearDatos
        # catches its own failures and returns None for that keyword.
        scraper.initDriver()

        for cont, kw in enumerate(lista, start=1):
            lugar = scraper.scrapearDatos(kw)
            if lugar is not None:
                estado = 'OK'
                listaLugares.append(lugar)
            else:
                estado = 'ERROR'
            print(f'Hilo nº {hilo} {cont}/{total} - {estado} - {kw}')
    finally:
        # Always publish what was collected and close the browser, even if
        # the loop was interrupted, so no Chrome process is left behind.
        resultados[hilo] = listaLugares
        if scraper.driver is not None:
            scraper.endDriver()
def mainGoogleMaps(idioma, ficheroKw, outputFolder):
    """Scrape every keyword of a text file in parallel and export to Excel.

    The keywords are split between up to five threads, each running
    ``scrapearMaps``; the combined results are written to
    ``outputFolder + '00_output.xls'``.

    Args:
        idioma: Language code forwarded to the workers.
        ficheroKw: Path of the UTF-8 text file with one keyword per line.
        outputFolder: Output folder prefix; the workbook name is appended
            to it verbatim, so it is expected to end with a separator.
    """
    # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise become part of the first keyword.
    with open(ficheroKw, 'r', encoding='utf-8-sig') as archivo:
        lineas = archivo.read().splitlines()
    # Blank lines are not keywords; searching for them only wastes time.
    listaF = [linea.strip() for linea in lineas if linea.strip()]

    hilos = 5
    # Empty chunks are dropped so no browser is started for an idle thread.
    divididos = [trozo for trozo in split_list(listaF, hilos) if trozo]
    listaResultados = [None] * len(divididos)
    listaHilos = [
        Thread(target=scrapearMaps, args=(idioma, trozo, outputFolder, listaResultados, i))
        for i, trozo in enumerate(divididos)
    ]

    for hilo in listaHilos:
        hilo.start()
    for hilo in listaHilos:
        hilo.join()

    listaFinal = []
    for parcial in listaResultados:
        # A slot is still None when its worker ended without publishing.
        if parcial:
            listaFinal.extend(parcial)

    exportar = ExportarDatosMaps(outputFolder + '00_output.xls', '', listaFinal)
    exportar.exportarExcel()
|
||
if __name__ == "__main__":
    # Interactive entry point: ask for language, output folder and keyword
    # file, repeating each question until the answer is valid.
    while True:
        # Accept the code in any letter case and ignore stray spaces.
        idioma = input('----------\n[1] Introduce the language, (ES o EN): ').strip().upper()
        if idioma in ('ES', 'EN'):
            break
        print("----------\n** Error ** That is not a valid language. Enter a valid language\n")

    while True:
        fichero = input('----------\n[2] Introduce the path to save the images: ')
        if os.path.isdir(fichero):
            # Joining with '' appends the platform separator only when the
            # path does not already end with one.
            fichero = os.path.join(fichero, '')
            break
        print("----------\n** Error ** That is not a valid folder. Enter a valid folder\n")

    while True:
        kwLugares = input('----------\n[3] Introduce the path of the keywords txt file: ')
        if os.path.isfile(kwLugares):
            break
        print("----------\n** Error ** That is not a valid txt file. Enter a valid file\n")

    mainGoogleMaps(idioma, kwLugares, fichero)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import random | ||
import time | ||
import urllib.request | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.common.keys import Keys | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
||
from lugar_maps import LugarMaps | ||
|
||
class GoogleMapsDataScraper:
    """Scrape place details from Google Maps with a headless Chrome driver.

    Typical use: ``initDriver()`` once, ``scrapearDatos(kw)`` per keyword,
    ``endDriver()`` when finished.

    Attributes:
        driver: Selenium Chrome driver, or ``None`` while no browser is open.
        errorCont: Number of scrapes that raised since the last Maps reload.
        imgOutput: Prefix prepended verbatim to every image file name.
        configuracion: Language-dependent texts used to locate elements.
    """

    # Translation table replacing accented vowels by their plain form.
    _SIN_TILDES = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')

    def __init__(self, idioma, imgOutput):
        self.driver = None
        self.errorCont = 0
        self.imgOutput = imgOutput
        self.configuracion = self.setConfiguracion(idioma)

    def setConfiguracion(self, idioma):
        """Return the texts used to find elements for the given language.

        Args:
            idioma: ``'EN'`` selects the English texts; any other value
                yields the Spanish ones.

        Returns:
            A dict with the Chrome language argument and the aria-label
            fragments searched for in the page.
        """
        conf = {
            'idioma': '--lang=es-ES',
            'textoEstrellas': 'estrellas',
            'textoReviews': 'reseñas',
            'textoDireccion': 'Dirección: ',
            'textoWeb': 'Sitio web: ',
            'textoTelefono': 'Teléfono: ',
            'textoPlusCode': 'Plus Code: ',
            'textoHorario': 'Ocultar el horario de la semana',
            'remplazarHorario': [' Ocultar el horario de la semana', 'El horario podría cambiar', '; ']
        }
        if idioma == 'EN':
            conf.update({
                'idioma': '--lang=en-GB',
                'textoEstrellas': 'stars',
                'textoReviews': 'reviews',
                'textoDireccion': 'Address: ',
                'textoWeb': 'Website: ',
                'textoTelefono': 'Phone: ',
                'textoPlusCode': 'Plus code: ',
                'textoHorario': 'Hide open hours for the week',
                'remplazarHorario': ['. Hide open hours for the week', 'Hours might differ', '; '],
            })

        return conf

    def initDriver(self):
        """Start headless Chrome, accept the Google consent dialog, open Maps.

        Returns:
            ``True`` when Maps was opened, ``False`` if anything failed
            (the error is printed, not raised).
        """
        try:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--log-level=3')
            chrome_options.add_argument(self.configuracion['idioma'])
            s = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=s, options=chrome_options)
            self.driver.get('https://www.google.com/')
            # The consent button may be absent; find_elements returns an
            # empty list instead of raising, so its absence no longer
            # prevents Maps from being opened.
            botones = self.driver.find_elements(By.XPATH, '//*[@id="L2AGLb"]')
            if botones:
                botones[0].click()
                time.sleep(2)
            self.driver.get('https://www.google.com/maps/')
            return True
        except Exception as e:
            print('Error with the Chrome Driver')
            print(e)
            return False

    def quitarTildes(self, s):
        """Return ``s`` with accented vowels (á, é, í, ó, ú, either case) unaccented."""
        return s.translate(self._SIN_TILDES)

    @staticmethod
    def _literalXpath(texto):
        """Return ``texto`` as an XPath string literal, whatever quotes it contains."""
        if '"' not in texto:
            return '"' + texto + '"'
        if "'" not in texto:
            return "'" + texto + "'"
        # XPath 1.0 has no escape character: stitch the pieces with concat().
        partes = ('"' + parte + '"' for parte in texto.split('"'))
        return 'concat(' + ", '\"', ".join(partes) + ')'

    def _xpathAriaLabel(self, texto, etiqueta='*'):
        """Return an XPath matching ``etiqueta`` elements whose aria-label contains ``texto``."""
        return '//' + etiqueta + '[contains(@aria-label, ' + self._literalXpath(texto) + ')]'

    def _nombreImagen(self, kw):
        """Return the image file name derived from the keyword ``kw``."""
        limpio = kw.replace('º', '').replace('.', '').replace(' ', '-').replace('/', '-')
        return self.quitarTildes(limpio).lower() + '.jpg'

    def _leerValoraciones(self, lugar):
        """Fill ``lugar.estrellas`` and ``lugar.resenas``; best effort, errors are printed."""
        conf = self.configuracion
        try:
            val = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, self._xpathAriaLabel(conf['textoEstrellas']))))
            estrellas = val.get_attribute("aria-label").replace(conf['textoEstrellas'], '').replace(' ', '')

            val = self.driver.find_element(By.XPATH, self._xpathAriaLabel(conf['textoReviews']))
            numResenas = val.get_attribute("aria-label").replace(conf['textoReviews'], '').replace(' ', '')

            lugar.estrellas = self.checkValoraciones(estrellas)
            lugar.resenas = self.checkValoraciones(numResenas)
        except Exception as e:
            # Ratings are optional: report the problem and keep scraping.
            print(e)

    def _descargarImagen(self, panel, kw):
        """Download the first image of ``panel``; best effort, errors are printed."""
        try:
            imgSrc = panel.find_element(By.TAG_NAME, 'img').get_attribute("src")
            # NOTE(review): 'gstatic' sources are skipped, presumably because
            # they are generic placeholders rather than place photos - confirm.
            if 'gstatic' not in imgSrc:
                urllib.request.urlretrieve(imgSrc, self.imgOutput + self._nombreImagen(kw))
        except Exception as e:
            print(e)
            print('No se ha podido obtener la imagen')

    def scrapearDatos(self, kw):
        """Search ``kw`` in Google Maps and collect the data of the place found.

        Args:
            kw: Keyword typed into the Maps search box.

        Returns:
            A ``LugarMaps`` with the data found, or ``None`` when the place
            did not load or an error occurred (the error is printed).
        """
        try:
            lugar = LugarMaps()
            lugar.keyword = kw
            if self.errorCont == 5:
                # After five scrapes that raised, reload Maps to start from
                # a clean page.
                self.errorCont = 0
                time.sleep(1)
                self.driver.get('https://www.google.com/maps/')
                time.sleep(2)
            time.sleep(random.randint(1, 3))
            inputBox = WebDriverWait(self.driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
            inputBox.click()
            inputBox.clear()
            inputBox.click()
            time.sleep(1)
            inputBox.send_keys(kw)
            time.sleep(1)
            inputBox.send_keys(Keys.ENTER)
            time.sleep(4)

            if not self.isLoaded(kw):
                return None

            divImg = self.driver.find_element(By.ID, 'pane')
            lugar.nombre = divImg.find_element(By.TAG_NAME, 'h1').text
            time.sleep(1)

            self._leerValoraciones(lugar)
            self._descargarImagen(divImg, kw)

            conf = self.configuracion
            lugar.categoria = self.buscar_xpath('//*[@jsaction="pane.rating.category"]')
            lugar.direccion = self.buscar_xpath(self._xpathAriaLabel(conf['textoDireccion']))
            lugar.web = self.buscar_xpath(self._xpathAriaLabel(conf['textoWeb']))
            lugar.telefono = self.buscar_xpath(self._xpathAriaLabel(conf['textoTelefono']))
            lugar.pluscode = self.buscar_xpath(self._xpathAriaLabel(conf['textoPlusCode']))

            lugar.horario = self.getHorario()

            return lugar
        except Exception as e:
            print(e)
            self.errorCont += 1
            return None

    def buscar_xpath(self, xpath):
        """Return the text of the first element matching ``xpath``, or ``''`` on failure."""
        try:
            return self.driver.find_element(By.XPATH, xpath).text
        except Exception:
            return ''

    def getHorario(self):
        """Return the opening hours, one entry per line, or ``''`` when unavailable."""
        try:
            conf = self.configuracion
            horario = self.driver.find_element(
                By.XPATH, self._xpathAriaLabel(conf['textoHorario'])).get_attribute('aria-label')
            quitar1, quitar2, separador = conf['remplazarHorario']
            # Drop the two fixed phrases and turn the separator into newlines.
            return horario.replace(quitar1, '').replace(quitar2, '').replace(separador, '\n')
        except Exception:
            return ''

    def isLoaded(self, kw):
        """Tell whether a place page is shown for ``kw``.

        If no non-empty ``h1`` is found in the pane, the first link inside
        a ``div`` whose aria-label contains ``kw`` is clicked as a fallback.

        Returns:
            ``True`` when a title is present or the fallback click worked,
            ``False`` otherwise.
        """
        divImg = self.driver.find_element(By.ID, 'pane')
        if any(h1.text != '' for h1 in divImg.find_elements(By.TAG_NAME, 'h1')):
            return True
        try:
            resultados = self.driver.find_element(By.XPATH, self._xpathAriaLabel(kw, 'div'))
            resultados.find_element(By.TAG_NAME, 'a').click()
            time.sleep(3)
            return True
        except Exception:
            return False

    def checkValoraciones(self, val):
        """Return ``val``, or ``''`` if it still contains the stars/reviews label text."""
        conf = self.configuracion
        if conf['textoEstrellas'] in val or conf['textoReviews'] in val:
            return ''
        return val

    def endDriver(self):
        """Close the browser if one is open; safe to call more than once."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None
Oops, something went wrong.