Skip to content

Commit

Permalink
Correction for not first results
Browse files Browse the repository at this point in the history
  • Loading branch information
patxijuaristi committed Aug 15, 2022
1 parent 059f335 commit fc453fc
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions maps_data_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def initDriver(self):
self.driver = webdriver.Chrome(service=s, options=chrome_options)
self.driver.get('https://www.google.com/')
try:
self.driver.find_element_by_xpath('//*[@id="L2AGLb"]').click()
self.driver.find_element(By.XPATH, '//*[@id="L2AGLb"]').click()
except:
pass
time.sleep(2)
Expand Down Expand Up @@ -98,17 +98,22 @@ def scrapearDatos(self, kw):
return None

divImg = self.driver.find_element(By.XPATH, '//*[@id="pane"]/following-sibling::div')
titulo = divImg.find_element_by_tag_name('h1').text
titulo = divImg.find_element(By.TAG_NAME, 'h1').text
lugar.nombre = titulo
time.sleep(1)
try:
val = WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[contains(@aria-label, "'+self.configuracion['textoEstrellas']+'")]')))
valoraciones = val.get_attribute("aria-label")
estrellas = valoraciones.replace(self.configuracion['textoEstrellas'],'').replace(' ','')

val = self.driver.find_element_by_xpath('//*[contains(@aria-label, "'+self.configuracion['textoReviews']+'")]')
valoraciones = val.get_attribute("aria-label")
numResenas = valoraciones.replace(self.configuracion['textoReviews'],'').replace(' ','')
if '(' in val.text and ')' in val.text:
dividido = val.text.replace(')','').split('(')
estrellas = dividido[0]
numResenas = dividido[1]
else:
valoraciones = val.get_attribute("aria-label")
estrellas = valoraciones.replace(self.configuracion['textoEstrellas'],'').replace(' ','')

val = self.driver.find_element(By.XPATH, '//*[contains(@aria-label, "'+self.configuracion['textoReviews']+'")]')
valoraciones = val.get_attribute("aria-label")
numResenas = valoraciones.replace(self.configuracion['textoReviews'],'').replace(' ','')

lugar.estrellas = self.checkValoraciones(estrellas)
lugar.resenas = self.checkValoraciones(numResenas)
Expand All @@ -117,8 +122,9 @@ def scrapearDatos(self, kw):
pass

try:
imgSrc = divImg.find_element_by_tag_name('img').get_attribute("src")
if(not 'gstatic' in imgSrc):
imgSrc = divImg.find_element(By.XPATH, '//img[@decoding="async"]').get_attribute("src")
#imgSrc = divImg.find_element(By.TAG_NAME, 'img').get_attribute("src")
if not 'gstatic' in imgSrc or not 'streetviewpixels' in imgSrc :
urllib.request.urlretrieve(imgSrc, self.imgOutput+self.quitarTildes(kw.replace('º','').replace('.','').replace(' ','-').replace('/','-')).lower()+'.jpg')
except Exception as e:
print(e)
Expand All @@ -141,14 +147,14 @@ def scrapearDatos(self, kw):

def buscar_xpath(self, xpath):
    """Return the text of the first element matching ``xpath``.

    The element is looked up through ``self.driver`` and its ``.text`` is
    returned. If the lookup fails for any reason, an empty string is
    returned instead of raising.
    """
    try:
        # Use the locator-based API only: the legacy find_element_by_xpath
        # helper was removed in Selenium 4.3, and calling it first would make
        # this method always fall through to the empty-string fallback.
        return self.driver.find_element(By.XPATH, xpath).text
    except Exception:
        # Best-effort lookup: a missing element is reported as ''.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        return ''

def getHorario(self):
try:
horario = self.driver.find_element_by_xpath('//*[contains(@aria-label, "'+self.configuracion['textoHorario']+'")]').get_attribute('aria-label')
horario = self.driver.find_element(By.XPATH, '//*[contains(@aria-label, "'+self.configuracion['textoHorario']+'")]').get_attribute('aria-label')
horario = horario.replace(self.configuracion['remplazarHorario'][0], '')
horario = horario.replace(self.configuracion['remplazarHorario'][1], '')
horario = horario.replace(self.configuracion['remplazarHorario'][2], '\n')
Expand All @@ -159,15 +165,15 @@ def getHorario(self):
def isLoaded(self, kw):
#divImg = self.driver.find_element_by_id('pane')
divImg = self.driver.find_element(By.XPATH, '//*[@id="pane"]/following-sibling::div')
titulo = divImg.find_elements_by_tag_name('h1')
titulo = divImg.find_elements(By.TAG_NAME, 'h1')
vacio = True
for a in titulo:
if(a.text != ''):
return True
if(vacio):
try:
resultados = self.driver.find_element_by_xpath('//div[contains(@aria-label, "'+kw+'")]')
enlace = resultados.find_element_by_tag_name('a')
resultados = self.driver.find_element(By.XPATH, '//div[contains(@aria-label, "'+kw+'")]')
enlace = resultados.find_element(By.TAG_NAME, 'a')
enlace.click()
time.sleep(3)
return True
Expand Down

0 comments on commit fc453fc

Please sign in to comment.