## 2018 - Jp Mondoni - get an email list from the INMETRO website using Beautiful Soup ##
## Simple algorithm to find e-mails on the INMETRO website based on company pages ##
## It is set up with RBLE-certified URLs, but could target any pages ##
## The list will later be used as a marketing tool ##
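## Dependencies (assumed installation): pip install requests beautifulsoup4 lxml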
import requests, re, time, csv, string  # sys, os and lxml.html were imported in an earlier draft but never used
from bs4 import BeautifulSoup  # the "lxml" parser backend still requires the lxml package to be installed

labList = []      # relative URLs of every laboratory detail page
contactInfo = []  # one row per lab: [email, accreditation number, name, status, manager, UF, url]
rowlist = []      # paginated listing URLs to crawl
## This function visits each listing URL and collects every laboratory detail link.
## labList[] is later used to fetch each lab's contact information.
def getHrefFromUrl(pages):
    print('[' + time.strftime("%H:%M:%S") + ']>: Searching for INMETRO lab page URLs')
    for i in range(len(pages)):
        page = requests.get(pages[i])
        soup = BeautifulSoup(page.text, "lxml")
        for link in soup.find_all('a', attrs={'href': re.compile(r"^detalhe_laboratorio\.asp\?nom_apelido=")}):  # regex matching the lab detail links
            acqLink = str(link.get('href'))
            #print(acqLink)
            labList.append(acqLink)
    print('[' + time.strftime("%H:%M:%S") + ']>: Discovered ' + str(len(labList)) + ' laboratory web pages.')
    getLabContact(labList)

def getLabContact(acqList):
    # Fields scraped from each page: accreditation number, laboratory name, status, technical manager and state (UF).
    print('[' + time.strftime("%H:%M:%S") + ']>: Acquiring prospect company contact information')
    for i in range(len(acqList)):
    #for i in range(1):  # debug: process a single lab page
        url = 'http://www.inmetro.gov.br/laboratorios/rble/' + acqList[i]
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "lxml")
        # Reset per page so a page missing a field cannot silently reuse the previous lab's values
        acqMail = num_acred = nome_lab = situacao_lab = nome_gerente = sigla_uf = ''
        for link in soup.find_all('a', attrs={'href': re.compile("^mailto:")}):
            acqMail = str(link.get('href'))
        # get "Número da Acreditação" (accreditation number)
        acreds = soup.find_all(lambda tag: tag.name == "td" and "Número da Acreditação " in tag.get_text())
        acreds = acreds[2:]  # this and the slices below skip layout cells; the indexes depend on the page structure
        for acred in acreds:
            num_acred = normalize_string(acred.find_next('td').get_text())
        # get "Laboratório" (laboratory name)
        nomes = soup.find_all(lambda tag: tag.name == "td" and "Laboratório" in tag.get_text())
        nomes = nomes[6:7]
        for nome in nomes:
            nome_lab = normalize_string(nome.find_next('td').get_text())
        # get "Situação" (status)
        situacoes = soup.find_all(lambda tag: tag.name == "td" and "Situação" in tag.get_text())
        situacoes = situacoes[2:3]
        for situacao in situacoes:
            situacao_lab = normalize_string(situacao.find_next('td').get_text())
        # get "Gerente Técnico" (technical manager)
        gerentes = soup.find_all(lambda tag: tag.name == "td" and "Gerente Técnico" in tag.get_text())
        gerentes = gerentes[2:3]
        for gerente in gerentes:
            nome_gerente = normalize_string(gerente.find_next('td').get_text())
        # get "UF" (state abbreviation)
        ufs = soup.find_all(lambda tag: tag.name == "td" and "UF" in tag.get_text())
        ufs = ufs[2:3]
        for uf in ufs:
            sigla_uf = normalize_string(uf.find_next('td').get_text())
        contactInfo.append([acqMail[7:], num_acred, nome_lab, situacao_lab, nome_gerente, sigla_uf, url])  # acqMail[7:] strips the "mailto:" prefix
        print(contactInfo[-1])  # progress: show the row just collected
    print('[' + time.strftime("%H:%M:%S") + ']>: Discovered ' + str(len(contactInfo)) + ' contacts.')
    generateCSV(contactInfo)

def generateCSV(prospectList):
    print('[' + time.strftime("%H:%M:%S") + ']>: Writing csv file with company URL page and e-mail contact')
    # with open('newcsv.csv', 'a', newline='') as labsCSV:  # Append mode
    with open('newcsv.csv', 'w', newline='') as labsCSV:  # Write mode; newline='' avoids blank lines on Windows
        wr = csv.writer(labsCSV, quoting=csv.QUOTE_ALL)
        for row in prospectList:  # write every row (the original range(len(...) - 1) silently dropped the last one)
            wr.writerow(row)
    print('[' + time.strftime("%H:%M:%S") + ']>: Finished writing csv file. Total of ' + str(len(prospectList)) + ' rows were written.')

# Remove whitespace, tabs and line breaks, and capitalize the initial of each word
def normalize_string(text):
    text = text.replace("\n", "")
    text = text.replace("\t", "")
    text = text.strip()
    if len(text) != 2:  # leave two-letter strings (UF codes such as "SP") untouched
        text = string.capwords(text)
    return text
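## Example (illustrative): normalize_string(" LABORATÓRIO\n DE METROLOGIA ") returns
## "Laboratório De Metrologia", while a two-letter UF such as "SP" comes back unchanged.
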
baseURL = 'http://www.inmetro.gov.br/laboratorios/rble/lista_laboratorios.asp?sigLab=&ordem=&tituloLab=&uf=&pais=&descr_escopo=&classe_ensaio=&area_atividade=&ind_tipo_busca=&pagina='
## Alternate version using a txt file with pre-defined URLs
#with open('Pages.txt', encoding='utf-8') as file:
#    rowlist = file.readlines()
#    rowlist = [x.strip() for x in rowlist]
#    #print(rowlist)
## NOTE: Some websites may block consecutive connection requests. Be careful not to get your IP blocked by sending too many requests in a row.
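## A minimal, hypothetical sketch of a politer fetcher (not wired into the functions
## above): sleeping between requests spaces out connections and lowers the chance of
## being throttled or banned. If adopted, it would replace the bare requests.get() calls.
def politeGet(url, delay=1.0):
    time.sleep(delay)                     # pause before every request
    return requests.get(url, timeout=30)  # the timeout value is an assumption; tune as needed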
for x in range(0, 56):  # build the paginated listing URLs (pages 0 through 55)
    pageURL = baseURL + str(x)
    rowlist.append(pageURL)
getHrefFromUrl(rowlist)
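## Running the script end to end crawls the 56 listing pages, visits each laboratory
## page found, and writes the collected contact rows to newcsv.csv in the working directory.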