generate-cache.py
#!/usr/bin/python3
import hashlib
import json
import logging
import os
import sys

sys.path.append('./disposable/')
from disposable import disposableHostGenerator

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')


def create_cache():
    """Create a hash-based cache of domains and their sources.

    The cache is stored in a 'cache' directory, with one file per
    2-character prefix of the domain's SHA-1 hash. Each file holds a
    JSON object keyed by the full domain hash, mapping to the domain
    and the sources it appears in.
    """
    # Collect the URLs of sources flagged as external, so entries from the
    # source map can be marked accordingly below.
    external_sources = []
    for source in disposableHostGenerator.sources:
        if source.get('external'):
            external_sources.append(source['src'])

    domain_cache = {}
    if not os.path.exists('cache'):
        os.makedirs('cache')
    try:
        # Shard domains into buckets keyed by the first 2 hex characters
        # of their SHA-1 hash (at most 256 cache files).
        with open('domains.txt') as f:
            for domain in f:
                domain = domain.strip()
                if domain.startswith('#') or domain == '':
                    continue
                domain_hash = hashlib.sha1(domain.encode('utf8')).hexdigest()
                hash_prefix = domain_hash[:2]
                if hash_prefix not in domain_cache:
                    domain_cache[hash_prefix] = {}
                domain_cache[hash_prefix][domain_hash] = {
                    'domain': domain,
                    'strict': False,
                    'src': []
                }

        # Add domains from the greylist; these are flagged as strict matches.
        with open('disposable/greylist.txt') as f:
            for domain in f:
                domain = domain.strip()
                if domain.startswith('#') or domain == '':
                    continue
                domain_hash = hashlib.sha1(domain.encode('utf8')).hexdigest()
                hash_prefix = domain_hash[:2]
                if hash_prefix not in domain_cache:
                    domain_cache[hash_prefix] = {}
                domain_cache[hash_prefix][domain_hash] = {
                    'domain': domain,
                    'strict': True,
                    'src': [
                        {
                            'url': 'https://raw.githubusercontent.com/disposable/disposable/master/greylist.txt',
                            'ext': False
                        }
                    ]
                }

        # Add domains from the whitelist; these carry a 'whitelist' flag
        # instead of the 'strict' flag.
        with open('disposable/whitelist.txt') as f:
            for domain in f:
                domain = domain.strip()
                if domain.startswith('#') or domain == '':
                    continue
                domain_hash = hashlib.sha1(domain.encode('utf8')).hexdigest()
                hash_prefix = domain_hash[:2]
                if hash_prefix not in domain_cache:
                    domain_cache[hash_prefix] = {}
                domain_cache[hash_prefix][domain_hash] = {
                    'domain': domain,
                    'whitelist': True,
                    'src': [
                        {
                            'url': 'https://raw.githubusercontent.com/disposable/disposable/master/whitelist.txt',
                            'ext': False
                        }
                    ]
                }
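
        # domains_source_map.txt maps each domain to the list it came from,
        # one "source_url:domain" pair per line. The URL itself contains
        # colons, so the last colon acts as the separator. A made-up
        # example line:
        #   https://example.org/disposable-list.txt:tempmail.example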
        with open('domains_source_map.txt', 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#') or line == '' or ':' not in line:
                    continue
                source_url, domain = line.rsplit(':', 1)
                domain_hash = hashlib.sha1(domain.encode('utf8')).hexdigest()
                hash_prefix = domain_hash[:2]
                # Only annotate domains that are already in the cache.
                if domain_hash not in domain_cache.get(hash_prefix, {}):
                    continue
                domain_cache[hash_prefix][domain_hash]['src'].append({
                    'url': source_url,
                    'ext': source_url in external_sources
                })

        # Write one JSON file per hash prefix.
        for hash_prefix, domain_data in domain_cache.items():
            with open('cache/' + hash_prefix + '.json', 'w') as f:
                json.dump(domain_data, f)
    except Exception:
        # Catch-all: log the failure (with traceback) instead of raising.
        logging.exception('failed to generate cache')
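

# --- Usage sketch (not part of the original script) -------------------------
# A minimal example of how a consumer of the generated cache might look up a
# domain. `lookup_domain` is a hypothetical helper, assuming the on-disk
# layout written by create_cache(); it returns the cached entry or None.
def lookup_domain(domain, cache_dir='cache'):
    domain_hash = hashlib.sha1(domain.encode('utf8')).hexdigest()
    cache_file = os.path.join(cache_dir, domain_hash[:2] + '.json')
    if not os.path.exists(cache_file):
        return None
    with open(cache_file) as f:
        bucket = json.load(f)
    # Returns e.g. {'domain': ..., 'strict': False, 'src': [...]} or None.
    return bucket.get(domain_hash)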


if __name__ == '__main__':
    create_cache()