wlscrape.py
#!/usr/bin/env python
from __future__ import print_function
from lxml import html
import itertools
import argparse
import requests
import json
import wget
import sys
import os
__author__ = "Vesselin Bontchev <[email protected]>"
__license__ = "GPL"
__VERSION__ = "1.07"
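
# Base URL of the WikiLeaks site and the search area for the archive
# being scraped (the AKP e-mails collection).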
site = "https://wikileaks.org"
area = "/akp-emails/"

def error(e):
    print("Error: %s." % e, file=sys.stderr)
    sys.exit(-1)
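
# Extract the link, size, and MD5 columns from one page of search
# results, skipping any URL that appears in the blacklist.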
def processData(tree, ext, blacklist):
    try:
        links = tree.xpath("//*[@id='searchresult']/tbody/tr/td[2]/a/@href")
        sizes = tree.xpath("//*[@id='searchresult']/tbody/tr/td[3]/text()")
        md5s = tree.xpath("//*[@id='searchresult']/tbody/tr/td[4]/text()")
    except Exception as e:
        error(e)
    data = []
    for i in range(len(links)):
        theUrl = site + area + links[i]
        if (not theUrl in blacklist):
            data.append({"md5" : md5s[i], "url" : theUrl, "ext" : ext, "size" : sizes[i]})
    return data
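
# Build the search URL for one file extension and walk the result pages
# by following the "next" pagination button.  With --unique, only the
# first entry per MD5 is kept; with --pages, only the listed page
# numbers are processed.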
def processExtension(ext, blacklist, options):
    unique = options.unique
    pageNums = options.pages
    url = site + area + "?file=" + ext
    if (options.spam):
        url += "&spam=1"
    if (options.duplicates):
        url += "&dupl=1"
    url += "&count=" + str(options.elements) + "&page=1&#searchresult"
    print(url, file=sys.stderr)
    pageNum = 1
    theData = []
    seen = set()
    try:
        page = requests.get(url)
        tree = html.fromstring(page.content)
        if ((len(pageNums) == 0) or (pageNum in pageNums)):
            newData = processData(tree, ext, blacklist)
            if (unique):
                for element in newData:
                    if (not element["md5"] in seen):
                        seen.add(element["md5"])
                        theData.append(element)
            else:
                theData += newData
        #nextButtonXPath = "//*[@id='right-pane']/div[5]/div/ul/li[last()]/a/@href"
        nextButtonXPath = "//*[@id='right-pane']/div[3]/div/ul/li[last()]/a/@href"
        next = tree.xpath(nextButtonXPath)
        while len(next):
            url = site + next[0]
            print(url, file=sys.stderr)
            pageNum += 1
            page = requests.get(url)
            tree = html.fromstring(page.content)
            if ((len(pageNums) == 0) or (pageNum in pageNums)):
                newData = processData(tree, ext, blacklist)
                if (unique):
                    for element in newData:
                        if (not element["md5"] in seen):
                            seen.add(element["md5"])
                            theData.append(element)
                else:
                    theData += newData
            next = tree.xpath(nextButtonXPath)
    except Exception as e:
        error(e)
    return theData
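
# Create (if necessary) a zero-padded directory name for one page of
# downloads: "001", "002", and so on.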
def makeOutputDir(pageNum):
    outputDir = str(pageNum).zfill(3)
    try:
        if (not os.path.exists(outputDir)):
            os.mkdir(outputDir)
    except Exception as e:
        error(e)
    return outputDir
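
# Report the results: dump them as JSON by default, print only the MD5
# hashes with --md5only, or fetch the files with --download, spreading
# them across numbered subdirectories when there are more files than
# fit on one result page.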
def printTheData(theData, options):
    md5only = options.md5only
    download = options.download
    elementsPerPage = options.elements
    print("Number of files found: %d." % len(theData), file=sys.stderr)
    try:
        jsonData = json.dumps(theData, indent=4)
        if (not md5only and not download):
            print(jsonData)
        else:
            if (download):
                paginate = False
                outputDir = ""
                elementNum = 1
                if (len(theData) > elementsPerPage):
                    paginate = True
                    pageNum = 1
                    outputDir = makeOutputDir(pageNum)
            for element in theData:
                md5 = element["md5"].upper()
                if (md5only):
                    print(md5)
                if (download):
                    fileName = md5 + "." + element["ext"]
                    if (paginate):
                        if (elementNum > elementsPerPage):
                            elementNum = 1
                            pageNum += 1
                            outputDir = makeOutputDir(pageNum)
                        fileName = os.path.join(outputDir, fileName)
                    outputFile = wget.download(element["url"], out=fileName)
                    elementNum += 1
    except Exception as e:
        error(e)
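
# Parse a page-list argument such as "1,3-5,8" into the page numbers
# 1, 3, 4, 5 and 8: each comma-separated token is either a single page
# number or an inclusive "low-high" range.  Duplicates are removed.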
def getList(argument):
    return list(set(itertools.chain.from_iterable([range(int(y[0]), int(y[1]) + 1) for y in [(x.split('-') + [x])[:2] for x in argument.split(',')]])))
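
# Read the optional blacklist file (the first whitespace-separated
# field of each non-empty line is taken as a URL to skip), clamp the
# per-page element count to the 10-200 range, and scrape each of the
# requested extensions.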
def main(args):
    theData = []
    theBlacklist = []
    if (args.blacklist != None):
        try:
            with open(args.blacklist, "r") as blacklistFile:
                theBlacklist = [line.split()[0] for line in blacklistFile.read().split('\n') if line.strip()]
        except Exception as e:
            error(e)
    if (args.elements < 10):
        args.elements = 10
    if (args.elements > 200):
        args.elements = 200
    for ext in args.ext:
        theData += processExtension(ext, theBlacklist, args)
    printTheData(theData, args)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrapes suspected malware from Wikileaks.")
    # argparse no longer accepts a version= constructor argument in
    # Python 3, so the version string is exposed as a --version action.
    parser.add_argument("--version", action="version",
                        version="%(prog)s version " + __VERSION__)
    parser.add_argument("-m", "--md5only", action="store_true", help="only list the hashes")
    parser.add_argument("-d", "--download", action="store_true", help="download the files")
    parser.add_argument("-s", "--spam", action="store_true", help="look in the spam folder too")
    parser.add_argument("-p", "--duplicates", action="store_true", help="include duplicates")
    parser.add_argument("-u", "--unique", action="store_true", help="fetch only one entry per unique MD5")
    parser.add_argument("-e", "--elements", type=int, default=200, help="elements per page")
    parser.add_argument("-b", "--blacklist", dest="blacklist", default=None, help="blacklist file")
    parser.add_argument("-a", "--pages", type=getList, dest="pages", default=[], help="list of pages to process")
    parser.add_argument("ext", nargs="+", help="file extension")
    args = parser.parse_args()
    main(args)
    sys.exit(0)
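
# Example (hypothetical invocation): list the unique MD5 hashes of all
# "doc" and "xls" attachments, including those filed as spam:
#
#   python wlscrape.py -m -u -s doc xls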