-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_usernames.py
executable file
·98 lines (75 loc) · 2.75 KB
/
extract_usernames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#! /usr/bin/env python3
import argparse
from html.parser import HTMLParser
import re
import requests
class Options(argparse.ArgumentParser):
    """
    Consolidates the argument handling.

    Besides declaring the command line interface, the instance gathers the
    user names that are to be left out of the output (``--exclude`` and
    ``--exclude-list``) and applies them through :meth:`filter_users`.
    """

    def __init__(self):
        super().__init__(description='Extract a list of user names from a given post.')
        # Namespace returned by parse_args(); None until store_args() ran.
        self.parsed_args = None
        # User names to suppress, in the order they were supplied.
        self._blacklist = []
        self.add_argument("post",
                          help="The post to extract the names from")
        self.add_argument("--exclude", nargs='+', metavar="USER",
                          help="Specifically exclude the given user(s)")
        self.add_argument("--exclude-list", metavar="FILE",
                          help="Specifically exclude the user(s) given in the file (one per line)")

    def store_args(self):
        """Parse the process arguments and build the exclusion list."""
        self.parsed_args = self.parse_args()
        self._compile_blacklist()

    def _compile_blacklist(self):
        """Collect exclusions from the command line first, then from the file."""
        self._add_command_line_exclusions()
        self._exclude_channels_from_file()

    def _add_command_line_exclusions(self):
        """Append every name passed via ``--exclude`` (if the option was used)."""
        if self.parsed_args.exclude:
            self._blacklist.extend(self.parsed_args.exclude)

    def _exclude_channels_from_file(self):
        """
        Append the names listed in the ``--exclude-list`` file, one per line.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        if not self.parsed_args.exclude_list:
            return
        with open(self.parsed_args.exclude_list, 'r') as f:
            for line in f:
                # Stray whitespace around an entry would keep it from ever
                # comparing equal to a user name, and blank lines name nobody.
                name = line.strip()
                if name:
                    self._blacklist.append(name)

    def filter_users(self, users):
        """
        Return the entries of ``users`` that are not excluded.

        The input order is preserved and a new list is returned; ``users``
        itself is not modified.
        """
        # Build the lookup set once instead of scanning the list per user.
        blocked = set(self._blacklist)
        return [user for user in users if user not in blocked]
class MyParser(HTMLParser):
    """
    Extracts the user names mentioned inside ``<ts-rocket>`` elements.

    The whole document is parsed in the constructor. Afterwards
    ``usernames`` holds the distinct mentions in case-insensitive
    alphabetical order, and ``extracted`` holds the raw text chunks they
    were taken from.
    """

    # "@name", optionally followed by a space and one capitalised word.
    # Group 1 is the complete mention, group 2 only the optional second word.
    _USERNAME_PATTERN = re.compile("(@[a-zA-Z][a-zA-Z._]+( [A-Z][a-zA-Z.-]*)?)")

    def __init__(self, text):
        """Parse ``text`` (an HTML document as ``str``) and fill ``usernames``."""
        super().__init__()
        # True while the parser is between <ts-rocket> and </ts-rocket>.
        self.extract = False
        self.extracted = []
        self.usernames = []
        self.feed(text)
        # feed() may hold back trailing input it considers incomplete;
        # close() forces that buffered remainder through the handlers.
        self.close()
        self._extract_usernames()

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a ``ts-rocket`` element opens."""
        if tag == "ts-rocket":
            self.extract = True

    def handle_endtag(self, tag):
        """Stop collecting text when a ``ts-rocket`` element closes."""
        if tag == "ts-rocket":
            self.extract = False

    def handle_data(self, data):
        """Keep text seen while collecting, except chunks containing "editors are"."""
        if self.extract and "editors are" not in data:
            self.extracted.append(data)

    def _extract_usernames(self):
        """Scan the collected text for mentions and store them de-duplicated and sorted."""
        unique = {
            match.group(1)
            for chunk in self.extracted
            for match in self._USERNAME_PATTERN.finditer(chunk)
        }
        # Primary key ignores case; the raw string breaks ties so that names
        # differing only by case do not depend on set iteration order.
        self.usernames = sorted(unique, key=lambda s: (s.casefold(), s))
if __name__ == '__main__':
    # Parse the command line and assemble the exclusion list.
    options = Options()
    options.store_args()
    # Without a timeout requests waits forever on an unresponsive server.
    post = requests.get(options.parsed_args.post, timeout=30)
    # Fail loudly on 4xx/5xx instead of scanning an error page for names.
    post.raise_for_status()
    parser = MyParser(post.text)
    # One user name per line, exclusions already removed.
    for user in options.filter_users(parser.usernames):
        print(user)