-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcheck.py
23 lines (20 loc) · 838 Bytes
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import re
import collections
import sys
filename = 'data.txt'
if len(sys.argv) >= 2:
filename = sys.argv[1]
textfile = open(filename, 'r')
filetext = textfile.read()
textfile.close()
#New method might not work on some site indexes, if they don't have a ':' in front of the address
matches = re.findall(":[a-zA-Z0-9]{30,34}", filetext)
#This old method gets false positives, when the name and address of a site are the same
#matches = re.findall("[a-zA-Z0-9]{30,34}", filetext)
#Convert to lowercase because some indexes don't care about accuracy
matches = [x.lower() for x in matches]
print "Total number of sites: ",len(matches)
dups = [item for item, count in collections.Counter(matches).items() if count > 1]
print "Number of Duplicates: ", len(dups)
print "Unique sites: ", len(matches)-len(dups)
print "Duplicated Sites: ", dups