Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saskwoch patch 1 #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 51 additions & 29 deletions mbox-extract-attachments.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

# Related RFCs: 2047, 2044, 1522

# 20150907 JMW Python Version 3.4.3 and other changes

__author__ = "Pablo Castellano <[email protected]>"
__license__ = "GNU GPLv3+"
Expand All @@ -36,21 +37,35 @@
import os
import sys
import email

import codecs # 20150907 JMW For codec error handling

BLACKLIST = ('signature.asc', 'message-footer.txt', 'smime.p7s')
VERBOSE = 1

attachments = 0 #Count extracted attachment
skipped = 0

# 20150907 JMW For codec error handling
# Taken from http://www.gossamer-threads.com/lists/python/python/780611#780611
def replace_spc_error_handler(error):
    """Codec error handler that substitutes spaces for the offending input.

    Meant to be registered with ``codecs.register_error`` and then selected
    by name as the ``errors`` argument of ``encode``/``decode``.

    Args:
        error: A UnicodeEncodeError/UnicodeDecodeError instance. Its
            ``object`` attribute is the data being converted,
            ``start``/``end`` delimit the slice that failed, and ``reason``
            holds the error message.

    Returns:
        A tuple ``(replacement, resume_index)``: one space per offending
        element of ``error.object[error.start:error.end]``, and
        ``error.end`` as the index at which conversion should continue.
    """
    # Keep the replacement the same length as the bad slice so positions in
    # the converted text still line up with the original input.
    bad_length = error.end - error.start
    return (' ' * bad_length, error.end)


# Search for filename or find recursively if it's multipart
def extract_attachment(payload):
global attachments, skipped
filename = payload.get_filename()

if filename is not None:
print "\nAttachment found!"
print("\nAttachment found!")
if filename.find('=?') != -1:
ll = email.header.decode_header(filename)
filename = ""
Expand All @@ -60,7 +75,7 @@ def extract_attachment(payload):
if filename in BLACKLIST:
skipped = skipped + 1
if (VERBOSE >= 1):
print "Skipping %s (blacklist)\n" %filename
print("Skipping %s (blacklist)\n" %filename)
return

# Can the filename be left unspecified??
Expand All @@ -70,15 +85,15 @@ def extract_attachment(payload):
content = payload.as_string()
# Skip headers, go to the content
fh = content.find('\n\n')
content = content[fh:]
content = content[fh:].encode('utf-8') # 20150709 JMW Address error I was getting

# if it's base64....
if payload.get('Content-Transfer-Encoding') == 'base64':
content = base64.decodestring(content)
# quoted-printable
# what else? ...

print "Extracting %s (%d bytes)\n" %(filename, len(content))
print("Extracting %s (%d bytes)\n" %(filename, len(content)))

n = 1
orig_filename = filename
Expand All @@ -87,74 +102,81 @@ def extract_attachment(payload):
n = n+1

try:
fp = open(filename, "w")
fp = open(filename, "wb") # 201500907 JMW Needed to make binary
# fp = open(str(i) + "_" + filename, "w")
fp.write(content)
except IOError:
print "Aborted, IOError!!!"
print("Aborted, IOError!!!")
sys.exit(2)
finally:
fp.close()

attachments = attachments + 1
else:
if payload.is_multipart():
for payl in payload.get_payload():
for payl in payload.get_payload(decode=False): # 20150907 JMW Needed to make decode=False
extract_attachment(payl)


###
print "Extract attachments from mbox files"
print "Copyright (C) 2012 Pablo Castellano"
print "This program comes with ABSOLUTELY NO WARRANTY."
print "This is free software, and you are welcome to redistribute it under certain conditions."
print
print("Extract attachments from mbox files")
print("Copyright (C) 2012 Pablo Castellano")
print("This program comes with ABSOLUTELY NO WARRANTY.")
print("This is free software, and you are welcome to redistribute it under certain conditions.")
print()

codecs.register_error("replace_spc", replace_spc_error_handler) # 20150907 JMW Register error handler

if len(sys.argv) < 2 or len(sys.argv) > 3:
print "Usage: %s <mbox_file> [directory]" %sys.argv[0]
print("Usage: %s <mbox_file> [directory]" %sys.argv[0])
sys.exit(0)

filename = sys.argv[1]
directory = os.path.curdir

if not os.path.exists(filename):
print "File doesn't exist:", filename
print("File doesn't exist:", filename)
sys.exit(1)

if len(sys.argv) == 3:
directory = sys.argv[2]
if not os.path.exists(directory) or not os.path.isdir(directory):
print "Directory doesn't exist:", directory
print("Directory doesn't exist:", directory)
sys.exit(1)

mb = mailbox.mbox(filename)
nmes = len(mb)
# nmes = len(mb) # 20150907 JMW Commented out as not used and a performance hit

os.chdir(directory)

for i in range(len(mb)):
for i in range(len(mb)): # 20150907 JMW With >140k msgs I used a numeric literal here for performance reasons
if (VERBOSE >= 2):
print "Analyzing message number", i
print("Analyzing message number", i)

mes = mb.get_message(i)
em = email.message_from_string(mes.as_string())

subject = em.get('Subject')
if subject.find('=?') != -1:
ll = email.header.decode_header(subject)
subject = ""
for l in ll:
subject = subject + l[0]
# 20150907 JMW Added exception handling to skip two messages that raised AttributeError on NoneType (no Subject header, so em.get('Subject') returned None)
try:
if subject.find('=?') != -1:
ll = email.header.decode_header(subject)
subject = ""
for l in ll:
subject = subject + l[0].decode('utf-8', "replace_spc") # 20150907 JMW Explicit decode
except AttributeError:
print("NonType encountered")
continue

em_from = em.get('From')
if em_from.find('=?') != -1:
ll = email.header.decode_header(em_from)
em_from = ""
for l in ll:
em_from = em_from + l[0]
em_from = em_from + l[0].decode('utf-8', "replace_spc") # 20150907 JMW Explicit decode

if (VERBOSE >= 2):
print "%s - From: %s" %(subject, em_from)
print("%s - From: %s" %(subject, em_from))

filename = mes.get_filename()

Expand All @@ -165,6 +187,6 @@ def extract_attachment(payload):
else:
extract_attachment(em)

print "\n--------------"
print "Total attachments extracted:", attachments
print "Total attachments skipped:", skipped
print("\n--------------")
print("Total attachments extracted:", attachments)
print("Total attachments skipped:", skipped)