#!/usr/bin/python
# -*- encoding: utf-8 -*-
#
# Microsoft Internet Explorer 5 can save a web page as an .mht file, which is
# a multi-part message in MIME format.
#
# This script reverses the process, decoding the attached files and saving
# them into a folder. The HTML content is rewritten so that image references
# point at the extracted files; everything is placed in the same directory,
# with index.html as the root document.
#
# I assume there is only a single HTML file in the message (its first part)
# and I care only about attachments whose MIME main type is "image"; other
# attachments are ignored.
#
# Usage:
#   mht2html.py file.mht [more_files.mht]

import email
import errno
import os
import os.path
import sys


def _to_bytes(text):
    """Return *text* as a byte string, UTF-8 encoding it when necessary."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def _parse_message(path):
    """Parse the MIME message stored in the file at *path* and return it."""
    # Read raw bytes so 8-bit content cannot trip a text decoder. Python 3
    # offers a dedicated binary parser; Python 2's parser takes any file.
    parser = getattr(email, 'message_from_binary_file',
                     email.message_from_file)
    with open(path, 'rb') as fin:
        return parser(fin)


def _save_images(parts, base, html):
    """Write every image part into *base* and relink *html* to the copies.

    Each image is named after the basename of its Content-Location header.
    Parts without a usable location are skipped. Returns *html* (bytes) with
    every occurrence of an image location replaced by the saved file name.
    """
    for img in parts:
        if img.get_content_maintype() != 'image':
            continue
        location = img.get('Content-Location')
        if not location:
            continue  # Nothing to name the file after.
        # The header may come back as a Header object; normalise to a string.
        location = str(location)
        filename = os.path.basename(location)
        if filename in ('', '.', '..'):
            continue  # Location ends in a separator or is not a file name.
        with open(os.path.join(base, filename), 'wb') as fd:
            fd.write(img.get_payload(decode=True))
        # Replace the image location (full URL) with the local file name.
        html = html.replace(_to_bytes(location), _to_bytes(filename))
    return html


def convert(this):
    """Parse the file and extract the HTML code and the attached images.

    For a multipart message a directory named after *this* (extension
    removed) is created; the images are saved there and the first part is
    written to ``index.html`` in it. For a non-multipart message the decoded
    payload is written next to the input as ``<name>.txt``.

    Exits the process with status 1 if the output directory cannot be
    created.
    """
    msg = _parse_message(this)
    # splitext copes with names that have no extension and with dots that
    # appear only in a directory component.
    base = os.path.splitext(this)[0]

    if not msg.is_multipart():
        with open(base + '.txt', 'wb') as fout:
            fout.write(msg.get_payload(decode=True))
        return

    parts = msg.get_payload()
    html = parts[0].get_payload(decode=True)
    if html is None:
        # The first part is itself multipart, so it has no payload to decode.
        html = b''

    try:
        os.mkdir(base)
    except OSError as e:
        # An already existing directory is fine; anything else is fatal,
        # including a regular file occupying the directory's name.
        if e.errno != errno.EEXIST or not os.path.isdir(base):
            print("ERROR: Cannot create %s: %s" % (base, e))
            sys.exit(1)

    html = _save_images(parts[1:], base, html)

    # Save the HTML content into index.html
    with open(os.path.join(base, 'index.html'), 'wb') as fd:
        fd.write(html)


def main(argv):
    """Convert every file named in *argv[1:]*; return the exit status."""
    if len(argv) < 2:
        print("Usage: %s file.mht [more_files.mht]" % argv[0])
        return 1
    for each in argv[1:]:
        convert(each)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))