1import email, urllib
2from WebKit import *
3
def loadMHT(filename):
    """
    Load a .MHT HTML archive and return the WebArchive representation.

    ``filename`` is the path of the .mht file; it is parsed with
    MHTLoader and the result of its asWebArchive() method is returned.
    """
    # The loader class in this module is MHTLoader; the previous name
    # (HMTLoad) was not defined and raised NameError on every call.
    return MHTLoader(filename).asWebArchive()
9
10
class MHTLoader (object):
    """
    A loader for .mht files, an archive format used by MS Internet Explorer
    on Windows.

    The file is parsed as a MIME message when the loader is constructed;
    asWebArchive() converts the parsed parts into a WebArchive.
    """

    def __init__(self, filename):
        """
        Parse the MHT file at ``filename`` and record its parts.
        """
        self.filename = filename

        # root of the archive (key into self.parts), None while no part
        # has been stored
        self.root = None

        # Content-Location -> (content-type, data)
        self.parts = {}

        self.loadFile(filename)

    def loadFile(self, filename):
        """
        Read the MIME message in ``filename`` and store every
        non-multipart part in self.parts, keyed by the value of its
        Content-Location header.

        The first part that is stored becomes the root of the archive.
        When two parts share a Content-Location the later one wins.
        """
        # The context manager closes the file even when parsing fails.
        with open(filename, 'r') as fp:
            msg = email.message_from_file(fp)

        for part in msg.walk():
            # Multipart containers only group other parts.
            if part.get_content_maintype() == 'multipart':
                continue

            location = part.get('Content-Location')
            if location is None:
                # Without a location there is no URL to build a resource
                # from, and a None key would be indistinguishable from
                # the "no root yet" marker in self.root.
                continue

            self.parts[location] = (
                part.get_content_type(),
                part.get_payload(decode=True),
            )
            if self.root is None:
                self.root = location

    def fixupURL(self, url):
        """
        Return ``url`` with backslashes replaced by forward slashes when it
        is a file: URL; any other URL is returned unchanged.
        """
        # IE creates MHT files with file: URLS containing backslashes,
        # NSURL insists that those are invalid, replace backslashes by
        # forward slashes.
        if url.startswith('file:'):
            return url.replace('\\', '/')
        else:
            return url

    def _makeResource(self, url, contentType, data):
        # Wrap a single archive part in a WebResource, without a text
        # encoding name or frame name.
        return WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
                NSData.dataWithBytes_length_(data, len(data)),
                NSURL.URLWithString_(self.fixupURL(url)),
                NSString.stringWithString_(contentType),
                None,
                None)

    def asWebArchive(self):
        """
        Convert the MHT archive to a webarchive.

        The root part becomes the main resource, every other part becomes
        a subresource. Raises KeyError when no part was loaded (self.root
        is still None).
        """
        rootType, rootText = self.parts[self.root]

        # Every backslash in the root document is replaced by a forward
        # slash, presumably so that references to the file: URLs match the
        # URLs produced by fixupURL -- TODO confirm. Bytes literals are used
        # because the decoded payload is a byte string.
        rootText = rootText.replace(b'\\', b'/')
        pageResource = self._makeResource(self.root, rootType, rootText)

        resources = [
            self._makeResource(url, tp, data)
            for url, (tp, data) in self.parts.items()
            if url != self.root
        ]

        return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
                pageResource, resources, None)
80
81
def main():
    """
    Ad-hoc test: convert 'audit-web.mht' to 'audit-web.webarchive'.

    Both paths are relative to the current working directory.
    """
    # Testing...
    archive = MHTLoader('audit-web.mht').asWebArchive()

    # Serialise the archive once and let the context manager close the
    # output file even when the write fails.
    with open('audit-web.webarchive', 'wb') as fp:
        fp.write(archive.data().bytes())
90
# Run the conversion test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
93
94