import email
import urllib
from WebKit import *


def loadMHT(filename):
    """
    Load a .mht HTML archive and return the WebArchive representation.

    ``filename`` is the path of the archive on disk. The file is parsed by
    MHTLoader and converted with MHTLoader.asWebArchive().
    """
    return MHTLoader(filename).asWebArchive()


class MHTLoader(object):
    """
    A loader for .mht files, an archive format used by MS Internet Explorer
    on Windows.

    The archive is parsed as a MIME message; every non-multipart part is
    stored in ``self.parts`` keyed by its Content-Location header.
    """

    def __init__(self, filename):
        """
        Parse the archive at ``filename`` and populate ``root``/``parts``.
        """
        self.filename = filename

        # root of the archive (a key of self.parts); the first
        # non-multipart part encountered while walking the message
        self.root = None

        # Content-Location -> (content-type, data)
        self.parts = {}

        self.loadFile(filename)

    def loadFile(self, filename):
        """
        Read the MIME message in ``filename`` and record all of its parts.

        The first non-multipart part becomes the root of the archive.
        """
        # Read the file as raw bytes so the email parser deals with the
        # encodings itself. message_from_binary_file only exists on
        # Python 3; fall back to message_from_file where it is missing.
        parse = getattr(email, 'message_from_binary_file',
                        email.message_from_file)
        with open(filename, 'rb') as fp:
            msg = parse(fp)

        for part in msg.walk():
            if part.get_content_maintype() == 'multipart':
                # Containers carry no payload of their own.
                continue

            # NOTE(review): a part without a Content-Location header is
            # stored under the key None -- confirm whether such archives
            # occur in practice.
            location = part.get('Content-Location')
            contentType = part.get_content_type()
            data = part.get_payload(decode=True)

            self.parts[location] = (contentType, data)
            if self.root is None:
                self.root = location

    def fixupURL(self, url):
        """
        Return ``url`` in a form that NSURL accepts.
        """
        # IE creates MHT files with file: URLs containing backslashes,
        # NSURL insists that those are invalid, replace backslashes by
        # forward slashes.
        if url.startswith('file:'):
            return url.replace('\\', '/')
        return url

    def _makeResource(self, url, contentType, data):
        """
        Build a WebResource for one archive part.
        """
        return WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
            NSData.dataWithBytes_length_(data, len(data)),
            NSURL.URLWithString_(self.fixupURL(url)),
            NSString.stringWithString_(contentType),
            None,
            None)

    def asWebArchive(self):
        """
        Convert the MHT archive to a webarchive.

        The root part becomes the main resource, every other part becomes a
        subresource. Raises KeyError if ``self.root`` is not in ``self.parts``.
        """
        rootType, rootText = self.parts[self.root]

        # Backslashes in the root document are replaced by forward slashes
        # (same substitution as fixupURL). The payload is a byte string, so
        # use bytes literals to keep this working on Python 3 as well.
        rootText = rootText.replace(b'\\', b'/')
        pageResource = self._makeResource(self.root, rootType, rootText)

        resources = []
        for url in self.parts:
            if url == self.root:
                continue

            tp, data = self.parts[url]
            resources.append(self._makeResource(url, tp, data))

        return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
            pageResource, resources, None)


def main():
    """
    Convert 'audit-web.mht' to 'audit-web.webarchive' (manual test).
    """
    # Testing...
    p = MHTLoader('audit-web.mht')
    a = p.asWebArchive()
    d = a.data()
    with open('audit-web.webarchive', 'wb') as fp:
        fp.write(d.bytes())


if __name__ == "__main__":
    main()