"""
Parse a PDF file and print some information about it.

For every page of each input document the script scans the page's content
stream, classifies the images it finds (images with intrinsic color, image
masks, images masked with a mask, images masked with colors), prints the
per-page counts and finally a per-document summary.

Only top level images are counted; images embedded in forms, patterns or
other resources are not.

Usage:
    python parse_page_contents.py inputfile ...
"""
from __future__ import print_function

from Quartz import *
import objc
import sys
import array

import Quartz


class MyDataScan(object):
    """Per-page tally of the image varieties found while scanning."""

    def __init__(self):
        # Images carrying their own color data, with no mask of any kind.
        self.numImagesWithColorThisPage = 0
        # Images that are themselves masks (ImageMask / IM is true).
        self.numImageMasksThisPage = 0
        # Images masked by another image (SMask entry or stream Mask entry).
        self.numImagesMaskedWithMaskThisPage = 0
        # Images masked by an array of colors (array Mask entry).
        self.numImagesMaskedWithColorsThisPage = 0


def printPageResults(outFile, myData, pageNum):
    """Write the non-zero image counts of one page to *outFile*.

    outFile -- writable file-like object receiving the report lines.
    myData  -- the MyDataScan holding the counts for the page.
    pageNum -- page number used in the report lines.

    Categories whose count is zero produce no output.
    """
    findings = (
        (myData.numImagesWithColorThisPage,
         "Found %d images with intrinsic color on page %d."),
        (myData.numImageMasksThisPage,
         "Found %d image masks on page %d."),
        (myData.numImagesMaskedWithMaskThisPage,
         "Found %d images masked with masks on page %d."),
        (myData.numImagesMaskedWithColorsThisPage,
         "Found %d images masked with colors on page %d."),
    )
    for count, template in findings:
        if count:
            print(template % (count, pageNum), file=outFile)


def printDocResults(outFile, totPages, totImages):
    """Write the document summary, framed by blank lines, to *outFile*."""
    print(file=outFile)
    print("Summary: %d page document contains %d images." % (
        totPages, totImages), file=outFile)
    print(file=outFile)


def checkImageType(imageDict, myScanData):
    """Classify one image dictionary and bump the matching counter.

    imageDict  -- the PDF dictionary of an image XObject or inline image.
    myScanData -- the MyDataScan whose counters are updated in place.

    Exactly one counter is incremented, except for a "Mask" entry that is
    neither a stream nor an array: that is reported on stderr and the image
    is not counted.
    """
    # Byte-string literals are identical to plain strings on Python 2.
    # NOTE(review): they are presumably also what the C-string parameters
    # of these functions expect on Python 3 -- confirm against PyObjC.
    hasMaskKey, isMask = CGPDFDictionaryGetBoolean(
        imageDict, b"ImageMask", None)
    if not hasMaskKey:
        # "IM" is tried as an alternative key when "ImageMask" is absent.
        hasMaskKey, isMask = CGPDFDictionaryGetBoolean(imageDict, b"IM", None)

    if hasMaskKey and isMask:
        myScanData.numImageMasksThisPage += 1
        return

    # If image is masked with an alpha image it has an SMask entry.
    hasSMaskKey, smaskObject = CGPDFDictionaryGetObject(
        imageDict, b"SMask", None)
    if hasSMaskKey:
        # This object must be an XObject that is an image.
        # This code assumes the PDF is well formed in this regard.
        myScanData.numImagesMaskedWithMaskThisPage += 1
        return

    # If this image is masked with an image or with colors it has
    # a Mask entry.
    hasMask, maskObject = CGPDFDictionaryGetObject(imageDict, b"Mask", None)
    if hasMask:
        # If the object is an XObject then the mask is an image.
        # If it is an array, the mask is an array of colors.
        maskType = CGPDFObjectGetType(maskObject)
        # Check if it is a stream type which it must be to be an XObject.
        if maskType == kCGPDFObjectTypeStream:
            myScanData.numImagesMaskedWithMaskThisPage += 1
        elif maskType == kCGPDFObjectTypeArray:
            myScanData.numImagesMaskedWithColorsThisPage += 1
        else:
            print("Mask entry in Image object is not well formed!",
                  file=sys.stderr)
        return

    # This image is not a mask, is not masked with another image or
    # color so it must be an image with intrinsic color with no mask.
    myScanData.numImagesWithColorThisPage += 1


# The "Do" operator consumes one value off the stack, the name of
# the object to execute. The name is a resource in the resource
# dictionary of the page and the object corresponding to that name
# is an XObject. The most common types of XObjects are either
# Form objects or Image objects. This code only counts images.
#
# Note that forms, patterns, and potentially other resources contain
# images. This code only counts the top level images in a PDF document,
# not images embedded in other resources.
@objc.callbackFor(CGPDFOperatorTableSetCallback)
def myOperator_Do(s, info):
    """Scanner callback for the "Do" operator: count image XObjects.

    s    -- the scanner that encountered the operator.
    info -- the object given to CGPDFScannerCreate; it is forwarded to
            checkImageType as the set of counters to update.
    """
    # Check to see if this is an image or not.
    cs = CGPDFScannerGetContentStream(s)

    # The Do operator takes a name. Pop the name off the
    # stack. If this fails then the argument to the
    # Do operator is not a name and is therefore invalid!
    res, name = CGPDFScannerPopName(s, None)
    if not res:
        print("Couldn't pop name off stack!", file=sys.stderr)
        return

    # Get the resource with type "XObject" and the name
    # obtained from the stack.
    xobject = CGPDFContentStreamGetResource(cs, b"XObject", name)
    if xobject is None:
        print("Couldn't get XObject with name %s" % (name,), file=sys.stderr)
        return

    # An XObject must be a stream so obtain the value from the xobject
    # as if it were a stream. If this fails, the PDF is malformed.
    res, stream = CGPDFObjectGetValue(xobject, kCGPDFObjectTypeStream, None)
    if not res:
        print("XObject '%s' is not a stream" % (name,), file=sys.stderr)
        return

    # NOTE(review): diagnostic trace written to stdout for every XObject;
    # kept to preserve the script's output, consider removing.
    print(stream)

    # Streams consist of a dictionary and the data associated
    # with the stream. This code only cares about the dictionary.
    xobjectDict = CGPDFStreamGetDictionary(stream)
    if xobjectDict is None:
        print("Couldn't obtain dictionary from stream %s!" % (name,),
              file=sys.stderr)
        return

    # An XObject dict has a Subtype that indicates what kind it is.
    res, subtype = CGPDFDictionaryGetName(xobjectDict, b"Subtype", None)
    if not res:
        print("Couldn't get SubType of dictionary object!", file=sys.stderr)
        return

    # This code is interested in the "Image" Subtype of an XObject.
    # Anything else is a form or another type of XObject and is skipped.
    if subtype != b"Image":
        return

    # This is an Image so figure out what variety of image it is.
    checkImageType(xobjectDict, info)


# This callback handles inline images. Inline images end with the
# "EI" operator.
@objc.callbackFor(CGPDFOperatorTableSetCallback)
def myOperator_EI(s, info):
    """Scanner callback for the "EI" operator: count an inline image.

    s    -- the scanner that encountered the operator.
    info -- the object given to CGPDFScannerCreate; it is forwarded to
            checkImageType as the set of counters to update.
    """
    # NOTE(review): diagnostic trace written to stdout for every inline
    # image; kept to preserve the script's output, consider removing.
    print("EI")

    # When the scanner encounters the EI operator, it has a
    # stream corresponding to the image on the operand stack.
    # This code pops the stream off the stack in order to
    # examine it.
    res, stream = CGPDFScannerPopStream(s, None)
    if not res:
        print("Couldn't create stream from inline image", file=sys.stderr)
        return

    # Get the image dictionary from the stream.
    imageDict = CGPDFStreamGetDictionary(stream)
    if imageDict is None:
        print("Couldn't get dict from inline image stream!", file=sys.stderr)
        return

    # By definition the stream passed to EI is an image so
    # pass it to the code to check the type of image.
    checkImageType(imageDict, info)


def createMyOperatorTable():
    """Return an operator table with the "Do" and "EI" callbacks installed."""
    myTable = CGPDFOperatorTableCreate()
    CGPDFOperatorTableSetCallback(myTable, b"Do", myOperator_Do)
    CGPDFOperatorTableSetCallback(myTable, b"EI", myOperator_EI)
    return myTable


def dumpPageStreams(url, outFile):
    """Scan every page of the PDF at *url* and report its images.

    url     -- URL object locating the PDF document.
    outFile -- writable file-like object receiving the page and
               document reports.

    Failures are reported on stderr. Failing to open the document, to
    create the operator table, or to create a page's content stream or
    scanner abandons the document without printing the summary. A page
    that cannot be scanned completely is reported, and whatever was
    counted for it is still included.
    """
    # Create a CGPDFDocumentRef from the input PDF file.
    pdfDoc = CGPDFDocumentCreateWithURL(url)
    if pdfDoc is None:
        print("Couldn't open PDF document!", file=sys.stderr)
        return

    # Create the operator table with the needed callbacks.
    table = createMyOperatorTable()
    if table is None:
        print("Couldn't create operator table!", file=sys.stderr)
        return

    # Initialize the count of the images.
    totalImages = 0

    # Obtain the total number of pages for the document.
    totPages = CGPDFDocumentGetNumberOfPages(pdfDoc)

    # Loop over all the pages in the document (page numbers start at 1),
    # scanning the content stream of each one.
    for pageNum in range(1, totPages + 1):
        # Get the PDF page for this page in the document.
        page = CGPDFDocumentGetPage(pdfDoc, pageNum)

        # Create a reference to the content stream for this page.
        cs = CGPDFContentStreamCreateWithPage(page)
        if cs is None:
            print("Couldn't create content stream for page #%d" % (pageNum,),
                  file=sys.stderr)
            return

        # Initialize the counters of images for this page.
        myData = MyDataScan()

        # Create a scanner for this PDF document page. The counters are
        # passed as the scanner's info object: the callbacks receive it
        # as their "info" argument and hand it to checkImageType, which
        # is the only way the counts read below can get updated.
        scanner = CGPDFScannerCreate(cs, table, myData)
        if scanner is None:
            print("Couldn't create scanner for page #%d!" % (pageNum,),
                  file=sys.stderr)
            # Release the content stream here as well, mirroring the
            # explicit release done at the end of a successful pass.
            CGPDFContentStreamRelease(cs)
            return

        # CGPDFScannerScan causes Quartz to scan the content stream,
        # calling the callbacks in the table when the corresponding
        # operator is encountered. Once the content stream for the
        # page has been consumed or Quartz detects a malformed
        # content stream, CGPDFScannerScan returns.
        if not CGPDFScannerScan(scanner):
            print("Scanner couldn't scan all of page #%d!" % (pageNum,),
                  file=sys.stderr)

        # Print the results for this page.
        printPageResults(outFile, myData, pageNum)

        # Update the total count of images with the count of the
        # images on this page.
        totalImages += (
            myData.numImagesWithColorThisPage +
            myData.numImageMasksThisPage +
            myData.numImagesMaskedWithMaskThisPage +
            myData.numImagesMaskedWithColorsThisPage)

        # Once the page has been scanned, release the
        # scanner for this page.
        CGPDFScannerRelease(scanner)
        # Release the content stream for this page.
        CGPDFContentStreamRelease(cs)
        # Done with this page; loop to next page.

    printDocResults(outFile, totPages, totalImages)


def _fileSystemBytes(path):
    """Return *path* as a byte string in the file system encoding.

    Byte strings are returned unchanged (always the case for Python 2
    command line arguments); text strings are encoded so that the length
    handed to CFURLCreateFromFileSystemRepresentation is a byte count.
    """
    if isinstance(path, bytes):
        return path
    return path.encode(sys.getfilesystemencoding() or "utf-8")


def main(args=None):
    """Process every input file named in *args* (default: sys.argv).

    Returns 0 after all files were handed to dumpPageStreams, or 1 when
    no input file was given or a URL could not be created for one.
    """
    if args is None:
        args = sys.argv

    if len(args) < 2:
        print("Usage: %s inputfile ... " % (args[0],), file=sys.stderr)
        return 1

    for inputFileName in args[1:]:
        print("Beginning Document %r" % (inputFileName,))

        # NOTE(review): diagnostic dump of the bridge metadata, printed
        # once per input file; kept to preserve the script's output,
        # consider removing.
        print(CFURLCreateFromFileSystemRepresentation.__metadata__())

        pathBytes = _fileSystemBytes(inputFileName)
        inURL = CFURLCreateFromFileSystemRepresentation(
            None, pathBytes, len(pathBytes), False)
        if inURL is None:
            print("Couldn't create URL for input file!", file=sys.stderr)
            return 1

        dumpPageStreams(inURL, sys.stdout)
        # The URL is not released explicitly; the original code left its
        # CFRelease(inURL) call disabled.

    return 0


if __name__ == "__main__":
    sys.exit(main())