• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /macosx-10.10/pyobjc-45/2.6/pyobjc/pyobjc-framework-Quartz/Examples/Programming with Quartz/ParsePageContents/
1"""
Parse PDF files and print some information about the images they contain.

Usage:
    python parse_page_contents.py inputfile ...
6"""
7from Quartz import *
8import objc
9import sys
10import array
11
12import Quartz
13
class MyDataScan(object):
    """Per-page tally of the image varieties met while scanning a page."""

    def __init__(self):
        # All four counters start at zero; the scanning code increments
        # whichever one matches each image it classifies.
        self.numImagesWithColorThisPage = self.numImageMasksThisPage = 0
        self.numImagesMaskedWithMaskThisPage = 0
        self.numImagesMaskedWithColorsThisPage = 0
20
def printPageResults(outFile, myData, pageNum):
    """Write one report line per image category seen on a page.

    outFile -- file-like object that receives the report lines.
    myData  -- object carrying the four ``num...ThisPage`` counters.
    pageNum -- page number quoted in every line.

    A category whose counter is zero produces no output.
    """
    # (counter attribute, message template) in the order they are reported.
    categories = (
        ("numImagesWithColorThisPage",
         "Found %d images with intrinsic color on page %d."),
        ("numImageMasksThisPage",
         "Found %d image masks on page %d."),
        ("numImagesMaskedWithMaskThisPage",
         "Found %d images masked with masks on page %d."),
        ("numImagesMaskedWithColorsThisPage",
         "Found %d images masked with colors on page %d."),
    )
    for attribute, template in categories:
        count = getattr(myData, attribute)
        if count:
            outFile.write(template % (count, pageNum) + "\n")
40
def printDocResults(outFile, totPages, totImages):
    """Write the document summary, framed by one blank line on each side.

    outFile   -- file-like object that receives the summary.
    totPages  -- number of pages in the document.
    totImages -- number of images counted over all pages.
    """
    outFile.write("\n")
    summary = "Summary: %d page document contains %d images." % (
        totPages, totImages)
    outFile.write(summary + "\n")
    outFile.write("\n")
46
47
def checkImageType(imageDict, myScanData):
    """Classify one image dictionary and bump the matching page counter.

    imageDict  -- PDF dictionary describing the image.
    myScanData -- object carrying the four ``num...ThisPage`` counters;
                  exactly one of them is incremented, unless the "Mask"
                  entry is neither a stream nor an array, in which case
                  an error is written to sys.stderr and nothing is
                  counted.

    Returns None.
    """
    # An image mask is flagged by "ImageMask"; fall back to the "IM" key
    # when that one is absent.
    found, isMask = CGPDFDictionaryGetBoolean(imageDict, "ImageMask", None)
    if not found:
        found, isMask = CGPDFDictionaryGetBoolean(imageDict, "IM", None)
    if found and isMask:
        myScanData.numImageMasksThisPage += 1
        return

    # An SMask entry means the image is masked with an alpha image. The
    # entry must be an image XObject; a well-formed PDF is assumed here.
    hasSMask, _ = CGPDFDictionaryGetObject(imageDict, "SMask", None)
    if hasSMask:
        myScanData.numImagesMaskedWithMaskThisPage += 1
        return

    # Without a Mask entry the image is neither a mask nor masked by an
    # image or by colors, so it is an image with intrinsic color.
    hasMask, maskObject = CGPDFDictionaryGetObject(imageDict, "Mask", None)
    if not hasMask:
        myScanData.numImagesWithColorThisPage += 1
        return

    # A stream (an XObject must be one) is an image mask; an array is a
    # list of mask colors.
    maskKind = CGPDFObjectGetType(maskObject)
    if maskKind == kCGPDFObjectTypeStream:
        myScanData.numImagesMaskedWithMaskThisPage += 1
    elif maskKind == kCGPDFObjectTypeArray:
        myScanData.numImagesMaskedWithColorsThisPage += 1
    else:
        sys.stderr.write("Mask entry in Image object is not well formed!\n")
85
# The "Do" operator consumes one value off the stack: the name of
# the object to execute. The name is a resource in the resource
# dictionary of the page, and the object corresponding to that name
# is an XObject. The most common types of XObjects are Form objects
# and Image objects. This code only counts images.
#
# Note that forms, patterns, and potentially other resources can contain
# images. This code only counts the top-level images in a PDF document,
# not images embedded in other resources.
@objc.callbackFor(CGPDFOperatorTableSetCallback)
def myOperator_Do(s, info):
    """Scanner callback for the "Do" operator: classify image XObjects.

    s    -- the PDF scanner that encountered the operator.
    info -- the page counter object; it is handed to checkImageType
            unchanged.

    Any failure along the way is reported on sys.stderr and the operator
    is skipped. XObjects whose Subtype is not "Image" are ignored
    silently. Returns None.
    """
    contentStream = CGPDFScannerGetContentStream(s)

    # The Do operator takes a name. If popping a name fails, the
    # argument is not a name and the operator is invalid.
    ok, name = CGPDFScannerPopName(s, None)
    if not ok:
        sys.stderr.write("Couldn't pop name off stack!\n")
        return

    # Look the name up among the "XObject" resources.
    xobject = CGPDFContentStreamGetResource(contentStream, "XObject", name)
    if xobject is None:
        sys.stderr.write("Couldn't get XObject with name %s\n" % (name,))
        return

    # An XObject must be a stream; if it cannot be read as one the PDF
    # is malformed.
    ok, stream = CGPDFObjectGetValue(xobject, kCGPDFObjectTypeStream, None)
    if not ok:
        sys.stderr.write("XObject '%s' is not a stream\n" % (name,))
        return

    # A stream is a dictionary plus data; only the dictionary matters here.
    streamDict = CGPDFStreamGetDictionary(stream)
    if streamDict is None:
        sys.stderr.write(
            "Couldn't obtain dictionary from stream %s!\n" % (name,))
        return

    # The Subtype entry tells what kind of XObject this is.
    ok, subtype = CGPDFDictionaryGetName(streamDict, "Subtype", None)
    if not ok:
        sys.stderr.write("Couldn't get SubType of dictionary object!\n")
        return

    # Forms and other non-image XObjects are not counted.
    if subtype != "Image":
        return

    # This is an image, so work out which variety it is.
    checkImageType(streamDict, info)
147
# This callback handles inline images. Inline images end with the
# "EI" operator.
@objc.callbackFor(CGPDFOperatorTableSetCallback)
def myOperator_EI(s, info):
    """Scanner callback for the "EI" operator: classify an inline image.

    s    -- the PDF scanner that encountered the operator.
    info -- the page counter object; it is handed to checkImageType
            unchanged.

    Failures are reported on sys.stderr and the image is skipped.
    Returns None.
    """
    # When the scanner reaches EI, the stream for the inline image is on
    # the operand stack; pop it so it can be examined.
    ok, stream = CGPDFScannerPopStream(s, None)
    if not ok:
        sys.stderr.write("Couldn't create stream from inline image\n")
        return

    # Get the image dictionary from the stream.
    imageDict = CGPDFStreamGetDictionary(stream)
    if imageDict is None:
        sys.stderr.write("Couldn't get dict from inline image stream!\n")
        return

    # The stream handed to EI is an image by definition, so go straight
    # to classifying it.
    checkImageType(imageDict, info)
171
def createMyOperatorTable():
    """Return a new operator table with the "Do" and "EI" callbacks set."""
    callbacks = (
        ("Do", myOperator_Do),
        ("EI", myOperator_EI),
    )
    operatorTable = CGPDFOperatorTableCreate()
    for operatorName, callback in callbacks:
        CGPDFOperatorTableSetCallback(operatorTable, operatorName, callback)
    return operatorTable
177
def dumpPageStreams(url, outFile):
    """Scan every page of a PDF document and report the images it draws.

    url     -- URL of the PDF document to open.
    outFile -- file-like object that receives the per-page results and
               the final summary.

    Failures are reported on sys.stderr. If the document cannot be
    opened, or the operator table, a page's content stream or a page's
    scanner cannot be created, the function returns at once without
    printing the summary. A page that cannot be scanned completely is
    reported, and whatever was counted on it still goes into the totals.
    Returns None.
    """
    # Create a CGPDFDocumentRef from the input PDF file.
    pdfDoc = CGPDFDocumentCreateWithURL(url)
    if pdfDoc is None:
        sys.stderr.write("Couldn't open PDF document!\n")
        return

    # Create the operator table with the needed callbacks.
    table = createMyOperatorTable()
    if table is None:
        sys.stderr.write("Couldn't create operator table!\n")
        return

    totalImages = 0
    totPages = CGPDFDocumentGetNumberOfPages(pdfDoc)

    # The loop counts pages from 1 through totPages.
    for pageNum in range(1, totPages + 1):
        page = CGPDFDocumentGetPage(pdfDoc, pageNum)

        # Create a reference to the content stream for this page.
        cs = CGPDFContentStreamCreateWithPage(page)
        if cs is None:
            sys.stderr.write(
                "Couldn't create content stream for page #%d\n" % (pageNum,))
            return

        # Fresh counters for this page.
        myData = MyDataScan()

        # The counters are passed as the scanner's info value: the
        # operator callbacks forward their info argument to
        # checkImageType, which increments the counters on it.
        scanner = CGPDFScannerCreate(cs, table, myData)
        if scanner is None:
            sys.stderr.write(
                "Couldn't create scanner for page #%d!\n" % (pageNum,))
            # Do not leave the content stream behind on this error path.
            CGPDFContentStreamRelease(cs)
            return

        # CGPDFScannerScan causes Quartz to scan the content stream,
        # calling the callbacks in the table when the corresponding
        # operator is encountered. It returns once the content stream
        # has been consumed or Quartz detects that it is malformed.
        if not CGPDFScannerScan(scanner):
            sys.stderr.write(
                "Scanner couldn't scan all of page #%d!\n" % (pageNum,))

        printPageResults(outFile, myData, pageNum)

        # Fold this page's counts into the document total.
        totalImages += (
            myData.numImagesWithColorThisPage +
            myData.numImageMasksThisPage +
            myData.numImagesMaskedWithMaskThisPage +
            myData.numImagesMaskedWithColorsThisPage)

        # Done with this page: release its scanner and content stream.
        CGPDFScannerRelease(scanner)
        CGPDFContentStreamRelease(cs)

    printDocResults(outFile, totPages, totalImages)
247
def main(args=None):
    """Report on every PDF file named on the command line.

    args -- argument list whose first element is the program name;
            defaults to sys.argv.

    Returns 1 when no input file is given or a URL cannot be created for
    one of them (the remaining files are then not processed), otherwise 0.
    """
    if args is None:
        args = sys.argv

    if len(args) < 2:
        sys.stderr.write("Usage: %s inputfile ... \n" % (args[0],))
        return 1

    for inputFileName in args[1:]:
        sys.stdout.write("Beginning Document %r\n" % (inputFileName,))

        inURL = CFURLCreateFromFileSystemRepresentation(
            None, inputFileName, len(inputFileName), False)
        if inURL is None:
            sys.stderr.write("Couldn't create URL for input file!\n")
            return 1

        dumpPageStreams(inURL, sys.stdout)

    return 0
270
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
273