1# xsxp.tcl --
2#
3###Abstract
4# Extremely Simple XML Parser
5#
6# This is pretty lame, but I needed something like this for S3,
7# and at the time, TclDOM would not work with the new 8.5 Tcl
8# due to version number problems.
9#
10# In addition, this is a pure-value implementation. There is no
11# garbage to clean up in the event of a thrown error, for example.
12# This simplifies the code for sufficiently small XML documents,
13# which is what Amazon's S3 guarantees.
14#
15###Copyright
16# Copyright (c) 2006 Darren New.
17# All Rights Reserved.
18# NO WARRANTIES OF ANY TYPE ARE PROVIDED.
19# COPYING OR USE INDEMNIFIES THE AUTHOR IN ALL WAYS.
20# See the license terms in LICENSE.txt
21#
22###Revision String
23# SCCS: %Z% %M% %I% %E% %U%
24
25# xsxp::parse $xml
26# Returns a parsed XML, or PXML. A pxml is a list.
27# The first element is the name of the tag.
28# The second element is a list of name/value pairs of the
29# associated attribues, if any.
30# The third thru final values are recursively PXML values.
31# If the first element (element zero, that is) is "%PCDATA",
32# then the attributes will be emtpy and the third element
33# will be the text of the element.
34
35# xsxp::fetch $pxml $path ?$part?
36# $pxml is a parsed XML, as returned from xsxp::parse.
37# $path is a list of elements. Each element is the name of
38# a child to look up, optionally followed by a hash ("#")
39# and a string of digits. An emtpy list or an initial empty
40# element selects $pxml. If no hash sign is present, the
41# behavior is as if "#0" had been appended to that element.
42# An element of $path scans the children at the indicated
43# level for the n'th instance of a child whose tag matches
44# the part of the element before the hash sign. If an element
45# is simply "#" followed by digits, that indexed child is
46# selected, regardless of the tags in the children. So
47# an element of #3 will always select the fourth child
48# of the node under consideration.
49# $part defaults to %ALL. It can be one of the following:
50# %ALL - returns the entire selected element.
51# %TAGNAME - returns lindex 0 of the selected element.
52# %ATTRIBUTES - returns lindex 1 of the selected element.
53# %CHILDREN - returns lrange 2 through end of the selected element,
54#   resulting in a list of elements being returned.
55# %PCDATA - returns a concatenation of all the bodies of
56#   direct children of this node whose tag is %PCDATA.
57#   Throws an error if no such children are found. That
58#   is, part=%PCDATA means return the textual content found
59#   in that node but not its children nodes.
60# %PCDATA? - like %PCDATA, but returns an empty string if
61#   no PCDATA is found.
62
63# xsxp::fetchall $pxml_list $path ?$part?
64# Iterates over each PXML in $pxml_list, selecting the indicated
65# path from it, building a new list with the selected data, and
66# returning that new list. For example, $pxml_list might be
67# the %CHILDREN of a particular element, and the $path and $part
68# might select from each child a sub-element in which we're interested.
69
70# xsxp::only $pxml $tagname
71# Iterates over the direct children of $pxml and selects  only
72# those with $tagname as their tag. Returns a list of matching
73# elements.
74
75# xsxp::prettyprint $pxml
76# Outputs to stdout a nested-list notation of the parsed XML.
77
78package require xml
79package provide xsxp 1.0
80
81namespace eval xsxp {
82
83    variable Stack
84    variable Cur
85
86    proc Characterdatacommand {characterdata} {
87	variable Cur
88	# puts "characterdatacommand $characterdata"
89	set x [list %PCDATA {} $characterdata]
90	lappend Cur $x
91    }
92
93    proc Elementstartcommand {name attlist args} {
94	# puts "elementstart $name {$attlist} $args"
95	variable Stack
96	variable Cur
97	lappend Stack $Cur
98	set Cur [list $name $attlist]
99    }
100
101    proc Elementendcommand {args} {
102	# puts "elementend $args"
103	variable Stack
104	variable Cur
105	set x [lindex $Stack end]
106	lappend x $Cur
107	set Cur $x
108	set Stack [lrange $Stack 0 end-1]
109    }
110
111    proc parse {xml} {
112	variable Cur
113	variable Stack
114	set Cur {}
115	set Stack {}
116	set parser [::xml::parser \
117	    -characterdatacommand [namespace code Characterdatacommand] \
118	    -elementstartcommand [namespace code Elementstartcommand] \
119	    -elementendcommand [namespace code Elementendcommand] \
120	    -ignorewhitespace 1 -final 1
121        ]
122	$parser parse $xml
123	$parser free
124	# The following line is needed because the close of the last element
125	# appends the outermost element to the item on the top of the stack.
126	# Since there's nothing on the top of the stack at the close of the
127	# last element, we append the current element to an empty list.
128	# In essence, since we don't really have a terminating condition
129	# on the recursion, an empty stack is still treated like an element.
130	set Cur [lindex $Cur 0]
131        set Cur [Normalize $Cur]
132        return $Cur
133    }
134
135    proc Normalize {pxml} {
136	# This iterates over pxml recursively, finding entries that
137	# start with multiple %PCDATA elements, and coalesces their
138	# content, so if an element contains only %PCDATA, it is
139	# guaranteed to have only one child.
140	# Not really necessary, given definition of part=%PCDATA
141	# However, it makes pretty-prints nicer (for AWS at least)
142	# and ends up with smaller lists. I have no idea why they
143	# would put quotes around an MD5 hash in hex, tho.
144	set dupl 1
145	while {$dupl} {
146	    set first [lindex $pxml 2]
147	    set second [lindex $pxml 3]
148	    if {[lindex $first 0] eq "%PCDATA" && [lindex $second 0] eq "%PCDATA"} {
149		set repl [list %PCDATA {} [lindex $first 2][lindex $second 2]]
150		set pxml [lreplace $pxml 2 3 $repl]
151	    } else {
152		set dupl 0
153		for {set i 2} {$i < [llength $pxml]} {incr i} {
154		    set pxml [lreplace $pxml $i $i [Normalize [lindex $pxml $i]]]
155		}
156	    }
157	}
158	return $pxml
159    }
160
161    proc prettyprint {pxml {chan stdout} {indent 0}} {
162	puts -nonewline $chan [string repeat "  " $indent]
163	if {[lindex $pxml 0] eq "%PCDATA"} {
164	    puts $chan "%PCDATA: [lindex $pxml 2]"
165	    return
166	}
167	puts -nonewline $chan "[lindex $pxml 0]"
168	foreach {name val} [lindex $pxml 1] {
169	    puts -nonewline $chan " $name='$val'"
170	}
171	puts $chan ""
172	foreach node [lrange $pxml 2 end] {
173	    prettyprint $node $chan [expr $indent+1]
174	}
175    }
176
177    proc fetch {pxml path {part %ALL}} {
178	set path [string trim $path /]
179	if {-1 != [string first / $path]} {
180	    set path [split $path /]
181	}
182	foreach element $path {
183	    if {$pxml eq ""} {return ""}
184	    foreach {tag count} [split $element #] {
185		if {$tag ne ""} {
186		    if {$count eq ""} {set count 0}
187		    set pxml [lrange $pxml 2 end]
188		    while {0 <= $count && 0 != [llength $pxml]} {
189			if {$tag eq [lindex $pxml 0 0]} {
190			    incr count -1
191			    if {$count < 0} {
192				# We're done. Go on to next element.
193				set pxml [lindex $pxml 0]
194			    } else {
195				# Not done yet. Throw this away.
196				set pxml [lrange $pxml 1 end]
197			    }
198			} else {
199			    # Not what we want.
200			    set pxml [lrange $pxml 1 end]
201			}
202		    }
203		} else { # tag eq ""
204		    if {$count eq ""} {
205			# Just select whole $pxml
206		    } else {
207			set pxml [lindex $pxml [expr {2+$count}]]
208		    }
209		}
210		break
211	    } ; # done the foreach [split] loop
212	} ; # done all the elements.
213	if {$part eq "%ALL"} {return $pxml}
214	if {$part eq "%ATTRIBUTES"} {return [lindex $pxml 1]}
215	if {$part eq "%TAGNAME"} {return [lindex $pxml 0]}
216	if {$part eq "%CHILDREN"} {return [lrange $pxml 2 end]}
217	if {$part eq "%PCDATA" || $part eq "%PCDATA?"} {
218	    set res "" ; set found 0
219	    foreach elem [lrange $pxml 2 end] {
220		if {"%PCDATA" eq [lindex $elem 0]} {
221		    append res [lindex $elem 2]
222		    set found 1
223		}
224	    }
225	    if {$found || $part eq "%PCDATA?"} {
226		return $res
227	    } else {
228		error "xsxp::fetch did not find requested PCDATA"
229	    }
230	}
231	return $pxml ; # Don't know what he's after
232    }
233
234    proc only {pxml tag} {
235	set res {}
236	foreach element [lrange $pxml 2 end] {
237	    if {[lindex $element 0] eq $tag} {
238		lappend res $element
239	    }
240	}
241	return $res
242    }
243
244    proc fetchall {pxml_list path {part %ALL}} {
245	set res [list]
246	foreach pxml $pxml_list {
247	    lappend res [fetch $pxml $path $part]
248	}
249	return $res
250    }
251}
252
253namespace export xsxp parse prettyprint fetch
254
255