1# xsxp.tcl -- 2# 3###Abstract 4# Extremely Simple XML Parser 5# 6# This is pretty lame, but I needed something like this for S3, 7# and at the time, TclDOM would not work with the new 8.5 Tcl 8# due to version number problems. 9# 10# In addition, this is a pure-value implementation. There is no 11# garbage to clean up in the event of a thrown error, for example. 12# This simplifies the code for sufficiently small XML documents, 13# which is what Amazon's S3 guarantees. 14# 15###Copyright 16# Copyright (c) 2006 Darren New. 17# All Rights Reserved. 18# NO WARRANTIES OF ANY TYPE ARE PROVIDED. 19# COPYING OR USE INDEMNIFIES THE AUTHOR IN ALL WAYS. 20# See the license terms in LICENSE.txt 21# 22###Revision String 23# SCCS: %Z% %M% %I% %E% %U% 24 25# xsxp::parse $xml 26# Returns a parsed XML, or PXML. A pxml is a list. 27# The first element is the name of the tag. 28# The second element is a list of name/value pairs of the 29# associated attribues, if any. 30# The third thru final values are recursively PXML values. 31# If the first element (element zero, that is) is "%PCDATA", 32# then the attributes will be emtpy and the third element 33# will be the text of the element. 34 35# xsxp::fetch $pxml $path ?$part? 36# $pxml is a parsed XML, as returned from xsxp::parse. 37# $path is a list of elements. Each element is the name of 38# a child to look up, optionally followed by a hash ("#") 39# and a string of digits. An emtpy list or an initial empty 40# element selects $pxml. If no hash sign is present, the 41# behavior is as if "#0" had been appended to that element. 42# An element of $path scans the children at the indicated 43# level for the n'th instance of a child whose tag matches 44# the part of the element before the hash sign. If an element 45# is simply "#" followed by digits, that indexed child is 46# selected, regardless of the tags in the children. So 47# an element of #3 will always select the fourth child 48# of the node under consideration. 49# $part defaults to %ALL. It can be one of the following: 50# %ALL - returns the entire selected element. 51# %TAGNAME - returns lindex 0 of the selected element. 52# %ATTRIBUTES - returns lindex 1 of the selected element. 53# %CHILDREN - returns lrange 2 through end of the selected element, 54# resulting in a list of elements being returned. 55# %PCDATA - returns a concatenation of all the bodies of 56# direct children of this node whose tag is %PCDATA. 57# Throws an error if no such children are found. That 58# is, part=%PCDATA means return the textual content found 59# in that node but not its children nodes. 60# %PCDATA? - like %PCDATA, but returns an empty string if 61# no PCDATA is found. 62 63# xsxp::fetchall $pxml_list $path ?$part? 64# Iterates over each PXML in $pxml_list, selecting the indicated 65# path from it, building a new list with the selected data, and 66# returning that new list. For example, $pxml_list might be 67# the %CHILDREN of a particular element, and the $path and $part 68# might select from each child a sub-element in which we're interested. 69 70# xsxp::only $pxml $tagname 71# Iterates over the direct children of $pxml and selects only 72# those with $tagname as their tag. Returns a list of matching 73# elements. 74 75# xsxp::prettyprint $pxml 76# Outputs to stdout a nested-list notation of the parsed XML. 77 78package require xml 79package provide xsxp 1.0 80 81namespace eval xsxp { 82 83 variable Stack 84 variable Cur 85 86 proc Characterdatacommand {characterdata} { 87 variable Cur 88 # puts "characterdatacommand $characterdata" 89 set x [list %PCDATA {} $characterdata] 90 lappend Cur $x 91 } 92 93 proc Elementstartcommand {name attlist args} { 94 # puts "elementstart $name {$attlist} $args" 95 variable Stack 96 variable Cur 97 lappend Stack $Cur 98 set Cur [list $name $attlist] 99 } 100 101 proc Elementendcommand {args} { 102 # puts "elementend $args" 103 variable Stack 104 variable Cur 105 set x [lindex $Stack end] 106 lappend x $Cur 107 set Cur $x 108 set Stack [lrange $Stack 0 end-1] 109 } 110 111 proc parse {xml} { 112 variable Cur 113 variable Stack 114 set Cur {} 115 set Stack {} 116 set parser [::xml::parser \ 117 -characterdatacommand [namespace code Characterdatacommand] \ 118 -elementstartcommand [namespace code Elementstartcommand] \ 119 -elementendcommand [namespace code Elementendcommand] \ 120 -ignorewhitespace 1 -final 1 121 ] 122 $parser parse $xml 123 $parser free 124 # The following line is needed because the close of the last element 125 # appends the outermost element to the item on the top of the stack. 126 # Since there's nothing on the top of the stack at the close of the 127 # last element, we append the current element to an empty list. 128 # In essence, since we don't really have a terminating condition 129 # on the recursion, an empty stack is still treated like an element. 130 set Cur [lindex $Cur 0] 131 set Cur [Normalize $Cur] 132 return $Cur 133 } 134 135 proc Normalize {pxml} { 136 # This iterates over pxml recursively, finding entries that 137 # start with multiple %PCDATA elements, and coalesces their 138 # content, so if an element contains only %PCDATA, it is 139 # guaranteed to have only one child. 140 # Not really necessary, given definition of part=%PCDATA 141 # However, it makes pretty-prints nicer (for AWS at least) 142 # and ends up with smaller lists. I have no idea why they 143 # would put quotes around an MD5 hash in hex, tho. 144 set dupl 1 145 while {$dupl} { 146 set first [lindex $pxml 2] 147 set second [lindex $pxml 3] 148 if {[lindex $first 0] eq "%PCDATA" && [lindex $second 0] eq "%PCDATA"} { 149 set repl [list %PCDATA {} [lindex $first 2][lindex $second 2]] 150 set pxml [lreplace $pxml 2 3 $repl] 151 } else { 152 set dupl 0 153 for {set i 2} {$i < [llength $pxml]} {incr i} { 154 set pxml [lreplace $pxml $i $i [Normalize [lindex $pxml $i]]] 155 } 156 } 157 } 158 return $pxml 159 } 160 161 proc prettyprint {pxml {chan stdout} {indent 0}} { 162 puts -nonewline $chan [string repeat " " $indent] 163 if {[lindex $pxml 0] eq "%PCDATA"} { 164 puts $chan "%PCDATA: [lindex $pxml 2]" 165 return 166 } 167 puts -nonewline $chan "[lindex $pxml 0]" 168 foreach {name val} [lindex $pxml 1] { 169 puts -nonewline $chan " $name='$val'" 170 } 171 puts $chan "" 172 foreach node [lrange $pxml 2 end] { 173 prettyprint $node $chan [expr $indent+1] 174 } 175 } 176 177 proc fetch {pxml path {part %ALL}} { 178 set path [string trim $path /] 179 if {-1 != [string first / $path]} { 180 set path [split $path /] 181 } 182 foreach element $path { 183 if {$pxml eq ""} {return ""} 184 foreach {tag count} [split $element #] { 185 if {$tag ne ""} { 186 if {$count eq ""} {set count 0} 187 set pxml [lrange $pxml 2 end] 188 while {0 <= $count && 0 != [llength $pxml]} { 189 if {$tag eq [lindex $pxml 0 0]} { 190 incr count -1 191 if {$count < 0} { 192 # We're done. Go on to next element. 193 set pxml [lindex $pxml 0] 194 } else { 195 # Not done yet. Throw this away. 196 set pxml [lrange $pxml 1 end] 197 } 198 } else { 199 # Not what we want. 200 set pxml [lrange $pxml 1 end] 201 } 202 } 203 } else { # tag eq "" 204 if {$count eq ""} { 205 # Just select whole $pxml 206 } else { 207 set pxml [lindex $pxml [expr {2+$count}]] 208 } 209 } 210 break 211 } ; # done the foreach [split] loop 212 } ; # done all the elements. 213 if {$part eq "%ALL"} {return $pxml} 214 if {$part eq "%ATTRIBUTES"} {return [lindex $pxml 1]} 215 if {$part eq "%TAGNAME"} {return [lindex $pxml 0]} 216 if {$part eq "%CHILDREN"} {return [lrange $pxml 2 end]} 217 if {$part eq "%PCDATA" || $part eq "%PCDATA?"} { 218 set res "" ; set found 0 219 foreach elem [lrange $pxml 2 end] { 220 if {"%PCDATA" eq [lindex $elem 0]} { 221 append res [lindex $elem 2] 222 set found 1 223 } 224 } 225 if {$found || $part eq "%PCDATA?"} { 226 return $res 227 } else { 228 error "xsxp::fetch did not find requested PCDATA" 229 } 230 } 231 return $pxml ; # Don't know what he's after 232 } 233 234 proc only {pxml tag} { 235 set res {} 236 foreach element [lrange $pxml 2 end] { 237 if {[lindex $element 0] eq $tag} { 238 lappend res $element 239 } 240 } 241 return $res 242 } 243 244 proc fetchall {pxml_list path {part %ALL}} { 245 set res [list] 246 foreach pxml $pxml_list { 247 lappend res [fetch $pxml $path $part] 248 } 249 return $res 250 } 251} 252 253namespace export xsxp parse prettyprint fetch 254 255