1# xml.tcl --
2#
3#	This file provides XML services.
4#	These services include a XML document instance and DTD parser,
5#	as well as support for generating XML.
6#
7# Copyright (c) 1998,1999 Zveno Pty Ltd
8# http://www.zveno.com/
9#
10# Zveno makes this software and all associated data and documentation
11# ('Software') available free of charge for non-commercial purposes only. You
12# may make copies of the Software but you must include all of this notice on
13# any copy.
14#
15# The Software was developed for research purposes and Zveno does not warrant
16# that it is error free or fit for any purpose.  Zveno disclaims any
17# liability for all claims, expenses, losses, damages and costs any user may
18# incur as a result of using, copying or modifying the Software.
19#
20# Copyright (c) 1997 Australian National University (ANU).
21#
22# ANU makes this software and all associated data and documentation
23# ('Software') available free of charge for non-commercial purposes only. You
24# may make copies of the Software but you must include all of this notice on
25# any copy.
26#
27# The Software was developed for research purposes and ANU does not warrant
28# that it is error free or fit for any purpose.  ANU disclaims any
29# liability for all claims, expenses, losses, damages and costs any user may
30# incur as a result of using, copying or modifying the Software.
31#
32# $Id: xml.tcl,v 1.4 2006/09/27 08:12:40 neumann Exp $
33
34package provide xml 1.8
35
36package require sgml 1.6
37
38namespace eval xml {
39
40    # Procedures for parsing XML documents
41    namespace export parser
42    # Procedures for parsing XML DTDs
43    namespace export DTDparser
44
45    # Counter for creating unique parser objects
46    variable ParserCounter 0
47
48    # Convenience routine
49    proc cl x {
50	return "\[$x\]"
51    }
52
53    # Define various regular expressions
54    # white space
55    variable Wsp " \t\r\n"
56    variable noWsp [cl ^$Wsp]
57
58    # Various XML names and tokens
59
60    # BUG: NameChar does not include CombiningChar or Extender
61    variable NameChar [cl -a-zA-Z0-9._:]
62    variable Name [cl a-zA-Z_:]$NameChar*
63    variable Nmtoken $NameChar+
64
65    # Tokenising expressions
66
67    variable tokExpr <(/?)([cl ^$Wsp>]+)([cl $Wsp]*[cl ^>]*)>
68    variable substExpr "\}\n{\\2} {\\1} {} {\\3} \{"
69
70    # table of predefined entities
71
72    variable EntityPredef
73    array set EntityPredef {
74	lt <   gt >   amp &   quot \"   apos '
75    }
76
77}
78
79
80# xml::parser --
81#
82#	Creates XML parser object.
83#
84# Arguments:
85#	args	Unique name for parser object
86#		plus option/value pairs
87#
88# Recognised Options:
89#	-final			Indicates end of document data
90#	-elementstartcommand	Called when an element starts
91#	-elementendcommand	Called when an element ends
92#	-characterdatacommand	Called when character data occurs
93#	-processinginstructioncommand	Called when a PI occurs
94#	-externalentityrefcommand	Called for an external entity reference
95#
96#	(Not compatible with expat)
97#	-xmldeclcommand		Called when the XML declaration occurs
98#	-doctypecommand		Called when the document type declaration occurs
99#
100#	-errorcommand		Script to evaluate for a fatal error
101#	-warningcommand		Script to evaluate for a reportable warning
102#	-statevariable		global state variable
103#	-reportempty		whether to provide empty element indication
104#
105# Results:
106#	The state variable is initialised.
107
108proc xml::parser {args} {
109    variable ParserCounter
110
111    if {[llength $args] > 0} {
112	set name [lindex $args 0]
113	set args [lreplace $args 0 0]
114    } else {
115	set name parser[incr ParserCounter]
116    }
117
118    if {[info command [namespace current]::$name] != {}} {
119	return -code error "unable to create parser object \"[namespace current]::$name\" command"
120    }
121
122    # Initialise state variable and object command
123    upvar \#0 [namespace current]::$name parser
124    set sgml_ns [namespace parent]::sgml
125    array set parser [list name $name			\
126	-final 1					\
127	-elementstartcommand ${sgml_ns}::noop		\
128	-elementendcommand ${sgml_ns}::noop		\
129	-characterdatacommand ${sgml_ns}::noop		\
130	-processinginstructioncommand ${sgml_ns}::noop	\
131	-externalentityrefcommand ${sgml_ns}::noop	\
132	-xmldeclcommand ${sgml_ns}::noop		\
133	-doctypecommand ${sgml_ns}::noop		\
134	-warningcommand ${sgml_ns}::noop		\
135	-statevariable [namespace current]::$name	\
136	-reportempty 0					\
137	internaldtd {}					\
138    ]
139
140    proc [namespace current]::$name {method args} \
141	"eval ParseCommand $name \$method \$args"
142
143    eval ParseCommand [list $name] configure $args
144
145    return [namespace current]::$name
146}
147
148# xml::ParseCommand --
149#
150#	Handles parse object command invocations
151#
152# Valid Methods:
153#	cget
154#	configure
155#	parse
156#	reset
157#
158# Arguments:
159#	parser	parser object
160#	method	minor command
161#	args	other arguments
162#
163# Results:
164#	Depends on method
165
166proc xml::ParseCommand {parser method args} {
167    upvar \#0 [namespace current]::$parser state
168
169    switch -- $method {
170	cget {
171	    return $state([lindex $args 0])
172	}
173	configure {
174	    foreach {opt value} $args {
175		set state($opt) $value
176	    }
177	}
178	parse {
179	    ParseCommand_parse $parser [lindex $args 0]
180	}
181	reset {
182	    if {[llength $args]} {
183		return -code error "too many arguments"
184	    }
185	    ParseCommand_reset $parser
186	}
187	default {
188	    return -code error "unknown method \"$method\""
189	}
190    }
191
192    return {}
193}
194
195# xml::ParseCommand_parse --
196#
197#	Parses document instance data
198#
199# Arguments:
200#	object	parser object
201#	xml	data
202#
203# Results:
204#	Callbacks are invoked, if any are defined
205
206proc xml::ParseCommand_parse {object xml} {
207    upvar \#0 [namespace current]::$object parser
208    variable Wsp
209    variable tokExpr
210    variable substExpr
211
212    set parent [namespace parent]
213    if {"::" eq $parent } {
214	set parent {}
215    }
216
217    set tokenised [lrange \
218	    [${parent}::sgml::tokenise $xml \
219	    $tokExpr \
220	    $substExpr \
221	    -internaldtdvariable [namespace current]::${object}(internaldtd)] \
222	5 end]
223
224    eval ${parent}::sgml::parseEvent \
225	[list $tokenised \
226	    -emptyelement [namespace code ParseEmpty] \
227	    -parseattributelistcommand [namespace code ParseAttrs]] \
228	[array get parser -*command] \
229	[array get parser -entityvariable] \
230	[array get parser -reportempty] \
231	-normalize 0 \
232	-internaldtd [list $parser(internaldtd)]
233
234    return {}
235}
236
237# xml::ParseEmpty --
238#
239#	Used by parser to determine whether an element is empty.
240#	This should be dead easy in XML.  The only complication is
241#	that the RE above can't catch the trailing slash, so we have
242#	to dig it out of the tag name or attribute list.
243#
244#	Tcl 8.1 REs should fix this.
245#
246# Arguments:
247#	tag	element name
248#	attr	attribute list (raw)
249#	e	End tag delimiter.
250#
251# Results:
252#	"/" if the trailing slash is found.  Optionally, return a list
253#	containing new values for the tag name and/or attribute list.
254
255proc xml::ParseEmpty {tag attr e} {
256
257    if {[string match */ [string trimright $tag]] && \
258	    ![string length $attr]} {
259	regsub {/$} $tag {} tag
260	return [list / $tag $attr]
261    } elseif {[string match */ [string trimright $attr]]} {
262	regsub {/$} [string trimright $attr] {} attr
263	return [list / $tag $attr]
264    } else {
265	return {}
266    }
267
268}
269
270# xml::ParseAttrs --
271#
272#	Parse element attributes.
273#
274# There are two forms for name-value pairs:
275#
276#	name="value"
277#	name='value'
278#
279# Watch out for the trailing slash on empty elements.
280#
281# Arguments:
282#	attrs	attribute string given in a tag
283#
284# Results:
285#	Returns a Tcl list representing the name-value pairs in the
286#	attribute string
287
288proc xml::ParseAttrs attrs {
289    variable Wsp
290    variable Name
291
292    # First check whether there's any work to do
293    if {{} eq [string trim $attrs] } {
294	return {}
295    }
296
297    # Strip the trailing slash on empty elements
298    regsub [format {/[%s]*$} " \t\n\r"] $attrs {} atList
299
300    set mode name
301    set result {}
302    foreach component [split $atList =] {
303	switch $mode {
304	    name {
305		set component [string trim $component]
306		if {[regexp $Name $component]} {
307		    lappend result $component
308		} else {
309		    return -code error "invalid attribute name \"$component\""
310		}
311		set mode value:start
312	    }
313	    value:start {
314		set component [string trimleft $component]
315		set delimiter [string index $component 0]
316		set value {}
317		switch -- $delimiter {
318		    \" -
319		    ' {
320			if {[regexp [format {%s([^%s]*)%s(.*)} $delimiter $delimiter $delimiter] $component discard value remainder]} {
321			    lappend result $value
322			    set remainder [string trim $remainder]
323			    if {[string length $remainder]} {
324				if {[regexp $Name $remainder]} {
325				    lappend result $remainder
326				    set mode value:start
327				} else {
328				    return -code error "invalid attribute name \"$remainder\""
329				}
330			    } else {
331				set mode end
332			    }
333			} else {
334			    set value [string range $component 1 end]
335			    set mode value:continue
336			}
337		    }
338		    default {
339			return -code error "invalid value for attribute \"[lindex $result end]\""
340		    }
341		}
342	    }
343	    value:continue {
344		if {[regexp [format {([^%s]*)%s(.*)} $delimiter $delimiter] $component discard valuepart remainder]} {
345		    append value = $valuepart
346		    lappend result $value
347		    set remainder [string trim $remainder]
348		    if {[string length $remainder]} {
349			if {[regexp $Name $remainder]} {
350			    lappend result $remainder
351			    set mode value:start
352			} else {
353			    return -code error "invalid attribute name \"$remainder\""
354			}
355		    } else {
356			set mode end
357		    }
358		} else {
359		    append value = $component
360		}
361	    }
362	    end {
363		return -code error "unexpected data found after end of attribute list"
364	    }
365	}
366    }
367
368    switch $mode {
369	name -
370	end {
371	    # This is normal
372	}
373	default {
374	    return -code error "unexpected end of attribute list"
375	}
376    }
377
378    return $result
379}
380
381proc xml::OLDParseAttrs {attrs} {
382    variable Wsp
383    variable Name
384
385    # First check whether there's any work to do
386    if {{} eq [string trim $attrs] } {
387	return {}
388    }
389
390    # Strip the trailing slash on empty elements
391    regsub [format {/[%s]*$} " \t\n\r"] $attrs {} atList
392
393    # Protect Tcl special characters
394    #regsub -all {([[\$\\])} $atList {\\\1} atList
395    regsub -all & $atList {\&amp;} atList
396    regsub -all {\[} $atList {\&ob;} atList
397    regsub -all {\]} $atList {\&cb;} atlist
398    # NB. sgml package delivers braces and backslashes quoted
399    regsub -all {\\\{} $atList {\&oc;} atList
400    regsub -all {\\\}} $atList {\&cc;} atlist
401    regsub -all {\$} $atList {\&dollar;} atList
402    regsub -all {\\\\} $atList {\&bs;} atList
403
404    regsub -all [format {(%s)[%s]*=[%s]*"([^"]*)"} $Name $Wsp $Wsp] \
405	    $atList {[set parsed(\1) {\2}; set dummy {}] } atList	;# "
406    regsub -all [format {(%s)[%s]*=[%s]*'([^']*)'} $Name $Wsp $Wsp] \
407	    $atList {[set parsed(\1) {\2}; set dummy {}] } atList
408
409    set leftovers [subst $atList]
410
411    if {[string length [string trim $leftovers]]} {
412	return -code error "syntax error in attribute list \"$attrs\""
413    }
414
415    return [ParseAttrs:Deprotect [array get parsed]]
416}
417
418# xml::ParseAttrs:Deprotect --
419#
420#	Reverse map Tcl special characters previously protected
421#
422# Arguments:
423#	attrs	attribute list
424#
425# Results:
426#	Characters substituted
427
428proc xml::ParseAttrs:Deprotect attrs {
429
430    regsub -all &amp\; $attrs \\& attrs
431    regsub -all &ob\; $attrs \[ attrs
432    regsub -all &cb\; $attrs \] attrs
433    regsub -all &oc\; $attrs \{ attrs
434    regsub -all &cc\; $attrs \} attrs
435    regsub -all &dollar\; $attrs \$ attrs
436    regsub -all &bs\; $attrs \\\\ attrs
437
438    return $attrs
439
440}
441
442# xml::ParseCommand_reset --
443#
444#	Initialize parser data
445#
446# Arguments:
447#	object	parser object
448#
449# Results:
450#	Parser data structure initialised
451
452proc xml::ParseCommand_reset object {
453    upvar \#0 [namespace current]::$object parser
454
455    array set parser [list \
456	    -final 1		\
457	    internaldtd {}	\
458    ]
459}
460
461# xml::noop --
462#
463# A do-nothing proc
464
465proc xml::noop args {}
466
467### Following procedures are based on html_library
468
469# xml::zapWhite --
470#
471#	Convert multiple white space into a single space.
472#
473# Arguments:
474#	data	plain text
475#
476# Results:
477#	As above
478
479proc xml::zapWhite data {
480    regsub -all "\[ \t\r\n\]+" $data { } data
481    return $data
482}
483
484#
485# DTD parser for XML is wholly contained within the sgml.tcl package
486#
487
488# xml::parseDTD --
489#
490#	Entry point to the XML DTD parser.
491#
492# Arguments:
493#	dtd	XML data defining the DTD to be parsed
494#	args	configuration options
495#
496# Results:
497#	Returns a three element list, first element is the content model
498#	for each element, second element are the attribute lists of the
499#	elements and the third element is the entity map.
500
501proc xml::parseDTD {dtd args} {
502    return [eval [expr {[namespace parent] == {::} ? {} : [namespace parent]}]::sgml::parseDTD [list $dtd] $args]
503}
504
505