1/* xml_acquisition.pl : XML -> Document translation.
2 *
3 * Copyright (C) 2001-2005 Binding Time Limited
4 * Copyright (C) 2005, 2006 John Fletcher
5 *
6 * Current Release: $Revision: 1.2 $
7 *
8 * TERMS AND CONDITIONS:
9 *
10 * This program is offered free of charge, as unsupported source code. You may
11 * use it, copy it, distribute it, modify it or sell it without restriction,
12 * but entirely at your own risk.
13 */
14
15:- ensure_loaded( xml_utilities ).
16
/* xml_to_document( +Controls, +XML, ?Document ) translates the list of
 * character codes XML into the Prolog term Document. Controls is a list
 * of terms controlling the treatment of layout characters and character
 * entities; it is used to build the initial parsing context.
 *
 * When XML starts with an XML declaration, its attributes become the
 * document attributes; otherwise the attribute list is empty and parsing
 * starts at the first character. Document is xml(Attributes, Terms) for
 * well-formed input and malformed(Attributes, Terms) otherwise.
 */
xml_to_document( Controls, XML, Document ) :-
	initial_context( Controls, Context ),
	( xml_declaration( Attributes, XML, Body ) ->
		true
	; otherwise ->
		Body = XML,
		Attributes = []
	),
	xml_to_document( Body, Context, Terms, [], WellFormed ),
	xml_to_document1( WellFormed, Attributes, Terms, Document ).
32
% xml_to_document1( +WellFormed, +Attributes, +Terms, ?Document ) wraps the
% parsed Terms as xml/2 when WellFormed is true and as malformed/2 when it
% is false.
xml_to_document1( true, Attributes, Content, Document ) :-
	Document = xml( Attributes, Content ).
xml_to_document1( false, Attributes, Content, Document ) :-
	Document = malformed( Attributes, Content ).
35
% unparsed( +Unparsed, +Context, ?Terms, ?Residue, ?WellFormed ) gives up on
% the remaining input: the whole of Unparsed becomes a single unparsed/1
% term, nothing is left over as Residue and the result is flagged as not
% well-formed. Context is ignored.
unparsed( Unparsed, _Context, Terms, Residue, WellFormed ) :-
	Terms = [unparsed(Unparsed)],
	Residue = [],
	WellFormed = false.
38
% xml_declaration( ?Pairs ) parses an XML declaration: optional layout,
% "<?", the name xml, the declaration's attributes (returned as Pairs),
% optional layout and the closing "?>".
xml_declaration( Pairs ) -->
	spaces,
	"<?",
	nmtoken( xml ),
	xml_declaration_attributes( Pairs ),
	spaces,
	"?>".
46
/* xml_to_document( +Chars, +Context, ?Terms, ?Residue, ?WF ) is the main
 * content loop. It dispatches on the first character code of Chars:
 *   "<"     starts markup;
 *   "&"     starts an entity reference;
 *   layout  (code =< " ", unless the context preserves space) is collected
 *           by layouts/7 in a difference list;
 *   other   is unparsed in a void context, otherwise it begins PCDATA.
 * At end of input the context is closed.
 */
xml_to_document( [], Context, Terms, [], WF ) :-
	close_context( Context, Terms, WF ).
xml_to_document( [Code|Codes], Context, Terms, Residue, WF ) :-
	( Code =:= "<" ->
		xml_markup_structure( Codes, Context, Terms, Residue, WF )
	; Code =:= "&" ->
		entity_reference( Codes, Context, Terms, Residue, WF )
	; Code =< " ",
	  \+ space_preserve( Context ) ->
		layouts( Codes, Context, [Code|Hole], Hole, Terms, Residue, WF )
	; void_context( Context ) ->
		unparsed( [Code|Codes], Context, Terms, Residue, WF )
	; otherwise ->
		Terms = [pcdata([Code|Text])|Following],
		acquire_pcdata( Codes, Context, Text, Following, Residue, WF )
	).
63
/* layouts( +Chars, +Context, ?Layout, ?Hole, ?Terms, ?Residue, ?WF ) scans
 * a run of layout characters. Layout is the list of layout codes seen so
 * far, with the unbound tail Hole. If markup or end of input follows, the
 * collected layout is dropped. If ordinary text follows (and the context
 * is not void), the layout becomes the start of a pcdata/1 term and the
 * rest of the text is acquired with space preservation switched on.
 */
layouts( [], Context, _Layout, _Hole, Terms, [], WF ) :-
	close_context( Context, Terms, WF ).
layouts( [Code|Codes], Context, Layout, Hole, Terms, Residue, WF ) :-
	( Code =:= "<" ->
		xml_markup_structure( Codes, Context, Terms, Residue, WF )
	; Code =:= "&" ->
		reference_in_layout( Codes, Context, Layout, Hole, Terms, Residue, WF )
	; Code =< " " ->
		Hole = [Code|Hole1],
		layouts( Codes, Context, Layout, Hole1, Terms, Residue, WF )
	; void_context( Context ) ->
		unparsed( [Code|Codes], Context, Terms, Residue, WF )
	; otherwise ->
		Terms = [pcdata(Layout)|Following],
		Hole = [Code|Text],
		context_update( space_preserve, Context, true, Context1 ),
		acquire_pcdata( Codes, Context1, Text, Following, Residue, WF )
	).
82
/* acquire_pcdata( +Chars, +Context, ?Text, ?Terms, ?Residue, ?WF ) copies
 * character data from Chars into Text until markup ("<") or end of input
 * closes it. An "&" is handed to reference_in_pcdata/6, which carries on
 * filling Text. Terms are the terms that follow the character data.
 */
acquire_pcdata( [], Context, [], Terms, [], WF ) :-
	close_context( Context, Terms, WF ).
acquire_pcdata( [Code|Codes], Context, Text, Terms, Residue, WF ) :-
	( Code =:= "<" ->
		Text = [],
		xml_markup_structure( Codes, Context, Terms, Residue, WF )
	; Code =:= "&" ->
		reference_in_pcdata( Codes, Context, Text, Terms, Residue, WF )
	; otherwise ->
		Text = [Code|Text1],
		acquire_pcdata( Codes, Context, Text1, Terms, Residue, WF )
	).
95
/* xml_markup_structure( +Chars, +Context, ?Terms, ?Residue, ?WF ) handles
 * the input that follows a "<". The next character selects a closing tag
 * ("/"), a processing instruction ("?") or a declaration ("!"); anything
 * else must parse as an opening tag. Input that fits none of these is
 * returned unparsed, with the "<" restored.
 */
xml_markup_structure( [], Context, Terms, Residue, WF ) :-
	unparsed( "<", Context, Terms, Residue, WF ).
xml_markup_structure( [Code|Tail], Context, Terms, Residue, WF ) :-
	Markup = [Code|Tail],
	( Code =:= "/" ->
		closing_tag( Context, Tail, Terms, Residue, WF )
	; Code =:= "?" ->
		pi_acquisition( Tail, Context, Terms, Residue, WF )
	; Code =:= "!" ->
		declaration_acquisition( Tail, Context, Terms, Residue, WF )
	; open_tag( Tag, Context, Attributes, Type, Markup, AfterTag ) ->
		push_tag( Tag, AfterTag, Context, Attributes, Type, Terms, Residue, WF )
	; otherwise ->
		unparsed( [0'<|Markup], Context, Terms, Residue, WF ) %'
	).
111
% push_tag( +Tag, +Chars, +Context, +Attributes, +Type, ?Terms, ?Residue, ?WF )
% builds the element opened by Tag and then, depending on whether the
% element itself was well-formed, continues with its siblings.
push_tag( Tag, Chars, Context, Attributes, Type, Terms, Residue, WF ) :-
	new_element( Tag, Chars, Context, Attributes, Type, Element, After, ElementWF ),
	push_tag1( ElementWF, Context, Element, After, Terms, Residue, WF ).
115
% push_tag1( +ElementWF, +Context, +Term, +Chars, ?Terms, ?Residue, ?WF )
% After a well-formed element, parsing continues with the following
% siblings. After a malformed one, the element is the last term, the
% remaining characters are handed back and the result is not well-formed.
push_tag1( true, Context, Element, Chars, [Element|Siblings], Residue, WF ) :-
	xml_to_document( Chars, Context, Siblings, Residue, WF ).
push_tag1( false, _Context, Element, Chars, Terms, Residue, WF ) :-
	Terms = [Element],
	Residue = Chars,
	WF = false.
119
/* new_element( +TagChars, +Chars, +Context, +Attributes0, +Type, ?Term,
 *		?Residue, ?WF )
 * builds the term for an element whose opening tag has just been read.
 *
 * Namespace declarations among the attributes update the context first.
 * A "prefix:" on the tag name that resolves to a known namespace selects
 * that namespace; otherwise the whole tag name is kept and the default
 * namespace applies. If the element's namespace is the current one, Term
 * is element(Tag, Attributes, Contents); otherwise it is wrapped as
 * namespace(URI, Prefix, element(...)) and becomes the current namespace
 * for the element's contents.
 */
new_element( TagChars, Chars, Context, Attributes0, Type, Term, Residue, WF ) :-
	namespace_attributes( Attributes0, Context, Context1, Attributes1 ),
	( append( Prefix, [0':|LocalChars], TagChars ), %'
	  specific_namespace( Prefix, Context1, SpecificNamespace ) ->
		Namespace = SpecificNamespace
	; otherwise ->
		Prefix = "",
		LocalChars = TagChars,
		default_namespace( Context1, Namespace )
	),
	atom_codes( Tag, LocalChars ),
	Element = element(Tag, Attributes, Contents),
	current_namespace( Context1, CurrentNamespace ),
	( Namespace == CurrentNamespace ->
		Term = Element,
		Context2 = Context1
	; otherwise ->
		Term = namespace( Namespace, Prefix, Element ),
		context_update( current_namespace, Context1, Namespace, Context2 )
	),
	input_attributes( Attributes1, Context2, Attributes ),
	close_tag( Type, Chars, Context2, Contents, Residue, WF ).
143
% close_tag( +Type, +Chars, +Context, ?Contents, ?Residue, ?WF )
% An empty element has no contents and consumes no input. An element opened
% with push(Tag) records Tag as the context's element and parses its
% contents from Chars.
close_tag( empty, Chars, _Context, Contents, Residue, WF ) :-
	Contents = [],
	Residue = Chars,
	WF = true.
close_tag( push(Tag), Chars, Context0, Contents, Residue, WF ) :-
	context_update( element, Context0, Tag, Context1 ),
	xml_to_document( Chars, Context1, Contents, Residue, WF ).
148
% pi_acquisition( +Chars, +Context, ?Terms, ?Residue, ?WellFormed ) handles
% the input after "<?". A processing instruction whose target is not xml
% becomes instructions(Target, Processing); anything else is returned
% unparsed with the "<?" restored.
pi_acquisition( Chars, Context, Terms, Residue, WellFormed ) :-
	( inline_instruction( Target, Processing, Chars, After ),
	  Target \== xml ->
		Terms = [instructions(Target, Processing)|Following],
		xml_to_document( After, Context, Following, Residue, WellFormed )
	; otherwise ->
		unparsed( [0'<,0'?|Chars], Context, Terms, Residue, WellFormed )
	).
157
% declaration_acquisition( +Chars, +Context, ?Terms, ?Residue, ?WF ) handles
% the input after "<!". The declaration is classified and parsed; parsing
% may update the context used for the rest of the document. Anything that
% cannot be parsed is returned unparsed with the "<!" restored.
declaration_acquisition( Chars, Context, Terms, Residue, WF ) :-
	( declaration_type( Chars, Type, Body ),
	  declaration_parse( Type, Context, Term, Context1, Body, After ) ->
		Terms = [Term|Following],
		xml_to_document( After, Context1, Following, Residue, WF )
	; otherwise ->
		unparsed( [0'<,0'!|Chars], Context, Terms, Residue, WF )
	).
166
% open_tag( ?Tag, +Context, ?Attributes, ?Type ) parses an opening tag after
% its "<": the tag name (as a code list), its attributes, optional layout
% and the terminator, which determines Type.
open_tag( Tag, Context, Attributes, Type ) -->
	nmtoken_chars( Tag ),
	attributes( Attributes, [], Context ),
	spaces,
	open_tag_terminator( Tag, Type ).
172
% open_tag_terminator( +Tag, ?Type ): ">" opens an element with contents,
% giving push(Tag); "/>" ends an empty element, giving empty.
open_tag_terminator( Tag, push(Tag) ) --> ">".
open_tag_terminator( _Tag, empty ) --> "/>".
177
% declaration_parse( +Type, +Context0, ?Term, ?Context ) parses the body of
% a declaration of the given Type. Comments and CDATA sections leave the
% context unchanged; a DOCTYPE may update it and must be closed by ">".
declaration_parse( comment, Context, comment(Text), Context ) -->
	comment( Text ).
declaration_parse( cdata, Context, cdata(Text), Context ) -->
	cdata( Text ).
declaration_parse( doctype, Context0, doctype(Name, Names), Context ) -->
	doctype( Name, Names, Context0, Context ),
	spaces,
	">".
186
% inline_instruction( ?Target, ?Processing, +Plus, ?Minus ) reads a
% processing instruction from Plus: a target name, optional layout, then
% Processing, which is everything up to the first "?>". Minus is the input
% after that "?>". Only the first solution is taken.
inline_instruction( Target, Processing, Plus, Minus  ) :-
	( nmtoken( Target, Plus, AfterTarget ),
	  spaces( AfterTarget, Body ),
	  append( Processing, [0'?,0'>|Minus], Body ) ->
		true
	).
192
% entity_reference_name( ?Name ) parses the name of an entity reference (as
% a code list) together with its terminating ";".
entity_reference_name( Name ) -->
	nmtoken_chars( Name ),
	";".
196
% declaration_type( +Chars, ?Class, ?Rest ) classifies the declaration that
% follows "<!". At least two characters are required. A recognised
% introducer gives its class and the input after it; anything else is
% classed as generic, with the input untouched.
declaration_type( [First,Second|Tail], Class, Rest ) :-
	( declaration_type1( First, Second, Tail, Known, After ) ->
		Class = Known,
		Rest = After
	; otherwise ->
		Class = generic,
		Rest = [First,Second|Tail]
	).
206
% declaration_type1( +Char1, +Char2, +Chars, ?Class, ?Residue ) recognises
% a declaration from its first two characters and the following Chars:
%   --        comment
%   [CDATA[   cdata
%   DOCTYPE   doctype
% Residue is the input after the introducer.
declaration_type1( 0'-, 0'-, Residue, comment, Residue ).
declaration_type1( 0'[, 0'C, Chars, cdata, Residue ) :-
	append( "DATA[", Residue, Chars ).
declaration_type1( 0'D, 0'O, Chars, doctype, Residue ) :-
	append( "CTYPE", Residue, Chars ).
212
% closing_tag( +Context, +Chars, ?Terms, ?Residue, ?WellFormed ) handles the
% input after "</". If the tag name matches the context's current tag, the
% element's contents end here and the input after the tag is handed back;
% otherwise the input is returned unparsed with the "</" restored.
closing_tag( Context, Chars, Terms, Residue, WellFormed ) :-
	( closing_tag_name( Tag, Chars, After ),
	  current_tag( Context, Tag ) ->
		Terms = [],
		Residue = After,
		WellFormed = true
	; otherwise ->
		unparsed( [0'<,0'/|Chars], Context, Terms, Residue, WellFormed )
	).
222
% closing_tag_name( ?Tag ) parses the name in a closing tag (as a code
% list), optional layout and the closing ">".
closing_tag_name( Name ) -->
	nmtoken_chars( Name ),
	spaces,
	">".
227
% entity_reference( +Chars, +Context, ?Terms, ?Residue, ?WF ) handles an "&"
% met at the start of content: it is treated as a reference in layout with
% no layout collected so far (an empty difference list).
entity_reference( Chars, Context, Terms, Residue, WF ) :-
	reference_in_layout( Chars, Context, NoLayout, NoLayout, Terms, Residue, WF ).
230
/* reference_in_layout( +Chars, +Context, ?Layout, ?Hole, ?Terms, ?Residue,
 *		?WF )
 * handles an "&" met while collecting layout; Layout is the layout so
 * far, with the unbound tail Hole.
 *   - A character entity is appended to the layout, which then starts a
 *     pcdata/1 term.
 *   - A defined entity is replaced by its text and the result re-parsed as
 *     content; the collected layout is not used.
 *   - If the context allows bare ampersands, the "&" itself starts PCDATA.
 *   - Otherwise the input from the "&" onwards is unparsed.
 */
reference_in_layout( Chars, Context, Layout, Hole, Terms, Residue, WF ) :-
	( standard_character_entity( Code, Chars, After ) ->
		Hole = [Code|Text],
		Terms = [pcdata(Layout)|Following],
		acquire_pcdata( After, Context, Text, Following, Residue, WF )
	; entity_reference_name( Name, Chars, After ),
	  defined_entity( Name, Context, Replacement ) ->
		append( Replacement, After, Expanded ),
		xml_to_document( Expanded, Context, Terms, Residue, WF )
	; allow_ampersand( Context ) ->
		Hole = [0'&|Text], %'
		Terms = [pcdata(Layout)|Following],
		acquire_pcdata( Chars, Context, Text, Following, Residue, WF )
	; otherwise ->
		unparsed( [0'&|Chars], Context, Terms, Residue, WF ) %'
	).
247
/* reference_in_pcdata( +Chars0, +Context, ?Text, ?Terms, ?Residue, ?WF )
 * handles an "&" met inside character data; Text is the remainder of the
 * current pcdata.
 *   - A character entity contributes its character code.
 *   - A defined entity is replaced by its text, which is then read as
 *     further character data.
 *   - If the context allows bare ampersands, the "&" is kept literally.
 *   - Otherwise the pcdata ends and the input from the "&" is unparsed.
 */
reference_in_pcdata( Chars0, Context, Text, Terms, Residue, WF ) :-
	( standard_character_entity( Code, Chars0, After ) ->
		Text = [Code|Text1],
		acquire_pcdata( After, Context, Text1, Terms, Residue, WF )
	; entity_reference_name( Name, Chars0, After ),
	  defined_entity( Name, Context, Replacement ) ->
		append( Replacement, After, Expanded ),
		acquire_pcdata( Expanded, Context, Text, Terms, Residue, WF )
	; allow_ampersand( Context ) ->
		Text = [0'&|Text1],
		acquire_pcdata( Chars0, Context, Text1, Terms, Residue, WF )
	; otherwise ->
		Text = [],
		unparsed( [0'&|Chars0], Context, Terms, Residue, WF )
	).
263
/* namespace_attributes( +Attributes0, +Context0, ?Context, ?Attributes )
 * applies the namespace-related attributes of an element to the context,
 * one at a time:
 *   - xmlns="URI" sets the default namespace and is removed from the
 *     attribute list;
 *   - xmlns:prefix="URI" registers the prefix and is kept in the list;
 *   - once no xmlns attributes remain, xml:space="preserve" switches on
 *     space preservation.
 * Attribute names and values are code lists at this stage.
 */
namespace_attributes( [], Context, Context, [] ).
namespace_attributes( Attributes0, Context0, Context, Attributes ) :-
	Attributes0 = [_|_],
	append( "xmlns:", Prefix, PrefixedName ),
	( select( "xmlns"=Value, Attributes0, Remaining ) ->
		atom_codes( URI, Value ),
		context_update( default_namespace, Context0, URI, Context1 ),
		namespace_attributes( Remaining, Context1, Context, Attributes )
	; select( PrefixedName=Value, Attributes0, Remaining ) ->
		Attributes = [PrefixedName=Value|Attributes1],
		atom_codes( URI, Value ),
		context_update( ns_prefix(Prefix), Context0, URI, Context1 ),
		namespace_attributes( Remaining, Context1, Context, Attributes1 )
	; member( "xml:space"="preserve", Attributes0 ) ->
		Attributes = Attributes0,
		context_update( space_preserve, Context0, true, Context )
	; otherwise ->
		Context = Context0,
		Attributes = Attributes0
	).
284
/* input_attributes( +Attributes0, +Context, ?Attributes ) converts each
 * attribute name from a code list to an atom. When the context asks for
 * attribute prefixes to be removed, a "prefix:" that is not xmlns and that
 * denotes the current namespace is stripped from the name first.
 */
input_attributes( [], _Context, [] ).
input_attributes( [NameChars=Value|Rest0], Context, [Name=Value|Rest] ) :-
	( remove_attribute_prefixes( Context ),
	  append( Prefix, [0':|LocalChars], NameChars ), %'
	  Prefix \== "xmlns",
	  specific_namespace( Prefix, Context, Namespace ),
	  current_namespace( Context, Namespace ) ->
		atom_codes( Name, LocalChars )
	; otherwise ->
		atom_codes( Name, NameChars )
	),
	input_attributes( Rest0, Context, Rest ).
298
% attributes( ?Attributes, +Seen, +Context ) parses a possibly empty
% sequence of name="value" pairs. Seen holds the names already parsed in
% this tag; a repeated name is not accepted as a further attribute.
attributes( [Name=Value|More], Seen, Context ) -->
	spaces,
	nmtoken_chars( Name ),
	{\+ member(Name, Seen)},
	spaces,
	"=",
	spaces,
	attribute_value( Value, Context ),
	attributes( More, [Name|Seen], Context ).
attributes( [], _Seen, _Context ) --> "".
309
% xml_declaration_attributes( ?Attributes ) parses the name="value" pairs
% of an XML declaration. The empty sequence is tried first, so attributes
% are only consumed on backtracking. Each pair must satisfy
% xml_declaration_attribute_valid/2.
xml_declaration_attributes( [] ) --> "".
xml_declaration_attributes( [Name=Value|More] ) -->
	spaces,
	nmtoken( Name ),
	spaces,
	"=",
	spaces,
	xml_string( Value ),
	{xml_declaration_attribute_valid(Name, Value)},
	xml_declaration_attributes( More ),
	spaces.
321
% doctype( ?Name, ?External, +Context0, ?Context ) parses the body of a
% DOCTYPE declaration: the document type name, an optional external
% identifier and an optional internal subset in "[...]". Literals collected
% from the internal subset are attached to the external identifier.
doctype( Name, External, Context0, Context ) -->
	spaces,
	nmtoken( Name ),
	spaces,
	doctype_id( Id ),
	spaces,
	doctype1( Context0, Literals, Context ),
	{doctype_extension(Literals, Id, External)}.
330
% doctype_extension( +Literals, +External0, ?External ): with no DTD
% literals the external identifier is returned unchanged; with at least one
% it is extended to carry them.
doctype_extension( [], External, External ).
doctype_extension( Literals, External0, External ) :-
	Literals = [_|_],
	extended_doctype( External0, Literals, External ).
334
% extended_doctype( +External0, +Literals, ?External ) adds the list of DTD
% literals as an extra, final argument of the external identifier.
extended_doctype( system(URL), Literals, Extended ) :-
	Extended = system(URL,Literals).
extended_doctype( public(URN,URL), Literals, Extended ) :-
	Extended = public(URN,URL,Literals).
extended_doctype( local, Literals, Extended ) :-
	Extended = local(Literals).
338
% doctype1( +Context0, ?Literals, ?Context ) parses the optional internal
% subset of a DOCTYPE. Once "[" has been seen the subset must parse and be
% closed by "]". Without a subset there are no literals and the context is
% unchanged.
doctype1( Context0, Literals, Context ) -->
	"[",
	!,
	dtd( Context0, Literals, Context ),
	"]".
doctype1( Context, [], Context ) --> "".
345
% doctype_id( ?Id ) parses the external identifier of a DOCTYPE:
%   SYSTEM "url"        gives system(URL)
%   PUBLIC "urn" "url"  gives public(URN,URL)
%   nothing             gives local
doctype_id( system(URL) ) -->
	"SYSTEM",
	spaces,
	uri( URL ).
doctype_id( public(URN,URL) ) -->
	"PUBLIC",
	spaces,
	uri( URN ),
	spaces,
	uri( URL ).
doctype_id( local ) --> "".
357
/* dtd( +Context0, ?Literals, ?Context ) parses the internal subset of a
 * DOCTYPE declaration, one item at a time:
 *   <!ENTITY name "value">  defines an entity in the context;
 *   <!-- ... -->            is skipped;
 *   any other <! ... >      is collected as dtd_literal(Chars).
 * Each introducer commits to its clause. The subset ends at optional
 * trailing layout.
 */
dtd( Context0, Literals, Context ) -->
	spaces,
	"<!ENTITY",
	!,
	spaces,
	nmtoken_chars( Name ),
	spaces,
	quote( Quote ),
	entity_value( Quote, Context0, Value ),
	spaces,
	">",
	{\+ character_entity( Name, _StandardChar ),
	 % Don't allow &lt; &quot; etc. to be redefined
	 context_update( entity(Name), Context0, Value, Context1 )
	 },
	dtd( Context1, Literals, Context ).
dtd( Context0, Literals, Context ) -->
	spaces,
	"<!--",
	!,
	dtd_comment,
	">",
	dtd( Context0, Literals, Context ).
dtd( Context0, [dtd_literal(Literal)|Literals], Context ) -->
	spaces,
	"<!",
	!,
	dtd_literal( Literal ),
	dtd( Context0, Literals, Context ).
dtd( Context, [], Context ) --> spaces.
388
% dtd_literal( ?Chars ) collects the text of a DTD declaration up to, but
% not including, its closing ">". Comments delimited by "--" inside the
% declaration are skipped and do not appear in Chars.
dtd_literal( [] ) --> ">", !.
dtd_literal( Text ) -->
	"--",
	!,
	dtd_comment,
	dtd_literal( Text ).
dtd_literal( [Code|Text] ) -->
	[Code],
	dtd_literal( Text ).
398
% dtd_comment( +Plus, ?Minus ) skips comment text: Minus is the input that
% follows the first "--" in Plus. Only that first split is considered.
dtd_comment( Plus, Minus ) :-
	( append( _Skipped, [0'-,0'-|Minus], Plus ) ->
		true
	).
402
% nmtokens( ?Names ) parses a possibly empty sequence of name tokens, each
% preceded by optional layout, as a list of atoms.
nmtokens( [Token|Tokens] ) -->
	spaces,
	nmtoken( Token ),
	nmtokens( Tokens ).
nmtokens( [] ) --> [].
408
% entity_value( +Quote, +Context, ?String, +Plus, ?Minus ) reads the value
% of an entity definition from Plus up to the closing Quote; Minus is the
% input after that quote. References inside the value are handled by
% reference_in_entity/5. There is no clause for exhausted input, so an
% unterminated value fails.
entity_value( Quote, Context, String, [Code|Input], Rest ) :-
	( Code == Quote ->
		String = [],
		Rest = Input
	; Code =:= "&" ->
		reference_in_entity( Context, Quote, String, Input, Rest )
	; otherwise ->
		String = [Code|String1],
		entity_value( Quote, Context, String1, Input, Rest )
	).
419
% attribute_value( ?String, +Context ) parses a quoted attribute value. The
% opening quote character determines the closing one.
attribute_value( Value, Context ) -->
	quote( Quote ),
	attribute_leading_layouts( Quote, Context, Value ).
423
/* attribute_leading_layouts( +Quote, +Context, ?String, +Plus, ?Minus )
 * reads an attribute value, discarding layout (codes =< 32 and 160) at its
 * start. The first other character switches to attribute_layouts/6.
 * Exhausted input is accepted as the end of the value.
 */
attribute_leading_layouts( _Quote, _Context, [], [], [] ).
attribute_leading_layouts( Quote, Context, String, [Code|Input], Rest ) :-
	( Code == Quote ->
		String = [],
		Rest = Input
	; Code =:= "&" ->
		ref_in_attribute_layout( Context, Quote, String, Input, Rest )
	; Code > 32, Code \== 160 ->
		String = [Code|String1],
		attribute_layouts( Quote, Context, false, String1, Input, Rest )
	; otherwise ->
		attribute_leading_layouts( Quote, Context, String, Input, Rest )
	).
437
/* attribute_layouts( +Quote, +Context, +Layout, ?String, +Plus, ?Minus )
 * reads the rest of an attribute value up to the closing Quote. Layout is
 * true when layout (codes =< 32 or 160) has been seen since the last
 * ordinary character. Such a run becomes a single space when another
 * ordinary character follows, and is dropped at the end of the value.
 * Exhausted input is accepted as the end of the value.
 */
attribute_layouts( _Quote, _Context, _Layout, [], [], [] ).
attribute_layouts( Quote, Context, Layout, String, [Code|Input], Rest ) :-
	( Code == Quote ->
		String = [],
		Rest = Input
	; Code =:= "&" ->
		reference_in_value( Context, Quote, Layout, String, Input, Rest )
	; Code > 32, Code \== 160 ->
		( Layout == true ->
			String = [0' ,Code|String1] %'
		; otherwise ->
			String = [Code|String1]
		),
		attribute_layouts( Quote, Context, false, String1, Input, Rest )
	; otherwise ->
		attribute_layouts( Quote, Context, true, String, Input, Rest )
	).
455
/* ref_in_attribute_layout( +NS, +Quote, ?String, +Plus, ?Minus ) handles
 * an "&" met while skipping the leading layout of an attribute value.
 *   - A character entity contributes its code and ends the leading layout.
 *   - A defined entity is replaced by its text, which is scanned as if it
 *     were still at the start of the value.
 *   - Any other "&" is kept literally.
 */
ref_in_attribute_layout( NS, Quote, String, Plus, Minus ) :-
	( standard_character_entity( Code, Plus, After ) ->
		String = [Code|String1],
		attribute_layouts( Quote, NS, false,  String1, After, Minus )
	; entity_reference_name( Name, Plus, Suffix ),
	  defined_entity( Name, NS, Replacement ) ->
		append( Replacement, Suffix, Expanded ),
		attribute_leading_layouts( Quote, NS, String, Expanded, Minus )
	; otherwise -> % Just & is okay in a value
		String = [0'&|String1], %'
		attribute_layouts( Quote, NS, false, String1, Plus, Minus )
	).
468
/* reference_in_value( +Context, +Quote, +Layout, ?String, +Plus, ?Minus )
 * handles an "&" met inside an attribute value, after its first ordinary
 * character.
 *   - A character entity contributes its code, preceded by a single space
 *     if layout was pending.
 *   - A defined entity is replaced by its text; pending layout carries
 *     over into the replacement.
 *   - Any other "&" is kept literally.
 * Scanning then continues with attribute_layouts/6.
 */
reference_in_value( Context, Quote, Layout, String, Plus, Minus ) :-
	( standard_character_entity( Code, Plus, Next ) ->
		( Layout == true ->
			String = [0' ,Code|String1] %'
		; otherwise ->
			String = [Code|String1]
		),
		Layout1 = false
	; entity_reference_name( Name, Plus, Suffix ),
	  defined_entity( Name, Context, Replacement ) ->
		String = String1,
		append( Replacement, Suffix, Next ),
		Layout1 = Layout
	; otherwise -> % Just & is okay in a value
		Next = Plus,
		String = [0'&|String1], %'
		Layout1 = false
	),
	attribute_layouts( Quote, Context, Layout1, String1, Next, Minus ).
488
/* References are resolved backwards in Entity definitions so that
 * circularity is avoided.
 *
 * reference_in_entity( +Context, +Quote, ?String, +Plus, ?Minus ) handles
 * an "&" inside an entity value. A character entity is left unexpanded:
 * only the "&" is copied here and the rest of the reference is read as
 * ordinary text. A reference to an already defined entity is replaced by
 * that entity's text. Any other reference makes the predicate fail.
 */
reference_in_entity( Context, Quote, String, Plus, Minus ) :-
	( standard_character_entity( _SomeChar, Plus, _Rest ) ->
		String = [0'&|String1], % ' Character entities are unparsed
		Next = Plus
	; entity_reference_name( Name, Plus, Suffix ),
	  defined_entity( Name, Context, Replacement ) ->
		String = String1,
		append( Replacement, Suffix, Next )
	),
	entity_value( Quote, Context, String1, Next, Minus ).
502
/* standard_character_entity( ?Char ) parses the text of a character
 * reference that follows the "&", including the terminating ";":
 *   #xHHHH;  a hexadecimal character reference;
 *   #DDDD;   a decimal character reference (at least one digit);
 *   name;    one of the names listed in character_entity/2.
 * Char is the referenced character code.
 */
standard_character_entity( Char ) -->
	"#x", hex_character_reference( Char ), ";".
standard_character_entity( Char ) -->
	"#", digit( Digit ), digits( Digits ), ";",
	% The digits are character codes, so they must be converted with
	% number_codes/2; ISO number_chars/2 expects one-char atoms. This
	% matches the use of atom_codes/2 elsewhere in this file.
	{number_codes( Char, [Digit|Digits] )}.
standard_character_entity( C ) -->
	chars( String ),
	";",
	!,
	{character_entity(String, C)}.
513
% uri( ?URI ) parses a quoted literal and returns the codes between the
% quotes. The opening quote character determines the closing one.
uri( Codes ) -->
	quote( Quote ),
	uri1( Quote, Codes ).
517
% uri1( +Quote, ?Chars ) collects character codes up to the first
% occurrence of Quote, which is consumed.
uri1( Quote, [] ) -->
	quote( Quote ),
	!.
uri1( Quote, [Code|Codes] ) -->
	[Code],
	uri1( Quote, Codes ).
524
% comment( ?Chars, +Plus, ?Minus ): Chars is the comment text in Plus up to
% the first "-->" and Minus is the input after it. Only that first split
% is considered.
comment( Chars, Plus, Minus ) :-
	( append( Chars, [0'-,0'-,0'>|Minus], Plus ) -> %'
		true
	).
528
% cdata( ?Chars, +Plus, ?Minus ): Chars is the CDATA text in Plus up to the
% first "]]>" and Minus is the input after it. Only that first split is
% considered.
cdata( Chars, Plus, Minus ) :-
	( append( Chars, [0'],0'],0'>|Minus], Plus ) -> %'
		true
	).
532% Syntax Components
533
% hex_character_reference( ?Code ) parses one or more hexadecimal digits
% and returns their value. At least one digit is required, so that an
% empty reference such as "&#x;" is not read as character code 0.
hex_character_reference( Code ) -->
	hex_digit_char( First ),
	!,
	hex_character_reference1( First, Code ).

% hex_character_reference1( +Current, ?Code ) folds any further
% hexadecimal digits into the value accumulated so far.
hex_character_reference1( Current, Code ) -->
	hex_digit_char( Value ),
	!,
	{New is (Current << 4) + Value},
	hex_character_reference1( New, Code ).
hex_character_reference1( Code, Code ) --> "".

% hex_digit_char( ?Value ) parses one hexadecimal digit, in either case.
hex_digit_char( 0 ) --> "0".
hex_digit_char( 1 ) --> "1".
hex_digit_char( 2 ) --> "2".
hex_digit_char( 3 ) --> "3".
hex_digit_char( 4 ) --> "4".
hex_digit_char( 5 ) --> "5".
hex_digit_char( 6 ) --> "6".
hex_digit_char( 7 ) --> "7".
hex_digit_char( 8 ) --> "8".
hex_digit_char( 9 ) --> "9".
hex_digit_char( 10 ) --> "A".
hex_digit_char( 11 ) --> "B".
hex_digit_char( 12 ) --> "C".
hex_digit_char( 13 ) --> "D".
hex_digit_char( 14 ) --> "E".
hex_digit_char( 15 ) --> "F".
hex_digit_char( 10 ) --> "a".
hex_digit_char( 11 ) --> "b".
hex_digit_char( 12 ) --> "c".
hex_digit_char( 13 ) --> "d".
hex_digit_char( 14 ) --> "e".
hex_digit_char( 15 ) --> "f".
566
% quote( ?Quote ) parses a double or a single quote character and returns
% its character code.
quote( 0'" ) --> """". %'
quote( 0'' ) --> "'".
571
% spaces( +Chars0, ?Chars1 ) skips leading layout: Chars1 is Chars0 without
% its leading character codes =< 32. Written as a plain predicate so that
% it can be used as a DCG non-terminal.
spaces( [], [] ).
spaces( [Code|Codes], Rest ) :-
	Code =< 32,
	!,
	spaces( Codes, Rest ).
spaces( [Code|Codes], [Code|Codes] ).
579
% nmtoken( ?Name ) parses a name token and returns it as an atom.
nmtoken( Name ) -->
	nmtoken_chars( Codes ),
	{atom_codes(Name, Codes)}.
583
% nmtoken_chars( ?Chars ) parses a name token as a code list: one character
% accepted by nmtoken_first/1, then the longest run of characters accepted
% by nmtoken_char/1.
nmtoken_chars( [First|Tail] ) -->
	[First],
	{nmtoken_first( First )},
	nmtoken_chars_tail( Tail ).
588
% nmtoken_chars_tail( ?Chars ) parses the longest, possibly empty, run of
% name characters. The cut commits to each character taken, so a shorter
% run is never tried on backtracking.
nmtoken_chars_tail( [Code|Codes] ) -->
	[Code],
	{nmtoken_char(Code)},
	!,
	nmtoken_chars_tail( Codes ).
nmtoken_chars_tail([]) --> "".
595
% nmtoken_first( ?Char ): Char may start a name token, i.e. it is ":", "_"
% or a letter listed in alphabet/1.
nmtoken_first( 0': ).
nmtoken_first( 0'_ ).
nmtoken_first( Letter ) :- alphabet( Letter ).
600
% nmtoken_char( ?Char ): Char may appear inside a name token. The table
% covers the unaccented letters a-z and A-Z, the digits 0-9 and the
% punctuation characters ".", "-", "_" and ":".
nmtoken_char( 0'a ). nmtoken_char( 0'b ). nmtoken_char( 0'c ).
nmtoken_char( 0'd ). nmtoken_char( 0'e ). nmtoken_char( 0'f ).
nmtoken_char( 0'g ). nmtoken_char( 0'h ). nmtoken_char( 0'i ).
nmtoken_char( 0'j ). nmtoken_char( 0'k ). nmtoken_char( 0'l ).
nmtoken_char( 0'm ). nmtoken_char( 0'n ). nmtoken_char( 0'o ).
nmtoken_char( 0'p ). nmtoken_char( 0'q ). nmtoken_char( 0'r ).
nmtoken_char( 0's ). nmtoken_char( 0't ). nmtoken_char( 0'u ).
nmtoken_char( 0'v ). nmtoken_char( 0'w ). nmtoken_char( 0'x ).
nmtoken_char( 0'y ). nmtoken_char( 0'z ).
nmtoken_char( 0'A ). nmtoken_char( 0'B ). nmtoken_char( 0'C ).
nmtoken_char( 0'D ). nmtoken_char( 0'E ). nmtoken_char( 0'F ).
nmtoken_char( 0'G ). nmtoken_char( 0'H ). nmtoken_char( 0'I ).
nmtoken_char( 0'J ). nmtoken_char( 0'K ). nmtoken_char( 0'L ).
nmtoken_char( 0'M ). nmtoken_char( 0'N ). nmtoken_char( 0'O ).
nmtoken_char( 0'P ). nmtoken_char( 0'Q ). nmtoken_char( 0'R ).
nmtoken_char( 0'S ). nmtoken_char( 0'T ). nmtoken_char( 0'U ).
nmtoken_char( 0'V ). nmtoken_char( 0'W ). nmtoken_char( 0'X ).
nmtoken_char( 0'Y ). nmtoken_char( 0'Z ).
nmtoken_char( 0'0 ). nmtoken_char( 0'1 ). nmtoken_char( 0'2 ).
nmtoken_char( 0'3 ). nmtoken_char( 0'4 ). nmtoken_char( 0'5 ).
nmtoken_char( 0'6 ). nmtoken_char( 0'7 ). nmtoken_char( 0'8 ).
nmtoken_char( 0'9 ).
nmtoken_char( 0'. ).
nmtoken_char( 0'- ).
nmtoken_char( 0'_ ).
nmtoken_char( 0': ).
667
% xml_string( ?String ) parses a quoted string and returns the codes
% between the quotes. The opening quote character determines the closing
% one.
xml_string( Codes ) -->
	quote( Quote ),
	xml_string1( Quote, Codes ).
671
% xml_string1( +Quote, ?Chars ) collects character codes up to the first
% occurrence of Quote, which is consumed.
xml_string1( Quote, [] ) -->
	quote( Quote ),
	!.
xml_string1( Quote, [Code|Codes] ) -->
	[Code],
	xml_string1( Quote, Codes ).
678
% alphabet( ?Char ): Char is the code of an unaccented letter, a-z or A-Z.
alphabet( 0'a ). alphabet( 0'b ). alphabet( 0'c ). alphabet( 0'd ).
alphabet( 0'e ). alphabet( 0'f ). alphabet( 0'g ). alphabet( 0'h ).
alphabet( 0'i ). alphabet( 0'j ). alphabet( 0'k ). alphabet( 0'l ).
alphabet( 0'm ). alphabet( 0'n ). alphabet( 0'o ). alphabet( 0'p ).
alphabet( 0'q ). alphabet( 0'r ). alphabet( 0's ). alphabet( 0't ).
alphabet( 0'u ). alphabet( 0'v ). alphabet( 0'w ). alphabet( 0'x ).
alphabet( 0'y ). alphabet( 0'z ).
alphabet( 0'A ). alphabet( 0'B ). alphabet( 0'C ). alphabet( 0'D ).
alphabet( 0'E ). alphabet( 0'F ). alphabet( 0'G ). alphabet( 0'H ).
alphabet( 0'I ). alphabet( 0'J ). alphabet( 0'K ). alphabet( 0'L ).
alphabet( 0'M ). alphabet( 0'N ). alphabet( 0'O ). alphabet( 0'P ).
alphabet( 0'Q ). alphabet( 0'R ). alphabet( 0'S ). alphabet( 0'T ).
alphabet( 0'U ). alphabet( 0'V ). alphabet( 0'W ). alphabet( 0'X ).
alphabet( 0'Y ). alphabet( 0'Z ).
731
% digit( ?C ) parses one decimal digit and returns its character code.
digit( Code ) -->
	[Code],
	{digit_table( Code )}.

% digit_table( ?C ): C is the character code of a decimal digit.
digit_table( 0'0 ). digit_table( 0'1 ). digit_table( 0'2 ).
digit_table( 0'3 ). digit_table( 0'4 ). digit_table( 0'5 ).
digit_table( 0'6 ). digit_table( 0'7 ). digit_table( 0'8 ).
digit_table( 0'9 ).

% digits( ?Digits ) parses a possibly empty run of decimal digits, longest
% first; shorter runs remain available on backtracking.
digits( [First|Others] ) -->
	digit( First ),
	digits( Others ).
digits( [] ) --> [].
749
% character_entity( ?Name, ?Char ): Name is the name, as a code list, of
% one of the five predefined XML character entities and Char is the code
% of the character it stands for. The table is also consulted by dtd//3,
% which refuses to redefine these names.
character_entity( "quot", 0'" ). %'
character_entity( "amp", 0'&  ). %'
character_entity( "lt", 0'< ). %'
character_entity( "gt", 0'> ). %'
character_entity( "apos", 0'' ).
755
756end_of_file.
757
/* For reference, this is a comprehensive recognizer for namechar, based on
 * its definition in http://www.w3.org/TR/2000/REC-xml-20001006 .
 */
761namechar -->
762	( letter
763	| unicode_digit
764	|  "."
765	|  "-"
766	|  "_"
767	|  ":"
768	|  combiningchar
769	|  extender
770	).
771
772letter  --> (basechar | ideographic).
773
774basechar  -->
775	( range( 16'0041, 16'005A )
776	| range( 16'0061, 16'007A )
777	| range( 16'00C0, 16'00D6 )
778	| range( 16'00D8, 16'00F6 )
779	| range( 16'00F8, 16'00FF )
780	| range( 16'0100, 16'0131 )
781	| range( 16'0134, 16'013E )
782	| range( 16'0141, 16'0148 )
783	| range( 16'014A, 16'017E )
784	| range( 16'0180, 16'01C3 )
785	| range( 16'01CD, 16'01F0 )
786	| range( 16'01F4, 16'01F5 )
787	| range( 16'01FA, 16'0217 )
788	| range( 16'0250, 16'02A8 )
789	| range( 16'02BB, 16'02C1 )
790	| [16'0386]
791	| range( 16'0388, 16'038A )
792	| [16'038C]
793	| range( 16'038E, 16'03A1 )
794	| range( 16'03A3, 16'03CE )
795	| range( 16'03D0, 16'03D6 )
796	| [16'03DA]
797	| [16'03DC]
798	| [16'03DE]
799	| [16'03E0]
800	| range( 16'03E2, 16'03F3 )
801	| range( 16'0401, 16'040C )
802	| range( 16'040E, 16'044F )
803	| range( 16'0451, 16'045C )
804	| range( 16'045E, 16'0481 )
805	| range( 16'0490, 16'04C4 )
806	| range( 16'04C7, 16'04C8 )
807	| range( 16'04CB, 16'04CC )
808	| range( 16'04D0, 16'04EB )
809	| range( 16'04EE, 16'04F5 )
810	| range( 16'04F8, 16'04F9 )
811	| range( 16'0531, 16'0556 )
812	| [16'0559]
813	| range( 16'0561, 16'0586 )
814	| range( 16'05D0, 16'05EA )
815	| range( 16'05F0, 16'05F2 )
816	| range( 16'0621, 16'063A )
817	| range( 16'0641, 16'064A )
818	| range( 16'0671, 16'06B7 )
819	| range( 16'06BA, 16'06BE )
820	| range( 16'06C0, 16'06CE )
821	| range( 16'06D0, 16'06D3 )
822	| [16'06D5]
823	| range( 16'06E5, 16'06E6 )
824	| range( 16'0905, 16'0939 )
825	| [16'093D]
826	| range( 16'0958, 16'0961 )
827	| range( 16'0985, 16'098C )
828	| range( 16'098F, 16'0990 )
829	| range( 16'0993, 16'09A8 )
830	| range( 16'09AA, 16'09B0 )
831	| [16'09B2]
832	| range( 16'09B6, 16'09B9 )
833	| range( 16'09DC, 16'09DD )
834	| range( 16'09DF, 16'09E1 )
835	| range( 16'09F0, 16'09F1 )
836	| range( 16'0A05, 16'0A0A )
837	| range( 16'0A0F, 16'0A10 )
838	| range( 16'0A13, 16'0A28 )
839	| range( 16'0A2A, 16'0A30 )
840	| range( 16'0A32, 16'0A33 )
841	| range( 16'0A35, 16'0A36 )
842	| range( 16'0A38, 16'0A39 )
843	| range( 16'0A59, 16'0A5C )
844	| [16'0A5E]
845	| range( 16'0A72, 16'0A74 )
846	| range( 16'0A85, 16'0A8B )
847	| [16'0A8D]
848	| range( 16'0A8F, 16'0A91 )
849	| range( 16'0A93, 16'0AA8 )
850	| range( 16'0AAA, 16'0AB0 )
851	| range( 16'0AB2, 16'0AB3 )
852	| range( 16'0AB5, 16'0AB9 )
853	| [16'0ABD]
854	| [16'0AE0]
855	| range( 16'0B05, 16'0B0C )
856	| range( 16'0B0F, 16'0B10 )
857	| range( 16'0B13, 16'0B28 )
858	| range( 16'0B2A, 16'0B30 )
859	| range( 16'0B32, 16'0B33 )
860	| range( 16'0B36, 16'0B39 )
861	| [16'0B3D]
862	| range( 16'0B5C, 16'0B5D )
863	| range( 16'0B5F, 16'0B61 )
864	| range( 16'0B85, 16'0B8A )
865	| range( 16'0B8E, 16'0B90 )
866	| range( 16'0B92, 16'0B95 )
867	| range( 16'0B99, 16'0B9A )
868	| [16'0B9C]
869	| range( 16'0B9E, 16'0B9F )
870	| range( 16'0BA3, 16'0BA4 )
871	| range( 16'0BA8, 16'0BAA )
872	| range( 16'0BAE, 16'0BB5 )
873	| range( 16'0BB7, 16'0BB9 )
874	| range( 16'0C05, 16'0C0C )
875	| range( 16'0C0E, 16'0C10 )
876	| range( 16'0C12, 16'0C28 )
877	| range( 16'0C2A, 16'0C33 )
878	| range( 16'0C35, 16'0C39 )
879	| range( 16'0C60, 16'0C61 )
880	| range( 16'0C85, 16'0C8C )
881	| range( 16'0C8E, 16'0C90 )
882	| range( 16'0C92, 16'0CA8 )
883	| range( 16'0CAA, 16'0CB3 )
884	| range( 16'0CB5, 16'0CB9 )
885	| [16'0CDE]
886	| range( 16'0CE0, 16'0CE1 )
887	| range( 16'0D05, 16'0D0C )
888	| range( 16'0D0E, 16'0D10 )
889	| range( 16'0D12, 16'0D28 )
890	| range( 16'0D2A, 16'0D39 )
891	| range( 16'0D60, 16'0D61 )
892	| range( 16'0E01, 16'0E2E )
893	| [16'0E30]
894	| range( 16'0E32, 16'0E33 )
895	| range( 16'0E40, 16'0E45 )
896	| range( 16'0E81, 16'0E82 )
897	| [16'0E84]
898	| range( 16'0E87, 16'0E88 )
899	| [16'0E8A]
900	| [16'0E8D]
901	| range( 16'0E94, 16'0E97 )
902	| range( 16'0E99, 16'0E9F )
903	| range( 16'0EA1, 16'0EA3 )
904	| [16'0EA5]
905	| [16'0EA7]
906	| range( 16'0EAA, 16'0EAB )
907	| range( 16'0EAD, 16'0EAE )
908	| [16'0EB0]
909	| range( 16'0EB2, 16'0EB3 )
910	| [16'0EBD]
911	| range( 16'0EC0, 16'0EC4 )
912	| range( 16'0F40, 16'0F47 )
913	| range( 16'0F49, 16'0F69 )
914	| range( 16'10A0, 16'10C5 )
915	| range( 16'10D0, 16'10F6 )
916	| [16'1100]
917	| range( 16'1102, 16'1103 )
918	| range( 16'1105, 16'1107 )
919	| [16'1109]
920	| range( 16'110B, 16'110C )
921	| range( 16'110E, 16'1112 )
922	| [16'113C]
923	| [16'113E]
924	| [16'1140]
925	| [16'114C]
926	| [16'114E]
927	| [16'1150]
928	| range( 16'1154, 16'1155 )
929	| [16'1159]
930	| range( 16'115F, 16'1161 )
931	| [16'1163]
932	| [16'1165]
933	| [16'1167]
934	| [16'1169]
935	| range( 16'116D, 16'116E )
936	| range( 16'1172, 16'1173 )
937	| [16'1175]
938	| [16'119E]
939	| [16'11A8]
940	| [16'11AB]
941	| range( 16'11AE, 16'11AF )
942	| range( 16'11B7, 16'11B8 )
943	| [16'11BA]
944	| range( 16'11BC, 16'11C2 )
945	| [16'11EB]
946	| [16'11F0]
947	| [16'11F9]
948	| range( 16'1E00, 16'1E9B )
949	| range( 16'1EA0, 16'1EF9 )
950	| range( 16'1F00, 16'1F15 )
951	| range( 16'1F18, 16'1F1D )
952	| range( 16'1F20, 16'1F45 )
953	| range( 16'1F48, 16'1F4D )
954	| range( 16'1F50, 16'1F57 )
955	| [16'1F59]
956	| [16'1F5B]
957	| [16'1F5D]
958	| range( 16'1F5F, 16'1F7D )
959	| range( 16'1F80, 16'1FB4 )
960	| range( 16'1FB6, 16'1FBC )
961	| [16'1FBE]
962	| range( 16'1FC2, 16'1FC4 )
963	| range( 16'1FC6, 16'1FCC )
964	| range( 16'1FD0, 16'1FD3 )
965	| range( 16'1FD6, 16'1FDB )
966	| range( 16'1FE0, 16'1FEC )
967	| range( 16'1FF2, 16'1FF4 )
968	| range( 16'1FF6, 16'1FFC )
969	| [16'2126]
970	| range( 16'212A, 16'212B )
971	| [16'212E]
972	| range( 16'2180, 16'2182 )
973	| range( 16'3041, 16'3094 )
974	| range( 16'30A1, 16'30FA )
975	| range( 16'3105, 16'312C )
976	| range( 16'AC00, 16'D7A3 )
977	).
978ideographic  -->
979	( range( 16'4E00, 16'9FA5 )
980	| [16'3007]
981	| range( 16'3021, 16'3029 )
982	).
983combiningchar  -->
984	( range( 16'0300, 16'0345 )
985	| range( 16'0360, 16'0361 )
986	| range( 16'0483, 16'0486 )
987	| range( 16'0591, 16'05A1 )
988	| range( 16'05A3, 16'05B9 )
989	| range( 16'05BB, 16'05BD )
990	| [16'05BF]
991	| range( 16'05C1, 16'05C2 )
992	| [16'05C4]
993	| range( 16'064B, 16'0652 )
994	| [16'0670]
995	| range( 16'06D6, 16'06DC )
996	| range( 16'06DD, 16'06DF )
997	| range( 16'06E0, 16'06E4 )
998	| range( 16'06E7, 16'06E8 )
999	| range( 16'06EA, 16'06ED )
1000	| range( 16'0901, 16'0903 )
1001	| [16'093C]
1002	| range( 16'093E, 16'094C )
1003	| [16'094D]
1004	| range( 16'0951, 16'0954 )
1005	| range( 16'0962, 16'0963 )
1006	| range( 16'0981, 16'0983 )
1007	| [16'09BC]
1008	| [16'09BE]
1009	| [16'09BF]
1010	| range( 16'09C0, 16'09C4 )
1011	| range( 16'09C7, 16'09C8 )
1012	| range( 16'09CB, 16'09CD )
1013	| [16'09D7]
1014	| range( 16'09E2, 16'09E3 )
1015	| [16'0A02]
1016	| [16'0A3C]
1017	| [16'0A3E]
1018	| [16'0A3F]
1019	| range( 16'0A40, 16'0A42 )
1020	| range( 16'0A47, 16'0A48 )
1021	| range( 16'0A4B, 16'0A4D )
1022	| range( 16'0A70, 16'0A71 )
1023	| range( 16'0A81, 16'0A83 )
1024	| [16'0ABC]
1025	| range( 16'0ABE, 16'0AC5 )
1026	| range( 16'0AC7, 16'0AC9 )
1027	| range( 16'0ACB, 16'0ACD )
1028	| range( 16'0B01, 16'0B03 )
1029	| [16'0B3C]
1030	| range( 16'0B3E, 16'0B43 )
1031	| range( 16'0B47, 16'0B48 )
1032	| range( 16'0B4B, 16'0B4D )
1033	| range( 16'0B56, 16'0B57 )
1034	| range( 16'0B82, 16'0B83 )
1035	| range( 16'0BBE, 16'0BC2 )
1036	| range( 16'0BC6, 16'0BC8 )
1037	| range( 16'0BCA, 16'0BCD )
1038	| [16'0BD7]
1039	| range( 16'0C01, 16'0C03 )
1040	| range( 16'0C3E, 16'0C44 )
1041	| range( 16'0C46, 16'0C48 )
1042	| range( 16'0C4A, 16'0C4D )
1043	| range( 16'0C55, 16'0C56 )
1044	| range( 16'0C82, 16'0C83 )
1045	| range( 16'0CBE, 16'0CC4 )
1046	| range( 16'0CC6, 16'0CC8 )
1047	| range( 16'0CCA, 16'0CCD )
1048	| range( 16'0CD5, 16'0CD6 )
1049	| range( 16'0D02, 16'0D03 )
1050	| range( 16'0D3E, 16'0D43 )
1051	| range( 16'0D46, 16'0D48 )
1052	| range( 16'0D4A, 16'0D4D )
1053	| [16'0D57]
1054	| [16'0E31]
1055	| range( 16'0E34, 16'0E3A )
1056	| range( 16'0E47, 16'0E4E )
1057	| [16'0EB1]
1058	| range( 16'0EB4, 16'0EB9 )
1059	| range( 16'0EBB, 16'0EBC )
1060	| range( 16'0EC8, 16'0ECD )
1061	| range( 16'0F18, 16'0F19 )
1062	| [16'0F35]
1063	| [16'0F37]
1064	| [16'0F39]
1065	| [16'0F3E]
1066	| [16'0F3F]
1067	| range( 16'0F71, 16'0F84 )
1068	| range( 16'0F86, 16'0F8B )
1069	| range( 16'0F90, 16'0F95 )
1070	| [16'0F97]
1071	| range( 16'0F99, 16'0FAD )
1072	| range( 16'0FB1, 16'0FB7 )
1073	| [16'0FB9]
1074	| range( 16'20D0, 16'20DC )
1075	| [16'20E1]
1076	| range( 16'302A, 16'302F )
1077	| [16'3099]
1078	| [16'309A]
1079	).
1080
1081unicode_digit  -->
1082	( range( 16'0030, 16'0039 )
1083	| range( 16'0660, 16'0669 )
1084	| range( 16'06F0, 16'06F9 )
1085	| range( 16'0966, 16'096F )
1086	| range( 16'09E6, 16'09EF )
1087	| range( 16'0A66, 16'0A6F )
1088	| range( 16'0AE6, 16'0AEF )
1089	| range( 16'0B66, 16'0B6F )
1090	| range( 16'0BE7, 16'0BEF )
1091	| range( 16'0C66, 16'0C6F )
1092	| range( 16'0CE6, 16'0CEF )
1093	| range( 16'0D66, 16'0D6F )
1094	| range( 16'0E50, 16'0E59 )
1095	| range( 16'0ED0, 16'0ED9 )
1096	| range( 16'0F20, 16'0F29 )
1097	).
1098
1099extender  -->
1100	( [16'00B7]
1101	| [16'02D0]
1102	| [16'02D1]
1103	| [16'0387]
1104	| [16'0640]
1105	| [16'0E46]
1106	| [16'0EC6]
1107	| [16'3005]
1108	| range( 16'3031, 16'3035 )
1109	| range( 16'309D, 16'309E )
1110	| range( 16'30FC, 16'30FE )
1111	).
1112
1113range( Low, High ) -->
1114	[Char],
1115	{Char >= Low, Char =< High}.
1116