/* xml.pl : Contains xml_parse/[2,3], a bi-directional XML parser written in
 * Prolog.
 *
 * Copyright (C) 2001-2005 Binding Time Limited
 * Copyright (C) 2005, 2006 John Fletcher
 *
 * Current Release: $Revision: 1.2 $
 *
 * TERMS AND CONDITIONS:
 *
 * This program is offered free of charge, as unsupported source code. You may
 * use it, copy it, distribute it, modify it or sell it without restriction,
 * but entirely at your own risk.
 *
 */
/* Module declaration. The export list is the module's public interface.
 * NOTE(review): xml_pp/1 is exported but not defined in this file;
 * presumably it comes from xml_pp, loaded at the end of the file - confirm.
 */
:- module( xml,
	[
	xml_parse/2,
	xml_parse/3,
	xml_subterm/2,
	xml_pp/1
	] ).

/* xml_parse( {+Controls}, +?Chars, ?+Document ) parses Chars to/from a data
 * structure of the form xml(<atts>, <content>). <atts> is a list of
 * <atom>=<string> attributes from the (possibly implicit) XML signature of the
 * document. <content> is a (possibly empty) list comprising occurrences of :
 *
 *    pcdata(<string>)                    :  Text
 *    comment(<string>)                   :  An xml comment
 *    element(<tag>,<atts>,<content>)     :  <tag>..</tag> encloses <content>
 *                                        :  <tag /> if empty
 *    instructions(<atom>, <string>)      :  Processing <? <atom> <params> ?>
 *    cdata( <string> )                   :  <![CDATA[ <string> ]]>
 *    doctype(<atom>, <doctype id>)       :  DTD <!DOCTYPE .. >
 *
 * The conversions are not completely symmetrical, in that weaker XML is
 * accepted than can be generated. Specifically, in-bound (Chars -> Document)
 * does not require strictly well-formed XML. Document is instantiated to the
 * term malformed(Attributes, Content) if Chars does not represent well-formed
 * XML. The Content of a malformed/2 structure can contain:
 *
 *    unparsed( <string> )                :  Text which has not been parsed
 *    out_of_context( <tag> )             :  <tag> is not closed
 *
 * in addition to the standard term types.
 *
 * Out-bound (Document -> Chars) parsing _does_ require that Document defines
 * strictly well-formed XML. If an error is detected an exception of the form
 * application_error(<format>, <arguments>) is raised.
 *
 * The exception will attempt to identify the particular sub-term in
 * error and the message will show a list of its ancestor elements in the form
 * <tag>{(id)}* where <id> is the value of any attribute _named_ id.
 *
 * At this release, the Controls applying to in-bound (Chars -> Document)
 * parsing are:
 *
 *    extended_characters(<bool>)         :  Use the extended character
 *                                        :  entities for XHTML (default true)
 *
 *    format(<bool>)                      :  Strip layouts when no character data
 *                                        :  appears between elements.
 *                                        :  (default true)
 *
 *    remove_attribute_prefixes(<bool>)   :  Remove namespace prefixes from
 *                                        :  attributes when it's the same as the
 *                                        :  prefix of the parent element
 *                                        :  (default false).
 *
 *    allow_ampersand(<bool>)             :  Allow unescaped ampersand
 *                                        :  characters (&) to occur in PCDATA.
 *                                        :  (default false).
 *
 *    [<bool> is one of 'true' or 'false']
 *
 * For out-bound (Document -> Chars) parsing, the only available option is:
 *
 *    format(<bool>)                      :  Indent the element content
 *                                        :  (default true)
 *
 * Different DCGs for input and output are used because input parsing is
 * more flexible than output parsing. Errors in input are recorded as part
 * of the data structure. Output parsing throws an exception if the document
 * is not well-formed; diagnosis tries to identify the specific culprit term.
 */
/* xml_parse( +?Chars, ?+Document ) is xml_parse/3 called with an empty
 * list of Controls.
 */
xml_parse( Chars, Document ) :-
	xml_parse( [], Chars, Document ).

/* xml_parse( +Controls, +?Chars, ?+Document ) chooses the direction of the
 * conversion from the instantiation of Chars:
 *
 *    Chars ground      :  in-bound, xml_to_document/3 builds Document
 *    Chars not ground  :  out-bound, document_to_xml/3 builds Chars
 *
 * The else-branch is written as a plain alternative rather than
 * "; otherwise ->", because otherwise/0 is not an ISO built-in.
 */
xml_parse( Controls, Chars, Document ) :-
	( ground( Chars ) ->
		xml_to_document( Controls, Chars, Document )
	;
		document_to_xml( Controls, Document, Chars )
	).

/* document_to_xml( +Controls, +Document, ?Chars ) generates Chars from
 * Document (out-bound direction).
 *
 * Format is false if format(false) is a member of Controls, and true
 * otherwise; it is passed to document_generation//2.
 *
 * If Document is not ground, or document_generation//2 fails on it,
 * xml_fault/5 is asked for a Culprit, Path and Message and the term
 * application_error( <format>, [Message,Document,Culprit,Path] ) is thrown.
 *
 * Chars is unified only after generation has succeeded (via Chars0), so a
 * partially instantiated Chars cannot steer the generator.
 *
 * The else-branches are plain alternatives rather than "; otherwise ->",
 * because otherwise/0 is not an ISO built-in.
 */
document_to_xml( Controls, Document, Chars ) :-
	( member( format(false), Controls ) ->
		Format = false
	;
		Format = true
	),
	( ground( Document ),
	  document_generation(Format, Document, Chars0, [] ) ->
		Chars = Chars0
	;
		xml_fault( Document, [], Culprit, Path, Message ),
		throw(
			application_error('XML Parse: ~s in ~q~nCulprit: ~q~nPath: ~s',
				[Message,Document,Culprit,Path] )
			)
	).

/* xml_subterm( +XMLTerm, ?Subterm ) unifies Subterm with a sub-term of
 * XMLTerm. Note that XMLTerm is a sub-term of itself.
 */
/* Solutions are enumerated on backtracking, in this order: the term itself,
 * then (depending on its shape) the sub-terms of its content.
 *
 *    xml(_, Content)          :  sub-terms of Content
 *    [H|T]                    :  sub-terms of H, then sub-terms of T
 *    element(_, _, Content)   :  sub-terms of Content
 *    namespace(_, _, Content) :  sub-terms of Content
 *
 * Because a list tail is itself visited, every suffix of a content list
 * (including []) is also returned as a solution.
 */
xml_subterm( Term, Term ).
xml_subterm( xml(_Attributes, Content), Term ) :-
	xml_subterm( Content, Term ).
xml_subterm( [H|T], Term ) :-
	( xml_subterm( H, Term )
	; xml_subterm( T, Term )
	).
xml_subterm( element(_Name,_Attributes,Content), Term ) :-
	xml_subterm( Content, Term ).
xml_subterm( namespace(_URI,_Prefix,Content), Term ) :-
	xml_subterm( Content, Term ).

/* xml is intended to be a rather modular module: it should be easy to
 * build a program that can output XML, but not read it, or vice versa.
 * Similarly, you may be happy to dispense with diagnosis once you are
 * sure that your code will only try to make valid calls to xml_parse/[2,3].
 *
 * It is intended that the code should be very portable too. Clearly,
 * some small changes will be needed between platforms, but these should
 * be limited to xml_utilities. xml_utilities contains most of the shared
 * code and most of the potentially non-portable code.
 */
:- ensure_loaded( xml_acquisition ).
:- ensure_loaded( xml_diagnosis ).
:- ensure_loaded( xml_generation ).
:- ensure_loaded( xml_pp ).
:- ensure_loaded( xml_utilities ).
