/* xml.pl : Contains xml_parse/[2,3] a bi-directional XML parser written in
 * Prolog.
 *
 * Copyright (C) 2001-2005 Binding Time Limited
 * Copyright (C) 2005, 2006 John Fletcher
 *
 * Current Release: $Revision: 1.2 $
 *
 * TERMS AND CONDITIONS:
 *
 * This program is offered free of charge, as unsupported source code. You may
 * use it, copy it, distribute it, modify it or sell it without restriction,
 * but entirely at your own risk.
 *
 */
:- module( xml,
	[
	xml_parse/2,
	xml_parse/3,
	xml_subterm/2,
	xml_pp/1
	] ).

/* xml_parse( {+Controls}, +?Chars, ?+Document ) parses Chars to/from a data
 * structure of the form xml(<atts>, <content>). <atts> is a list of
 * <atom>=<string> attributes from the (possibly implicit) XML signature of the
 * document. <content> is a (possibly empty) list comprising occurrences of :
 *
 * pcdata(<string>)                    : Text
 * comment(<string>)                   : An xml comment;
 * element(<tag>,<atts>,<content>)     : <tag>..</tag> encloses <content>
 *                                     : <tag /> if empty
 * instructions(<atom>, <string>)      : Processing <? <atom> <params> ?>
 * cdata( <string> )                   : <![CDATA[ <string> ]]>
 * doctype(<atom>, <doctype id>)       : DTD <!DOCTYPE .. >
 *
 * The direction of the conversion is chosen by testing Chars: when Chars is
 * ground it is parsed into Document (in-bound); otherwise Document is used to
 * generate Chars (out-bound).
 *
 * The conversions are not completely symmetrical, in that weaker XML is
 * accepted than can be generated. Specifically, in-bound (Chars -> Document)
 * does not require strictly well-formed XML. Document is instantiated to the
 * term malformed(Attributes, Content) if Chars does not represent well-formed
 * XML. The Content of a malformed/2 structure can contain:
 *
 * unparsed( <string> )                : Text which has not been parsed
 * out_of_context( <tag> )             : <tag> is not closed
 *
 * in addition to the standard term types.
 *
 * Out-bound (Document -> Chars) parsing _does_ require that Document defines
 * strictly well-formed XML. If an error is detected an exception of the form
 * application_error(<format>, <arguments>) is thrown.
 *
 * The exception will attempt to identify the particular sub-term in
 * error and the message will show a list of its ancestor elements in the form
 * <tag>{(id)}* where <id> is the value of any attribute _named_ id.
 *
 * At this release, the Controls applying to in-bound (Chars -> Document)
 * parsing are:
 *
 * extended_characters(<bool>)         : Use the extended character
 *                                     : entities for XHTML (default true)
 *
 * format(<bool>)                      : Strip layouts when no character data
 *                                     : appears between elements.
 *                                     : (default true)
 *
 * remove_attribute_prefixes(<bool>)   : Remove namespace prefixes from
 *                                     : attributes when it's the same as the
 *                                     : prefix of the parent element
 *                                     : (default false).
 *
 * allow_ampersand(<bool>)             : Allow unescaped ampersand
 *                                     : characters (&) to occur in PCDATA.
 *                                     : (default false).
 *
 * [<bool> is one of 'true' or 'false']
 *
 * For out-bound (Document -> Chars) parsing, the only available option is:
 *
 * format(<Bool>)                      : Indent the element content
 *                                     : (default true)
 *
 * Different DCGs for input and output are used because input parsing is
 * more flexible than output parsing. Errors in input are recorded as part
 * of the data structure. Output parsing throws an exception if the document
 * is not well-formed, diagnosis tries to identify the specific culprit term.
 */
xml_parse( Chars, Document ) :-
	/* No Controls supplied: run with the empty Controls list. */
	xml_parse( [], Chars, Document ).

xml_parse( Controls, Chars, Document ) :-
	/* Ground Chars selects the in-bound direction; commit to it. */
	ground( Chars ),
	!,
	xml_to_document( Controls, Chars, Document ).
xml_parse( Controls, Chars, Document ) :-
	/* Chars is not ground, so generate it from Document. */
	document_to_xml( Controls, Document, Chars ).

/* document_to_xml( +Controls, +Document, ?Chars ) generates Chars from
 * Document. The output is formatted unless Controls contains format(false).
 * Generation is only attempted when Document is ground. If Document is not
 * ground, or generation fails, xml_fault/5 is asked for a diagnosis and an
 * application_error/2 exception carrying the message, the document, the
 * culprit sub-term and its path is thrown.
 */
document_to_xml( Controls, Document, Chars ) :-
	document_to_xml_format( Controls, Format ),
	(	ground( Document ),
		document_generation( Format, Document, Generated, [] )
	->	Chars = Generated
	;	xml_fault( Document, [], Culprit, Path, Message ),
		throw(
			application_error('XML Parse: ~s in ~q~nCulprit: ~q~nPath: ~s',
				[Message,Document,Culprit,Path] )
			)
	).

/* document_to_xml_format( +Controls, -Format ) : Format is false when
 * format(false) is a member of Controls and true in every other case.
 */
document_to_xml_format( Controls, Format ) :-
	member( format(false), Controls ),
	!,
	Format = false.
document_to_xml_format( _Controls, true ).

/* xml_subterm( +XMLTerm, ?Subterm ) unifies Subterm with a sub-term of
 * XMLTerm. Note that XMLTerm is a sub-term of itself. The search descends
 * through the content of xml/2, element/3 and namespace/3 terms and through
 * the members and tails of lists.
 */
xml_subterm( Subterm, Subterm ).
xml_subterm( xml(_Atts, Children), Subterm ) :-
	xml_subterm( Children, Subterm ).
xml_subterm( [Head|_Tail], Subterm ) :-
	xml_subterm( Head, Subterm ).
xml_subterm( [_Head|Tail], Subterm ) :-
	xml_subterm( Tail, Subterm ).
xml_subterm( element(_Tag, _Atts, Children), Subterm ) :-
	xml_subterm( Children, Subterm ).
xml_subterm( namespace(_URI, _Prefix, Children), Subterm ) :-
	xml_subterm( Children, Subterm ).

/* xml is intended to be a rather modular module: it should be easy to
 * build a program that can output XML, but not read it, or vice versa.
 * Similarly, you may be happy to dispense with diagnosis once you are
 * sure that your code will only try to make valid calls to xml_parse/2.
 *
 * It is intended that the code should be very portable too. Clearly,
 * some small changes will be needed between platforms, but these should
 * be limited to xml_utilities. xml_utilities contains most of the shared
 * code and most of the potentially non-portable code.
 */
:- ensure_loaded( xml_acquisition ).
:- ensure_loaded( xml_diagnosis ).
:- ensure_loaded( xml_generation ).
:- ensure_loaded( xml_pp ).
:- ensure_loaded( xml_utilities ).
