1(*  Title:      Pure/PIDE/xml.ML
2    Author:     David Aspinall
3    Author:     Stefan Berghofer
4    Author:     Makarius
5
6Untyped XML trees and representation of ML values.
7*)
8
9signature XML_DATA_OPS =
10sig
11  type 'a A
12  type 'a T
13  type 'a V
14  type 'a P
15  val int_atom: int A
16  val bool_atom: bool A
17  val unit_atom: unit A
18  val properties: Properties.T T
19  val string: string T
20  val int: int T
21  val bool: bool T
22  val unit: unit T
23  val pair: 'a T -> 'b T -> ('a * 'b) T
24  val triple: 'a T -> 'b T -> 'c T -> ('a * 'b * 'c) T
25  val list: 'a T -> 'a list T
26  val option: 'a T -> 'a option T
27  val variant: 'a V list -> 'a T
28end;
29
30signature XML =
31sig
32  type attributes = (string * string) list
33  datatype tree =
34      Elem of (string * attributes) * tree list
35    | Text of string
36  type body = tree list
37  val blob: string list -> body
38  val is_empty: tree -> bool
39  val is_empty_body: body -> bool
40  val xml_elemN: string
41  val xml_nameN: string
42  val xml_bodyN: string
43  val wrap_elem: ((string * attributes) * tree list) * tree list -> tree
44  val unwrap_elem: tree -> (((string * attributes) * tree list) * tree list) option
45  val add_content: tree -> Buffer.T -> Buffer.T
46  val content_of: body -> string
47  val trim_blanks: body -> body
48  val header: string
49  val text: string -> string
50  val element: string -> attributes -> string list -> string
51  val output_markup: Markup.T -> Markup.output
52  val string_of: tree -> string
53  val pretty: int -> tree -> Pretty.T
54  val parse_comments: string list -> unit * string list
55  val parse_string : string -> string option
56  val parse_element: string list -> tree * string list
57  val parse_document: string list -> tree * string list
58  val parse: string -> tree
59  exception XML_ATOM of string
60  exception XML_BODY of body
61  structure Encode:
62  sig
63    include XML_DATA_OPS
64    val tree: tree T
65  end
66  structure Decode:
67  sig
68    include XML_DATA_OPS
69    val tree: tree T
70  end
71end;
72
73structure XML: XML =
74struct
75
76(** XML trees **)
77
78open Output_Primitives.XML;
79
80val blob = map Text;
81
82fun is_empty (Text "") = true
83  | is_empty _ = false;
84
85val is_empty_body = forall is_empty;
86
87
88(* wrapped elements *)
89
90val xml_elemN = "xml_elem";
91val xml_nameN = "xml_name";
92val xml_bodyN = "xml_body";
93
94fun wrap_elem (((a, atts), body1), body2) =
95  Elem ((xml_elemN, (xml_nameN, a) :: atts), Elem ((xml_bodyN, []), body1) :: body2);
96
97fun unwrap_elem (Elem ((name, (n, a) :: atts), Elem ((name', atts'), body1) :: body2)) =
98      if name = xml_elemN andalso n = xml_nameN andalso name' = xml_bodyN andalso null atts'
99      then SOME (((a, atts), body1), body2) else NONE
100  | unwrap_elem _ = NONE;
101
102
103(* text content *)
104
105fun add_content tree =
106  (case unwrap_elem tree of
107    SOME (_, ts) => fold add_content ts
108  | NONE =>
109      (case tree of
110        Elem (_, ts) => fold add_content ts
111      | Text s => Buffer.add s));
112
113fun content_of body = Buffer.empty |> fold add_content body |> Buffer.content;
114
115
116(* trim blanks *)
117
118fun trim_blanks trees =
119  trees |> maps
120    (fn Elem (markup, body) => [Elem (markup, trim_blanks body)]
121      | Text s =>
122          let val s' = s |> raw_explode |> trim Symbol.is_blank |> implode;
123          in if s' = "" then [] else [Text s'] end);
124
125
126
127(** string representation **)
128
129val header = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
130
131
132(* escaped text *)
133
134fun decode "&lt;" = "<"
135  | decode "&gt;" = ">"
136  | decode "&amp;" = "&"
137  | decode "&apos;" = "'"
138  | decode "&quot;" = "\""
139  | decode c = c;
140
141fun encode "<" = "&lt;"
142  | encode ">" = "&gt;"
143  | encode "&" = "&amp;"
144  | encode "'" = "&apos;"
145  | encode "\"" = "&quot;"
146  | encode c = c;
147
148val text = translate_string encode;
149
150
151(* elements *)
152
153fun elem name atts =
154  space_implode " " (name :: map (fn (a, x) => a ^ "=\"" ^ text x ^ "\"") atts);
155
156fun element name atts body =
157  let val b = implode body in
158    if b = "" then enclose "<" "/>" (elem name atts)
159    else enclose "<" ">" (elem name atts) ^ b ^ enclose "</" ">" name
160  end;
161
162fun output_markup (markup as (name, atts)) =
163  if Markup.is_empty markup then Markup.no_output
164  else (enclose "<" ">" (elem name atts), enclose "</" ">" name);
165
166
167(* output *)
168
169fun buffer_of depth tree =
170  let
171    fun traverse _ (Elem ((name, atts), [])) =
172          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add "/>"
173      | traverse d (Elem ((name, atts), ts)) =
174          Buffer.add "<" #> Buffer.add (elem name atts) #> Buffer.add ">" #>
175          traverse_body d ts #>
176          Buffer.add "</" #> Buffer.add name #> Buffer.add ">"
177      | traverse _ (Text s) = Buffer.add (text s)
178    and traverse_body 0 _ = Buffer.add "..."
179      | traverse_body d ts = fold (traverse (d - 1)) ts;
180  in Buffer.empty |> traverse depth tree end;
181
182val string_of = Buffer.content o buffer_of ~1;
183
184fun pretty depth tree =
185  Pretty.str (Buffer.content (buffer_of (Int.max (0, depth)) tree));
186
187val _ = ML_system_pp (fn depth => fn _ => Pretty.to_polyml o pretty (FixedInt.toInt depth));
188
189
190
191(** XML parsing **)
192
193local
194
195fun err msg (xs, _) =
196  fn () => "XML parsing error: " ^ msg () ^ "\nfound: " ^ quote (Symbol.beginning 100 xs);
197
198fun ignored _ = [];
199
200fun name_start_char c = Symbol.is_ascii_letter c orelse c = ":" orelse c = "_";
201fun name_char c = name_start_char c orelse Symbol.is_ascii_digit c orelse c = "-" orelse c = ".";
202val parse_name = Scan.one name_start_char ::: Scan.many name_char;
203
204val blanks = Scan.many Symbol.is_blank;
205val special = $$ "&" ^^ (parse_name >> implode) ^^ $$ ";" >> decode;
206val regular = Scan.one Symbol.not_eof;
207fun regular_except x = Scan.one (fn c => Symbol.not_eof c andalso c <> x);
208
209val parse_chars = Scan.repeat1 (special || regular_except "<") >> implode;
210
211val parse_cdata =
212  Scan.this_string "<![CDATA[" |--
213  (Scan.repeat (Scan.unless (Scan.this_string "]]>") regular) >> implode) --|
214  Scan.this_string "]]>";
215
216val parse_att =
217  ((parse_name >> implode) --| (blanks -- $$ "=" -- blanks)) --
218  (($$ "\"" || $$ "'") :|-- (fn s =>
219    (Scan.repeat (special || regular_except s) >> implode) --| $$ s));
220
221val parse_comment =
222  Scan.this_string "<!--" --
223  Scan.repeat (Scan.unless (Scan.this_string "-->") regular) --
224  Scan.this_string "-->" >> ignored;
225
226val parse_processing_instruction =
227  Scan.this_string "<?" --
228  Scan.repeat (Scan.unless (Scan.this_string "?>") regular) --
229  Scan.this_string "?>" >> ignored;
230
231val parse_doctype =
232  Scan.this_string "<!DOCTYPE" --
233  Scan.repeat (Scan.unless ($$ ">") regular) --
234  $$ ">" >> ignored;
235
236val parse_misc =
237  Scan.one Symbol.is_blank >> ignored ||
238  parse_processing_instruction ||
239  parse_comment;
240
241val parse_optional_text =
242  Scan.optional (parse_chars >> (single o Text)) [];
243
244in
245
246val parse_comments =
247  blanks -- Scan.repeat (parse_comment -- blanks >> K ()) >> K ();
248
249val parse_string = Scan.read Symbol.stopper parse_chars o raw_explode;
250
251fun parse_content xs =
252  (parse_optional_text @@@
253    Scan.repeats
254      ((parse_element >> single ||
255        parse_cdata >> (single o Text) ||
256        parse_processing_instruction ||
257        parse_comment)
258      @@@ parse_optional_text)) xs
259
260and parse_element xs =
261  ($$ "<" |-- parse_name -- Scan.repeat (blanks |-- parse_att) --| blanks :--
262    (fn (name, _) =>
263      !! (err (fn () => "Expected > or />"))
264       ($$ "/" -- $$ ">" >> ignored ||
265        $$ ">" |-- parse_content --|
266          !! (err (fn () => "Expected </" ^ implode name ^ ">"))
267              ($$ "<" -- $$ "/" -- Scan.this name -- blanks -- $$ ">")))
268    >> (fn ((name, atts), body) => Elem ((implode name, atts), body))) xs;
269
270val parse_document =
271  (Scan.repeat parse_misc -- Scan.option parse_doctype -- Scan.repeat parse_misc)
272  |-- parse_element;
273
274fun parse s =
275  (case Scan.finite Symbol.stopper (Scan.error (!! (err (fn () => "Malformed element"))
276      (blanks |-- parse_document --| blanks))) (raw_explode s) of
277    (x, []) => x
278  | (_, ys) => error ("XML parsing error: unprocessed input\n" ^ Symbol.beginning 100 ys));
279
280end;
281
282
283
284(** XML as data representation language **)
285
286exception XML_ATOM of string;
287exception XML_BODY of tree list;
288
289
290structure Encode =
291struct
292
293type 'a A = 'a -> string;
294type 'a T = 'a -> body;
295type 'a V = 'a -> string list * body;
296type 'a P = 'a -> string list;
297
298
299(* atomic values *)
300
301fun int_atom i = Value.print_int i;
302
303fun bool_atom false = "0"
304  | bool_atom true = "1";
305
306fun unit_atom () = "";
307
308
309(* structural nodes *)
310
311fun node ts = Elem ((":", []), ts);
312
313fun vector xs = map_index (fn (i, x) => (int_atom i, x)) xs;
314
315fun tagged (tag, (xs, ts)) = Elem ((int_atom tag, vector xs), ts);
316
317
318(* representation of standard types *)
319
320fun tree (t: tree) = [t];
321
322fun properties props = [Elem ((":", props), [])];
323
324fun string "" = []
325  | string s = [Text s];
326
327val int = string o int_atom;
328
329val bool = string o bool_atom;
330
331val unit = string o unit_atom;
332
333fun pair f g (x, y) = [node (f x), node (g y)];
334
335fun triple f g h (x, y, z) = [node (f x), node (g y), node (h z)];
336
337fun list f xs = map (node o f) xs;
338
339fun option _ NONE = []
340  | option f (SOME x) = [node (f x)];
341
342fun variant fs x =
343  [tagged (the (get_index (fn f => SOME (f x) handle General.Match => NONE) fs))];
344
345end;
346
347
348structure Decode =
349struct
350
351type 'a A = string -> 'a;
352type 'a T = body -> 'a;
353type 'a V = string list * body -> 'a;
354type 'a P = string list -> 'a;
355
356
357(* atomic values *)
358
359fun int_atom s =
360  Value.parse_int s
361    handle Fail _ => raise XML_ATOM s;
362
363fun bool_atom "0" = false
364  | bool_atom "1" = true
365  | bool_atom s = raise XML_ATOM s;
366
367fun unit_atom "" = ()
368  | unit_atom s = raise XML_ATOM s;
369
370
371(* structural nodes *)
372
373fun node (Elem ((":", []), ts)) = ts
374  | node t = raise XML_BODY [t];
375
376fun vector atts =
377  map_index (fn (i, (a, x)) => if int_atom a = i then x else raise XML_ATOM a) atts;
378
379fun tagged (Elem ((name, atts), ts)) = (int_atom name, (vector atts, ts))
380  | tagged t = raise XML_BODY [t];
381
382
383(* representation of standard types *)
384
385fun tree [t] = t
386  | tree ts = raise XML_BODY ts;
387
388fun properties [Elem ((":", props), [])] = props
389  | properties ts = raise XML_BODY ts;
390
391fun string [] = ""
392  | string [Text s] = s
393  | string ts = raise XML_BODY ts;
394
395val int = int_atom o string;
396
397val bool = bool_atom o string;
398
399val unit = unit_atom o string;
400
401fun pair f g [t1, t2] = (f (node t1), g (node t2))
402  | pair _ _ ts = raise XML_BODY ts;
403
404fun triple f g h [t1, t2, t3] = (f (node t1), g (node t2), h (node t3))
405  | triple _ _ _ ts = raise XML_BODY ts;
406
407fun list f ts = map (f o node) ts;
408
409fun option _ [] = NONE
410  | option f [t] = SOME (f (node t))
411  | option _ ts = raise XML_BODY ts;
412
413fun variant fs [t] =
414      let
415        val (tag, (xs, ts)) = tagged t;
416        val f = nth fs tag handle General.Subscript => raise XML_BODY [t];
417      in f (xs, ts) end
418  | variant _ ts = raise XML_BODY ts;
419
420end;
421
422end;
423