1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  xmlparser.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004jul21
14*   created by: Andy Heninger
15*/
16
17#include <stdio.h>
18#include "unicode/uchar.h"
19#include "unicode/ucnv.h"
20#include "unicode/regex.h"
21#include "filestrm.h"
22#include "xmlparser.h"
23
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
25
// character constants
// Code points of the ASCII characters that the parser handles as UChar values.
enum {
    x_QUOT=0x22,    // quotation mark
    x_AMP=0x26,     // ampersand
    x_APOS=0x27,    // apostrophe
    x_LT=0x3c,      // less-than sign
    x_GT=0x3e,      // greater-than sign
    x_l=0x6c        // lowercase letter l; parseFile() searches for it to step past "<?xml"
};

// Regex set for one XML white space character: space, tab, CR or LF.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Regex set for a character that may start an XML name.
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
//  Regex set for a character that may appear after the first character of an XML name:
//  everything in XML_NAMESTARTCHAR plus hyphen, period, digits and a few extra ranges.
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
//  Regex for a complete XML name: one start character followed by zero or more name characters.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49
50U_NAMESPACE_BEGIN
51
// Instantiate the ICU RTTI boilerplate (per the macro name) for the two
// classes implemented in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54
55//
56//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
57//                             used for parsing.
58//
59UXMLParser::UXMLParser(UErrorCode &status) :
60      //  XML Declaration.  XML Production #23.
61      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
62      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
63      //            allow for a possible leading BOM.
64      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
65
66      //  XML Comment   production #15
67      //     example:  "<!-- whatever -->
68      //       note, does not detect an illegal "--" within comments
69      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
70
71      //  XML Spaces
72      //      production [3]
73      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
74
75      //  XML Doctype decl  production #28
76      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
77      //       or      "<!DOCTYPE foo [internal dtd]>
78      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
79      //           Some internal dtd subsets could confuse this simple-minded
80      //           attempt at skipping over them, specifically, occcurences
81      //           of closeing square brackets.  These could appear in comments,
82      //           or in parameter entity declarations, for example.
83      mXMLDoctype(UnicodeString(
84           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
85           ), 0, status),
86
87      //  XML PI     production #16
88      //     example   "<?target stuff?>
89      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
90
91      //  XML Element Start   Productions #40, #41
92      //          example   <foo att1='abc'  att2="d e f" >
93      //      capture #1:  the tag name
94      //
95      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
96          "(?:"
97                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
98                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
99          ")*"                                                             //   * for zero or more attributes.
100          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
101
102      //  XML Element End     production #42
103      //     example   </foo>
104      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
105
106      // XML Element Empty    production #44
107      //     example   <foo att1="abc"   att2="d e f" />
108      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
109          "(?:"
110                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
111                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
112          ")*"                                                             //   * for zero or more attributes.
113          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
114
115
116      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
117      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
118
119      // Attribute name = "value".  XML Productions 10, 40/41
120      //  Capture group 1 is name,
121      //                2 is the attribute value, including the quotes.
122      //
123      //   Note that attributes are scanned twice.  The first time is with
124      //        the regex for an entire element start.  There, the attributes
125      //        are checked syntactically, but not separted out one by one.
126      //        Here, we match a single attribute, and make its name and
127      //        attribute value available to the parser code.
128      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
129         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
130
131
132      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
133
134      // Match any of the new-line sequences in content.
135      //   All are changed to \u000a.
136      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
137
138      // & char references
139      //   We will figure out what we've got based on which capture group has content.
140      //   The last one is a catchall for unrecognized entity references..
141      //             1     2     3      4      5           6                    7          8
142      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
143                0, status),
144
145      fNames(status),
146      fElementStack(status),
147      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
148      {
149      }
150
151UXMLParser *
152UXMLParser::createParser(UErrorCode &errorCode) {
153    if (U_FAILURE(errorCode)) {
154        return NULL;
155    } else {
156        return new UXMLParser(errorCode);
157    }
158}
159
// Destructor.  Nothing to do explicitly; the members clean up after themselves.
UXMLParser::~UXMLParser() {}
161
162UXMLElement *
163UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
164    char bytes[4096], charsetBuffer[100];
165    FileStream *f;
166    const char *charset, *pb;
167    UnicodeString src;
168    UConverter *cnv;
169    UChar *buffer, *pu;
170    int32_t fileLength, bytesLength, length, capacity;
171    UBool flush;
172
173    if(U_FAILURE(errorCode)) {
174        return NULL;
175    }
176
177    f=T_FileStream_open(filename, "rb");
178    if(f==NULL) {
179        errorCode=U_FILE_ACCESS_ERROR;
180        return NULL;
181    }
182
183    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
184    if(bytesLength<(int32_t)sizeof(bytes)) {
185        // we have already read the entire file
186        fileLength=bytesLength;
187    } else {
188        // get the file length
189        fileLength=T_FileStream_size(f);
190    }
191
192    /*
193     * get the charset:
194     * 1. Unicode signature
195     * 2. treat as ISO-8859-1 and read XML encoding="charser"
196     * 3. default to UTF-8
197     */
198    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
199    if(U_SUCCESS(errorCode) && charset!=NULL) {
200        // open converter according to Unicode signature
201        cnv=ucnv_open(charset, &errorCode);
202    } else {
203        // read as Latin-1 and parse the XML declaration and encoding
204        cnv=ucnv_open("ISO-8859-1", &errorCode);
205        if(U_FAILURE(errorCode)) {
206            // unexpected error opening Latin-1 converter
207            goto exit;
208        }
209
210        buffer=src.getBuffer(bytesLength);
211        if(buffer==NULL) {
212            // unexpected failure to reserve some string capacity
213            errorCode=U_MEMORY_ALLOCATION_ERROR;
214            goto exit;
215        }
216        pb=bytes;
217        pu=buffer;
218        ucnv_toUnicode(
219            cnv,
220            &pu, buffer+src.getCapacity(),
221            &pb, bytes+bytesLength,
222            NULL, TRUE, &errorCode);
223        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
224        ucnv_close(cnv);
225        cnv=NULL;
226        if(U_FAILURE(errorCode)) {
227            // unexpected error in conversion from Latin-1
228            src.remove();
229            goto exit;
230        }
231
232        // parse XML declaration
233        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
234            int32_t declEnd=mXMLDecl.end(errorCode);
235            // go beyond <?xml
236            int32_t pos=src.indexOf((UChar)x_l)+1;
237
238            mAttrValue.reset(src);
239            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
240                UnicodeString attName  = mAttrValue.group(1, errorCode);
241                UnicodeString attValue = mAttrValue.group(2, errorCode);
242
243                // Trim the quotes from the att value.  These are left over from the original regex
244                //   that parsed the attribue, which couldn't conveniently strip them.
245                attValue.remove(0,1);                    // one char from the beginning
246                attValue.truncate(attValue.length()-1);  // and one from the end.
247
248                if(attName==UNICODE_STRING("encoding", 8)) {
249                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
250                    charset=charsetBuffer;
251                    break;
252                }
253                pos = mAttrValue.end(2, errorCode);
254            }
255
256            if(charset==NULL) {
257                // default to UTF-8
258                charset="UTF-8";
259            }
260            cnv=ucnv_open(charset, &errorCode);
261        }
262    }
263
264    if(U_FAILURE(errorCode)) {
265        // unable to open the converter
266        goto exit;
267    }
268
269    // convert the file contents
270    capacity=fileLength;        // estimated capacity
271    src.getBuffer(capacity);
272    src.releaseBuffer(0);       // zero length
273    flush=FALSE;
274    for(;;) {
275        // convert contents of bytes[bytesLength]
276        pb=bytes;
277        for(;;) {
278            length=src.length();
279            buffer=src.getBuffer(capacity);
280            if(buffer==NULL) {
281                // unexpected failure to reserve some string capacity
282                errorCode=U_MEMORY_ALLOCATION_ERROR;
283                goto exit;
284            }
285
286            pu=buffer+length;
287            ucnv_toUnicode(
288                cnv, &pu, buffer+src.getCapacity(),
289                &pb, bytes+bytesLength,
290                NULL, FALSE, &errorCode);
291            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
292            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
293                errorCode=U_ZERO_ERROR;
294                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
295            } else {
296                break;
297            }
298        }
299
300        if(U_FAILURE(errorCode)) {
301            break; // conversion error
302        }
303
304        if(flush) {
305            break; // completely converted the file
306        }
307
308        // read next block
309        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
310        if(bytesLength==0) {
311            // reached end of file, convert once more to flush the converter
312            flush=TRUE;
313        }
314    };
315
316exit:
317    ucnv_close(cnv);
318    T_FileStream_close(f);
319
320    if(U_SUCCESS(errorCode)) {
321        return parse(src, errorCode);
322    } else {
323        return NULL;
324    }
325}
326
327UXMLElement *
328UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
329    if(U_FAILURE(status)) {
330        return NULL;
331    }
332
333    UXMLElement   *root = NULL;
334    fPos = 0; // TODO use just a local pos variable and pass it into functions
335              // where necessary?
336
337    // set all matchers to work on the input string
338    mXMLDecl.reset(src);
339    mXMLComment.reset(src);
340    mXMLSP.reset(src);
341    mXMLDoctype.reset(src);
342    mXMLPI.reset(src);
343    mXMLElemStart.reset(src);
344    mXMLElemEnd.reset(src);
345    mXMLElemEmpty.reset(src);
346    mXMLCharData.reset(src);
347    mAttrValue.reset(src);
348    mAttrNormalizer.reset(src);
349    mNewLineNormalizer.reset(src);
350    mAmps.reset(src);
351
352    // Consume the XML Declaration, if present.
353    if (mXMLDecl.lookingAt(fPos, status)) {
354        fPos = mXMLDecl.end(status);
355    }
356
357    // Consume "misc" [XML production 27] appearing before DocType
358    parseMisc(status);
359
360    // Consume a DocType declaration, if present.
361    if (mXMLDoctype.lookingAt(fPos, status)) {
362        fPos = mXMLDoctype.end(status);
363    }
364
365    // Consume additional "misc" [XML production 27] appearing after the DocType
366    parseMisc(status);
367
368    // Get the root element
369    if (mXMLElemEmpty.lookingAt(fPos, status)) {
370        // Root is an empty element (no nested elements or content)
371        root = createElement(mXMLElemEmpty, status);
372        fPos = mXMLElemEmpty.end(status);
373    } else {
374        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
375            error("Root Element expected", status);
376            goto errorExit;
377        }
378        root = createElement(mXMLElemStart, status);
379        UXMLElement  *el = root;
380
381        //
382        // This is the loop that consumes the root element of the document,
383        //      including all nested content.   Nested elements are handled by
384        //      explicit pushes/pops of the element stack; there is no recursion
385        //      in the control flow of this code.
386        //      "el" always refers to the current element, the one to which content
387        //      is being added.  It is above the top of the element stack.
388        for (;;) {
389            // Nested Element Start
390            if (mXMLElemStart.lookingAt(fPos, status)) {
391                UXMLElement *t = createElement(mXMLElemStart, status);
392                el->fChildren.addElement(t, status);
393                t->fParent = el;
394                fElementStack.push(el, status);
395                el = t;
396                continue;
397            }
398
399            // Text Content.  String is concatenated onto the current node's content,
400            //                but only if it contains something other than spaces.
401            UnicodeString s = scanContent(status);
402            if (s.length() > 0) {
403                mXMLSP.reset(s);
404                if (mXMLSP.matches(status) == FALSE) {
405                    // This chunk of text contains something other than just
406                    //  white space. Make a child node for it.
407                    replaceCharRefs(s, status);
408                    el->fChildren.addElement(s.clone(), status);
409                }
410                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
411                continue;
412            }
413
414            // Comments.  Discard.
415            if (mXMLComment.lookingAt(fPos, status)) {
416                fPos = mXMLComment.end(status);
417                continue;
418            }
419
420            // PIs.  Discard.
421            if (mXMLPI.lookingAt(fPos, status)) {
422                fPos = mXMLPI.end(status);
423                continue;
424            }
425
426            // Element End
427            if (mXMLElemEnd.lookingAt(fPos, status)) {
428                fPos = mXMLElemEnd.end(0, status);
429                const UnicodeString name = mXMLElemEnd.group(1, status);
430                if (name != *el->fName) {
431                    error("Element start / end tag mismatch", status);
432                    goto errorExit;
433                }
434                if (fElementStack.empty()) {
435                    // Close of the root element.  We're done with the doc.
436                    el = NULL;
437                    break;
438                }
439                el = (UXMLElement *)fElementStack.pop();
440                continue;
441            }
442
443            // Empty Element.  Stored as a child of the current element, but not stacked.
444            if (mXMLElemEmpty.lookingAt(fPos, status)) {
445                UXMLElement *t = createElement(mXMLElemEmpty, status);
446                el->fChildren.addElement(t, status);
447                continue;
448            }
449
450            // Hit something within the document that doesn't match anything.
451            //   It's an error.
452            error("Unrecognized markup", status);
453            break;
454        }
455
456        if (el != NULL || !fElementStack.empty()) {
457            // We bailed out early, for some reason.
458            error("Root element not closed.", status);
459            goto errorExit;
460        }
461    }
462
463    // Root Element parse is complete.
464    // Consume the annoying xml "Misc" that can appear at the end of the doc.
465    parseMisc(status);
466
467    // We should have reached the end of the input
468    if (fPos != src.length()) {
469        error("Extra content at the end of the document", status);
470        goto errorExit;
471    }
472
473    // Success!
474    return root;
475
476errorExit:
477    delete root;
478    return NULL;
479}
480
481//
482//  createElement
483//      We've just matched an element start tag.  Create and fill in a UXMLElement object
484//      for it.
485//
486UXMLElement *
487UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
488    // First capture group is the element's name.
489    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
490
491    // Scan for attributes.
492    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
493
494    while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
495        UnicodeString attName  = mAttrValue.group(1, status);
496        UnicodeString attValue = mAttrValue.group(2, status);
497
498        // Trim the quotes from the att value.  These are left over from the original regex
499        //   that parsed the attribue, which couldn't conveniently strip them.
500        attValue.remove(0,1);                    // one char from the beginning
501        attValue.truncate(attValue.length()-1);  // and one from the end.
502
503        // XML Attribue value normalization.
504        // This is one of the really screwy parts of the XML spec.
505        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
506        // Note that non-validating parsers must treat all entities as type CDATA
507        //   which simplifies things some.
508
509        // Att normalization step 1:  normalize any newlines in the attribute value
510        mNewLineNormalizer.reset(attValue);
511        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
512
513        // Next change all xml white space chars to plain \u0020 spaces.
514        mAttrNormalizer.reset(attValue);
515        UnicodeString oneSpace((UChar)0x0020);
516        attValue = mAttrNormalizer.replaceAll(oneSpace, status);
517
518        // Replace character entities.
519        replaceCharRefs(attValue, status);
520
521        // Save the attribute name and value in our document structure.
522        el->fAttNames.addElement((void *)intern(attName, status), status);
523        el->fAttValues.addElement(attValue.clone(), status);
524        pos = mAttrValue.end(2, status);
525    }
526    fPos = mEl.end(0, status);
527    return el;
528}
529
530//
531//  parseMisc
532//     Consume XML "Misc" [production #27]
533//        which is any combination of space, PI and comments
534//      Need to watch end-of-input because xml MISC stuff is allowed after
535//        the document element, so we WILL scan off the end in this function
536//
537void
538UXMLParser::parseMisc(UErrorCode &status)  {
539    for (;;) {
540        if (fPos >= mXMLPI.input().length()) {
541            break;
542        }
543        if (mXMLPI.lookingAt(fPos, status)) {
544            fPos = mXMLPI.end(status);
545            continue;
546        }
547        if (mXMLSP.lookingAt(fPos, status)) {
548            fPos = mXMLSP.end(status);
549            continue;
550        }
551        if (mXMLComment.lookingAt(fPos, status)) {
552            fPos = mXMLComment.end(status);
553            continue;
554        }
555        break;
556    }
557}
558
559//
560//  Scan for document content.
561//
562UnicodeString
563UXMLParser::scanContent(UErrorCode &status) {
564    UnicodeString  result;
565    if (mXMLCharData.lookingAt(fPos, status)) {
566        result = mXMLCharData.group((int32_t)0, status);
567        // Normalize the new-lines.  (Before char ref substitution)
568        mNewLineNormalizer.reset(result);
569        result = mNewLineNormalizer.replaceAll(fOneLF, status);
570
571        // TODO:  handle CDATA
572        fPos = mXMLCharData.end(0, status);
573    }
574
575    return result;
576}
577
578//
579//   replaceCharRefs
580//
581//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
582//       with the corresponding actual character.
583//
584void
585UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
586    UnicodeString result;
587    UnicodeString replacement;
588    int     i;
589
590    mAmps.reset(s);
591    // See the initialization for the regex matcher mAmps.
592    //    Which entity we've matched is determined by which capture group has content,
593    //      which is flaged by start() of that group not being -1.
594    while (mAmps.find()) {
595        if (mAmps.start(1, status) != -1) {
596            replacement.setTo((UChar)x_AMP);
597        } else if (mAmps.start(2, status) != -1) {
598            replacement.setTo((UChar)x_LT);
599        } else if (mAmps.start(3, status) != -1) {
600            replacement.setTo((UChar)x_GT);
601        } else if (mAmps.start(4, status) != -1) {
602            replacement.setTo((UChar)x_APOS);
603        } else if (mAmps.start(5, status) != -1) {
604            replacement.setTo((UChar)x_QUOT);
605        } else if (mAmps.start(6, status) != -1) {
606            UnicodeString hexString = mAmps.group(6, status);
607            UChar32 val = 0;
608            for (i=0; i<hexString.length(); i++) {
609                val = (val << 4) + u_digit(hexString.charAt(i), 16);
610            }
611            // TODO:  some verification that the character is valid
612            replacement.setTo(val);
613        } else if (mAmps.start(7, status) != -1) {
614            UnicodeString decimalString = mAmps.group(7, status);
615            UChar32 val = 0;
616            for (i=0; i<decimalString.length(); i++) {
617                val = val*10 + u_digit(decimalString.charAt(i), 10);
618            }
619            // TODO:  some verification that the character is valid
620            replacement.setTo(val);
621        } else {
622            // An unrecognized &entity;  Leave it alone.
623            //  TODO:  check that it really looks like an entity, and is not some
624            //         random & in the text.
625            replacement = mAmps.group((int32_t)0, status);
626        }
627        mAmps.appendReplacement(result, replacement, status);
628    }
629    mAmps.appendTail(result);
630    s = result;
631}
632
633void
634UXMLParser::error(const char *message, UErrorCode &status) {
635    // TODO:  something better here...
636    const UnicodeString &src=mXMLDecl.input();
637    int  line = 0;
638    int  ci = 0;
639    while (ci < fPos && ci>=0) {
640        ci = src.indexOf((UChar)0x0a, ci+1);
641        line++;
642    }
643    fprintf(stderr, "Error: %s at line %d\n", message, line);
644    if (U_SUCCESS(status)) {
645        status = U_PARSE_ERROR;
646    }
647}
648
649// intern strings like in Java
650
651const UnicodeString *
652UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
653    const UHashElement *he=fNames.find(s);
654    if(he!=NULL) {
655        // already a known name, return its hashed key pointer
656        return (const UnicodeString *)he->key.pointer;
657    } else {
658        // add this new name and return its hashed key pointer
659        fNames.puti(s, 0, errorCode);
660        he=fNames.find(s);
661        return (const UnicodeString *)he->key.pointer;
662    }
663}
664
665const UnicodeString *
666UXMLParser::findName(const UnicodeString &s) const {
667    const UHashElement *he=fNames.find(s);
668    if(he!=NULL) {
669        // a known name, return its hashed key pointer
670        return (const UnicodeString *)he->key.pointer;
671    } else {
672        // unknown name
673        return NULL;
674    }
675}
676
677// UXMLElement ------------------------------------------------------------- ***
678
// Construct an element with no attributes, no children and no parent.
//   parser     the parser that creates this element; used later for name lookups
//   name       the tag name, stored as a pointer and not copied
//   errorCode  ICU in/out error code, passed on to the vector members
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}
688
689UXMLElement::~UXMLElement() {
690    int   i;
691    // attribute names are owned by the UXMLParser, don't delete them here
692    for (i=fAttValues.size()-1; i>=0; i--) {
693        delete (UObject *)fAttValues.elementAt(i);
694    }
695    for (i=fChildren.size()-1; i>=0; i--) {
696        delete (UObject *)fChildren.elementAt(i);
697    }
698}
699
// Return the element's tag name (a reference to the string fName points to).
const UnicodeString &
UXMLElement::getTagName() const {
    return *fName;
}
704
// Return the text content of this element as a new string.
//   recurse   passed on to appendText(); if TRUE, the text of nested
//             elements is included as well.
UnicodeString
UXMLElement::getText(UBool recurse) const {
    UnicodeString text;
    appendText(text, recurse);
    return text;
}
711
712void
713UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
714    const UObject *node;
715    int32_t i, count=fChildren.size();
716    for(i=0; i<count; ++i) {
717        node=(const UObject *)fChildren.elementAt(i);
718        const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
719        if(s!=NULL) {
720            text.append(*s);
721        } else if(recurse) /* must be a UXMLElement */ {
722            ((const UXMLElement *)node)->appendText(text, recurse);
723        }
724    }
725}
726
// Return the number of attributes stored for this element.
int32_t
UXMLElement::countAttributes() const {
    return fAttNames.size();
}
731
732const UnicodeString *
733UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
734    if(0<=i && i<fAttNames.size()) {
735        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
736        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
737        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
738    } else {
739        return NULL;
740    }
741}
742
743const UnicodeString *
744UXMLElement::getAttribute(const UnicodeString &name) const {
745    // search for the attribute name by comparing the interned pointer,
746    // not the string contents
747    const UnicodeString *p=fParser->findName(name);
748    if(p==NULL) {
749        return NULL; // no such attribute seen by the parser at all
750    }
751
752    int32_t i, count=fAttNames.size();
753    for(i=0; i<count; ++i) {
754        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
755            return (const UnicodeString *)fAttValues.elementAt(i);
756        }
757    }
758    return NULL;
759}
760
// Return the number of child nodes (strings and elements) of this element.
int32_t
UXMLElement::countChildren() const {
    return fChildren.size();
}
765
766const UObject *
767UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
768    if(0<=i && i<fChildren.size()) {
769        const UObject *node=(const UObject *)fChildren.elementAt(i);
770        if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
771            type=UXML_NODE_TYPE_ELEMENT;
772        } else {
773            type=UXML_NODE_TYPE_STRING;
774        }
775        return node;
776    } else {
777        return NULL;
778    }
779}
780
781const UXMLElement *
782UXMLElement::nextChildElement(int32_t &i) const {
783    if(i<0) {
784        return NULL;
785    }
786
787    const UObject *node;
788    int32_t count=fChildren.size();
789    while(i<count) {
790        node=(const UObject *)fChildren.elementAt(i++);
791        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
792        if(elem!=NULL) {
793            return elem;
794        }
795    }
796    return NULL;
797}
798
799const UXMLElement *
800UXMLElement::getChildElement(const UnicodeString &name) const {
801    // search for the element name by comparing the interned pointer,
802    // not the string contents
803    const UnicodeString *p=fParser->findName(name);
804    if(p==NULL) {
805        return NULL; // no such element seen by the parser at all
806    }
807
808    const UObject *node;
809    int32_t i, count=fChildren.size();
810    for(i=0; i<count; ++i) {
811        node=(const UObject *)fChildren.elementAt(i);
812        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
813        if(elem!=NULL) {
814            if(p==elem->fName) {
815                return elem;
816            }
817        }
818    }
819    return NULL;
820}
821
822U_NAMESPACE_END
823
824#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
825
826