1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFXMLInputStream.c
25	Copyright (c) 1999-2013, Apple Inc. All rights reserved.
26	Responsibility: David Smith
27*/
28
29#include "CFXMLInputStream.h"
30#include <CoreFoundation/CFCharacterSet.h>
31#include <string.h>
32#include "CFStringEncodingConverter.h"
33#include "CFUniChar.h"
34
35/* Utility functions used in parsing */
36static Boolean determineEncoding(_CFXMLInputStream *stream) {
37    const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
38    UInt32 length = CFDataGetLength(stream->data);
39    const uint8_t *idx = 0L, *end = 0L;
40    const uint8_t *base = 0L;
41    char quote = ' ';
42    Boolean useUTF8 = false;
43
44    // Check for the byte order mark first
45    if (length > 2) {
46        // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
47        if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
48#if __BIG_ENDIAN__
49            stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
50#else
51            stream->flags |= ENCODING_IS_UNICODE_NATURAL;
52#endif
53            if (*bytes == 0xFF) {
54                stream->currentByte = bytes + 2;
55            }
56            stream->encoding = kCFStringEncodingUnicode;
57            return true;
58        } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
59#if __BIG_ENDIAN__
60            stream->flags |= ENCODING_IS_UNICODE_NATURAL;
61#else
62            stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
63#endif
64            if (*bytes == 0xFE) {
65                stream->currentByte = bytes + 2;
66            }
67            stream->encoding = kCFStringEncodingUnicode;
68            return true;
69        } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
70            if(*bytes == 0xEF) {
71                stream->currentByte = bytes + 3;
72            }
73            stream->encoding = kCFStringEncodingUTF8;
74            stream->flags |= ENCODING_MATCHES_ASCII;
75            return true;
76        }
77    }
78    // Scan for the <?xml.... ?> opening
79    if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
80        useUTF8 = true;
81    }
82    if (!useUTF8) {
83        idx = bytes + 5;
84        end = bytes + length;
85        // Found "<?xml"; now we scan for "encoding"
86        while (idx < end) {
87            uint8_t ch = *idx;
88            const uint8_t *scan;
89            if ( ch == '?' || ch == '>') {
90                useUTF8 = true;
91                break;
92            }
93            idx ++;
94            scan = idx;
95            if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
96                idx = scan;
97                break;
98            }
99        }
100        if (!useUTF8 && idx >= end) {
101            useUTF8 = true;
102        }
103    }
104    if (!useUTF8) {
105        // Found "encoding="; see if we've got an honest-to-goodness encoding name
106        quote = *idx;
107        if (quote != '\'' && quote != '\"') {
108            useUTF8 = true;
109        }
110    }
111    if (!useUTF8) {
112        base = idx + 1; // Move past the quote character
113        idx ++;
114        while (idx < end && *idx != quote) idx ++;
115        if (idx >= end) {
116            useUTF8 = true;
117        }
118    }
119    if (!useUTF8) {
120        UInt32 len = idx - base;
121        if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
122            useUTF8 = true;
123        } else {
124            CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
125            stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
126            CFRelease(encodingName);
127        }
128    }
129    if (useUTF8) {
130        stream->encoding = kCFStringEncodingUTF8;
131        stream->flags |= ENCODING_MATCHES_ASCII;
132        return true;
133    } else if (stream->encoding == kCFStringEncodingInvalidId) {
134        return false;
135    } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
136        stream->flags |= ENCODING_MATCHES_ASCII;
137    }
138    return true;
139}
140
141CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
142    CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
143    if (numChars) {
144        CFStringAppendCharacters(string, characters, numChars);
145    }
146}
147
148CF_PRIVATE Boolean _openInputStream(_CFXMLInputStream *stream) {
149    if (NULL == stream->data) {
150        return false;
151    } else {
152        stream->currentByte = CFDataGetBytePtr(stream->data);
153        if (determineEncoding(stream)) {
154            stream->flags |= STREAM_OPEN;
155            return true;
156        } else {
157            return false;
158        }
159    }
160}
161
162CF_PRIVATE void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
163    stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
164    stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
165    stream->encoding = kCFStringEncodingInvalidId;
166    stream->currentByte = NULL;
167
168    stream->allocator = (CFAllocatorRef)CFRetain(alloc);
169    stream->charBuffer = NULL;
170    stream->currentChar = NULL;
171    stream->mark = NULL;
172    stream->parserMark = NULL;
173    stream->bufferLength = 0;
174    stream->bufferCapacity = 0;
175
176    stream->charIndex = 1;
177    stream->lineNum = 1;
178
179    stream->flags = 0;
180    stream->nameSet = NULL;
181    stream->tempString = NULL;
182}
183
184
185CF_PRIVATE void _freeInputStream(_CFXMLInputStream *stream) {
186    if (stream->data) CFRelease(stream->data);
187    if (stream->url) CFRelease(stream->url);
188    if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
189    if (stream->nameSet) CFRelease(stream->nameSet);
190    if (stream->tempString) CFRelease(stream->tempString);
191    CFRelease(stream->allocator);
192}
193
194CF_PRIVATE CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
195    return stream->encoding;
196}
197
198CF_PRIVATE CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
199    return stream->charIndex;
200}
201
202CF_PRIVATE CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
203    return stream->lineNum;
204}
205
206CF_PRIVATE Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
207    if (!(stream->flags & STREAM_OPEN)) return false;
208    if (stream->currentChar) return false;
209    if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
210    return true;
211}
212
213CF_PRIVATE Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
214    return stream->flags & ENCODING_COMPOSITION_ERROR;
215}
216
217#define INITIAL_BUFFER_SIZE 64
218static void growCharacterBuffer(_CFXMLInputStream *stream) {
219    if (!stream->charBuffer) {
220        stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
221        stream->bufferCapacity = INITIAL_BUFFER_SIZE;
222    } else {
223        CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
224        CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
225        CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
226        UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
227        stream->bufferCapacity *= 2;
228        if (newBuffer != stream->charBuffer) {
229            stream->charBuffer = newBuffer;
230            if (currCharDelta != -1) {
231                stream->currentChar = newBuffer + currCharDelta;
232            }
233            if (markDelta != -1) {
234                stream->mark = newBuffer + markDelta;
235            }
236            if (parserMarkDelta != -1) {
237                stream->parserMark = newBuffer + parserMarkDelta;
238            }
239        }
240    }
241}
242
243static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
244    const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
245    if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
246        CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
247        if (charsToTranslate > maxLength) {
248            charsToTranslate = maxLength;
249        }
250        if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
251            memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
252            stream->currentByte += (charsToTranslate * sizeof(UniChar));
253        } else {
254            CFIndex i;
255            uint8_t *baseBytePtr = (uint8_t *)base;
256            for (i = 0; i < charsToTranslate; i ++) {
257                *(baseBytePtr + 1) = *stream->currentByte;
258                *baseBytePtr = *(stream->currentByte + 1);
259                baseBytePtr += 2;
260                stream->currentByte += 2;
261            }
262        }
263        return charsToTranslate;
264    } else {
265        CFIndex lengthConsumed = 0;
266        CFIndex usedByteLength, usedCharLength;
267        UInt32 conversionResult;
268        if (stream->flags & ENCODING_MATCHES_ASCII) {
269            while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
270                if (*stream->currentByte > 0x7f) break;
271                *base = *stream->currentByte;
272                base ++;
273                stream->currentByte ++;
274                lengthConsumed ++;
275            }
276            if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
277                return lengthConsumed;
278            }
279        }
280        conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
281        if(kCFStringEncodingConversionSuccess != conversionResult) {
282            switch(conversionResult) {
283                case kCFStringEncodingConverterUnavailable:
284                case kCFStringEncodingInvalidInputStream:
285                    stream->flags |= ENCODING_COMPOSITION_ERROR;
286                    break;
287                case kCFStringEncodingInsufficientOutputBufferLength:
288                default:
289                    break;
290            }
291        }
292        if (usedByteLength > 0) {
293            stream->currentByte += usedByteLength;
294            lengthConsumed += usedCharLength;
295        }
296        return lengthConsumed;
297    }
298}
299
300// returns number of characters filled
301CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
302    CFIndex numFilled;
303    if (stream->bufferLength >= stream->bufferCapacity) return 0;
304    // Try and fill in the remaining characters
305    numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
306    if (numFilled != 0) {
307        stream->currentChar = stream->charBuffer + stream->bufferLength;
308        stream->bufferLength += numFilled;
309    }
310    return numFilled;
311}
312
313// we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
314static void fillCharacterBuffer(_CFXMLInputStream *stream) {
315    if (!stream->charBuffer) {
316        growCharacterBuffer(stream);
317    }
318    if (!stream->mark && !stream->parserMark) {
319        // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
320        CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
321        stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
322        CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
323        stream->currentChar = stream->charBuffer;
324    } else {
325        // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
326        Boolean done;
327
328        // First try just filling the remaining capacity
329        done = (fillToCapacity(stream) != 0);
330        if (!done) {
331            const UniChar *leftMostMark;
332            if (stream->mark && !stream->parserMark) {
333                leftMostMark = stream->mark;
334            } else if (stream->parserMark && !stream->mark) {
335                leftMostMark = stream->parserMark;
336            } else if (stream->parserMark < stream->mark) {
337                leftMostMark = stream->parserMark;
338            } else {
339                leftMostMark = stream->mark;
340            }
341            if (leftMostMark > stream->charBuffer) {
342                CFIndex delta = leftMostMark - stream->charBuffer;
343                memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
344                stream->bufferLength -= delta;
345                if (stream->mark) {
346                    stream->mark -= delta;
347                }
348                if (stream->parserMark) {
349                    stream->parserMark -= delta;
350                }
351                // Now try to fill the newly-opened space
352                done = (fillToCapacity(stream) != 0);
353                delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
354            }
355        }
356        if (!done) {
357            // No help for it; now we must allocate
358            growCharacterBuffer(stream);
359            fillToCapacity(stream); // If this doesn't work, we give up.
360        }
361    }
362}
363
364/* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
365*/
366static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
367    if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
368        return false; // EOF
369    } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
370               (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
371                (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
372        // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
373        if (stream->flags & ENCODING_MATCHES_ASCII) {
374            *ch = (UniChar)*(stream->currentByte);
375            if (advanceStream) {
376                stream->currentByte ++;
377            }
378        } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
379            *ch = *(UniChar *)(stream->currentByte);
380            if (advanceStream) {
381                stream->currentByte += 2;
382            }
383        } else {
384            // Unicode with swapped bytes
385            *ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
386            if (advanceStream) {
387                stream->currentByte += 2;
388            }
389        }
390    } else {
391        fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
392        if (!stream->charBuffer || !stream->currentChar) {
393            return false;
394        } else {
395            *ch = *(stream->currentChar);
396            if (advanceStream) {
397                stream->currentChar ++;
398                if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
399                    stream->currentChar = NULL;
400                }
401            }
402        }
403    }
404    return true;
405}
406
407/* See comments above getCharacterGuts()
408*/
409CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
410    if (!(stream->flags & STREAM_OPEN)) {
411        return false;
412    } else if (stream->currentChar) {
413        *ch = *stream->currentChar;
414        if (advanceStream) {
415            stream->currentChar ++;
416            if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
417                stream->currentChar = NULL;
418            }
419        }
420    } else {
421        if (!getCharacterGuts(stream, ch, advanceStream)) return false;
422    }
423    if (advanceStream) {
424        UniChar nextChar;
425        stream->charIndex ++;
426        if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
427    }
428    return true;
429}
430
431CF_PRIVATE Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
432    return getCharacter(stream, ch, false);
433}
434
435CF_PRIVATE Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
436    return getCharacter(stream, ch, true);
437}
438
/* Pushes the most recently read character 'ch' back onto the stream so that the
   next read yields it again. Returns false when the stream is not open or there
   is nothing that could be backed up; on success charIndex is decremented and,
   if 'ch' ended a line, lineNum as well. */
CF_PRIVATE Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
    // Decide up front whether un-reading 'ch' undoes a line increment: LF always does; CR only when it is not followed by LF (mirrors the counting rule in getCharacter()).
    Boolean decrementLineNum = false;
    if (ch == '\n') {
        decrementLineNum = true;
    } else if (ch == '\r') {
        UniChar nextChar;
        if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
            decrementLineNum = true;
        }
    }

    if (!(stream->flags & STREAM_OPEN)) {
        return false;
    } else if (stream->currentChar) {
        if (stream->currentChar != stream->charBuffer) {
            // Common case: simply step the cursor back one character
            stream->currentChar --;
        } else {
            // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
            if (stream->bufferLength >= stream->bufferCapacity) {
                growCharacterBuffer(stream);
            }
            // Make room at the front; currentChar stays at charBuffer and therefore now addresses 'ch'
            memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
            *stream->charBuffer = ch;
            stream->bufferLength ++;
            // The marked characters moved up by one, so the marks follow them
            if (stream->mark) {
                stream->mark ++;
            }
            if (stream->parserMark) {
                stream->parserMark ++;
            }
        }
    } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
        // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
        stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
    } else if (stream->charBuffer) {
        // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
        *stream->charBuffer = ch;
        stream->currentChar = stream->charBuffer;
        stream->bufferLength = 1;
        // A mark can only still be set here when bufferLength was 0; it is moved past the re-inserted character
        if (stream->mark) {
            stream->mark ++;
        }
        if (stream->parserMark) {
            stream->parserMark ++;
        }
    } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
        // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
        if (stream->flags & ENCODING_MATCHES_ASCII) {
            stream->currentByte --;
        } else {  // Must be Unicode
            stream->currentByte -= 2;
        }
    } else {
        return false;
    }
    stream->charIndex --;
    if (decrementLineNum) {
        stream->lineNum --;
    }
    return true;
}
500
501// Returns the pointer to hold as the mark
502static UniChar *dropMark(_CFXMLInputStream *stream) {
503    if (stream->currentChar) {
504        return stream->currentChar;
505    } else if (stream->mark || stream->parserMark) {
506        return stream->charBuffer + stream->bufferLength;
507    } else {
508        if (!stream->charBuffer) {
509            growCharacterBuffer(stream);
510        }
511        stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
512        return stream->charBuffer;
513    }
514
515}
516
517CF_PRIVATE void _inputStreamSetMark(_CFXMLInputStream *stream) {
518    CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
519    stream->mark = dropMark(stream);
520}
521
522CF_PRIVATE void _inputStreamClearMark(_CFXMLInputStream *stream) {
523    CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
524    stream->mark = NULL;
525}
526
527CF_PRIVATE void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
528    UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
529    CFIndex numChars = end - stream->mark;
530    CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
531    _fillStringWithCharacters(string, stream->mark, numChars);
532}
533
/* Rewinds the read position to 'mark' (a pointer into charBuffer) and undoes
   the charIndex and lineNum advances made for the characters between 'mark'
   and the current position. Does nothing when the current position is not
   past 'mark'. */
static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
    // Current position: the pending character, or one past the last buffered character
    UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
    if (end > mark) {
        CFIndex numChars = end - mark;
        stream->charIndex -= numChars;
        stream->currentChar = mark;

        // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
        // NOTE(review): currentChar has already been reset to 'mark' above, so this peek yields the character at 'mark' rather than the one that followed the old position — confirm this is intended.
        if (*(end - 1) == '\r') {
            UniChar nextChar;
            if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
                end --;
            }
        }
        // Walk backwards over the returned characters, removing one line per CR, LF, or CR LF pair
        while (end != mark) {
            end --;
            if (*end == '\r') {
                stream->lineNum --;
            } else if (*end == '\n') {
                stream->lineNum --;
                if (end != mark && *(end - 1) == '\r') {
                    // Skip the CR of a CR LF pair so it is not counted twice
                    end --;
                }
            }
        }
    }
}
561
562CF_PRIVATE void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
563    CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
564    restoreToMark(stream, stream->mark);
565}
566
567CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
568    return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
569}
570
571CF_PRIVATE CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
572    UniChar ch;
573    CFIndex len = 0;
574    if (str) {
575        stream->parserMark = dropMark(stream);
576    }
577    while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
578        len ++;
579    }
580    if (!isWhitespaceChar(ch)) {
581        _inputStreamReturnCharacter(stream, ch);
582    }
583    if (str) {
584        _fillStringWithCharacters(str, stream->parserMark, len);
585        stream->parserMark = NULL;
586    }
587    return len;
588}
589
590// false return means EOF was encountered without finding scanChars
591CF_PRIVATE Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
592    Boolean done = false;
593    CFIndex firstRepeatIndex = -1;
594    CFIndex len = 0;
595    stream->parserMark = dropMark(stream);
596    do {
597        UniChar ch;
598        while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
599            len ++;
600        }
601        if (ch != scanChars[0]) {
602            restoreToMark(stream, stream->parserMark);
603            stream->parserMark = NULL;
604            return false;
605        } else {
606            CFIndex i;
607            for (i = 1; i < numChars; i ++) {
608                if (!_inputStreamGetCharacter(stream, &ch)) break;
609                if (ch != scanChars[i]) break;
610            }
611            if (i == numChars) {
612                done = true;
613            } else {
614                if (firstRepeatIndex == -1) {
615                    CFIndex j;
616                    for (j = 1; j < numChars; j ++) {
617                        if (scanChars[0] == scanChars[j]) {
618                            break;
619                        }
620                    }
621                    firstRepeatIndex = j;
622                }
623                _inputStreamReturnCharacter(stream, ch);
624                while (i > firstRepeatIndex) {
625                    i --;
626                    _inputStreamReturnCharacter(stream, scanChars[i]);
627                }
628                len += i;
629            }
630        }
631    } while (!done);
632    if (str) {
633        _fillStringWithCharacters(str, stream->parserMark, len);
634    }
635    stream->parserMark = NULL;
636    return true;
637}
638
639CF_PRIVATE Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
640    const UniChar *end = stringToMatch+length;
641    const UniChar *sPtr=stringToMatch;
642    stream->parserMark = dropMark(stream);
643    while (sPtr < end) {
644        UniChar ch;
645        if (!_inputStreamGetCharacter(stream, &ch)) break;
646        if (ch != *sPtr) break;
647        sPtr ++;
648    }
649    if (sPtr != end) {
650        restoreToMark(stream, stream->parserMark);
651        stream->parserMark = NULL;
652        return false;
653    } else {
654        stream->parserMark = NULL;
655        return true;
656    }
657}
658
659CF_PRIVATE Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
660    UniChar ch;
661    if (!_inputStreamPeekCharacter(stream, &ch)) return false;
662    if (ch != '\'' && ch != '\"')  return false;
663
664    _inputStreamGetCharacter(stream, &ch);
665    if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
666        return false;
667    }
668    return true;
669}
670
/*
 [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 [7]  Nmtoken ::= (NameChar)+
 [84] Letter ::= BaseChar | Ideographic

 We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 the productions in the XML spec are based on the Unicode character sets, the definitions
 differ slightly to avoid those areas where the Unicode standard is still being resolved.
 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 the vast majority of parsers out there.

 Letter == kCFUniCharLetterCharacterSet
 Digit == kCFUniCharDecimalDigitCharacterSet
 CombiningChar == kCFUniCharNonBaseCharacterSet
 Extender - complex, and not represented by a uniform character set.

 Scans a Name (or, when isNMToken is true, an Nmtoken) at the current position.
 On success the name's characters are consumed and true is returned; when 'str' is
 non-NULL, *str is set to a string owned by stream->nameSet (the caller receives no
 additional retain), and equal names share one string instance.
 On failure the stream is rewound to where the scan started and false is returned.
 Note that a name running all the way to the end of the input is reported as a failure.
 */
CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
    UniChar ch;
    Boolean success = true;
    stream->parserMark = dropMark(stream);
    if (!isNMToken) {
        // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
        if (!getCharacter(stream, &ch, false)) {
            success = false;
        } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
            success = false;
        } else {
            // First character is acceptable; consume it
            getCharacter(stream, &ch, true);
        }
    }
    if (success) {
        // Consume name characters; the first character that is not one is pushed back
        while (getCharacter(stream, &ch, true)) {
            if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
                _inputStreamReturnCharacter(stream, ch);
                break;
            }
        }
        // currentChar is NULL when the loop ended because the input ran out rather than on a pushed-back character
        if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
            success = false; // Must have processed at least one character
        }
    }
    if (success) {
        if (str) {
            if (!stream->nameSet) {
                // Lazily create the set of unique names and the reusable no-copy lookup string
                stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
                stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
            }
            // Point tempString directly at the scanned characters in charBuffer so the lookup needs no copy
            CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
            if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
                *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
                CFSetAddValue(stream->nameSet, *str);
                CFRelease(*str); // the set's retain keeps the copy alive
            }
        }
    } else {
        restoreToMark(stream, stream->parserMark);
    }
    stream->parserMark = NULL;
    return success;
}
732
733
734