1/*
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12   Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
13   Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
14   Copyright (c) 2016-2023 Sebastian Pipping <sebastian@pipping.org>
15   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
16   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
17   Copyright (c) 2020      Joe Orton <jorton@redhat.com>
   Copyright (c) 2020      Kleber Tarcísio <klebertarcisio@yahoo.com.br>
19   Copyright (c) 2021      Tim Bray <tbray@textuality.com>
20   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
21   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
22   Licensed under the MIT license:
23
24   Permission is  hereby granted,  free of charge,  to any  person obtaining
25   a  copy  of  this  software   and  associated  documentation  files  (the
26   "Software"),  to  deal in  the  Software  without restriction,  including
27   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
28   distribute, sublicense, and/or sell copies of the Software, and to permit
29   persons  to whom  the Software  is  furnished to  do so,  subject to  the
30   following conditions:
31
32   The above copyright  notice and this permission notice  shall be included
33   in all copies or substantial portions of the Software.
34
35   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
36   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
37   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
38   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
39   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
40   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
41   USE OR OTHER DEALINGS IN THE SOFTWARE.
42*/
43
44#include "expat_config.h"
45
46#include <assert.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <stddef.h>
50#include <string.h>
51#include <math.h> /* for isnan */
52#include <errno.h>
53
54#include "expat.h"
55#include "codepage.h"
56#include "internal.h" /* for UNUSED_P only */
57#include "xmlfile.h"
58#include "xmltchar.h"
59
60#ifdef _MSC_VER
61#  include <crtdbg.h>
62#endif
63
64#ifdef XML_UNICODE
65#  include <wchar.h>
66#endif
67
/* Process exit statuses of xmlwf.  The meanings below mirror the
   "exit status" section of the help text printed by usage(). */
enum ExitCode {
  XMLWF_EXIT_SUCCESS = 0,        /* input well-formed, output written */
  XMLWF_EXIT_INTERNAL_ERROR = 1, /* could not allocate data structures */
  XMLWF_EXIT_NOT_WELLFORMED = 2, /* one or more inputs not well-formed */
  XMLWF_EXIT_OUTPUT_ERROR = 3,   /* could not create an output file */
  XMLWF_EXIT_USAGE_ERROR = 4,    /* command-line argument error */
};
75
76/* Structures for handler user data */
/* Singly linked list node describing one NOTATION declaration.  The three
   strings are heap copies owned by the node (see notationDecl and
   freeNotations); systemId and publicId may be NULL. */
typedef struct NotationList {
  struct NotationList *next;    /* next node, or NULL at the end of the list */
  const XML_Char *notationName; /* never NULL for a node stored in the list */
  const XML_Char *systemId;     /* NULL when no system identifier was given */
  const XML_Char *publicId;     /* NULL when no public identifier was given */
} NotationList;
83
/* State shared by the output handlers in this file. */
typedef struct xmlwfUserData {
  FILE *fp;                           /* stream the handlers write to */
  NotationList *notationListHead;     /* notations collected so far, or NULL */
  const XML_Char *currentDoctypeName; /* heap copy of the DOCTYPE name, or
                                         NULL when none is stored */
} XmlwfUserData;
89
/* Character that separates a namespace URI from the local name in the
   names handled by the *NS handlers below (they look it up with tcsrchr).
   It has the lowest non-NUL character value, which ensures proper sorting
   when such names are compared with tcscmp. */

#define NSSEP T('\001')
93
94static void XMLCALL
95characterData(void *userData, const XML_Char *s, int len) {
96  FILE *fp = ((XmlwfUserData *)userData)->fp;
97  for (; len > 0; --len, ++s) {
98    switch (*s) {
99    case T('&'):
100      fputts(T("&amp;"), fp);
101      break;
102    case T('<'):
103      fputts(T("&lt;"), fp);
104      break;
105    case T('>'):
106      fputts(T("&gt;"), fp);
107      break;
108#ifdef W3C14N
109    case 13:
110      fputts(T("&#xD;"), fp);
111      break;
112#else
113    case T('"'):
114      fputts(T("&quot;"), fp);
115      break;
116    case 9:
117    case 10:
118    case 13:
119      ftprintf(fp, T("&#%d;"), *s);
120      break;
121#endif
122    default:
123      puttc(*s, fp);
124      break;
125    }
126  }
127}
128
129static void
130attributeValue(FILE *fp, const XML_Char *s) {
131  puttc(T('='), fp);
132  puttc(T('"'), fp);
133  assert(s);
134  for (;;) {
135    switch (*s) {
136    case 0:
137    case NSSEP:
138      puttc(T('"'), fp);
139      return;
140    case T('&'):
141      fputts(T("&amp;"), fp);
142      break;
143    case T('<'):
144      fputts(T("&lt;"), fp);
145      break;
146    case T('"'):
147      fputts(T("&quot;"), fp);
148      break;
149#ifdef W3C14N
150    case 9:
151      fputts(T("&#x9;"), fp);
152      break;
153    case 10:
154      fputts(T("&#xA;"), fp);
155      break;
156    case 13:
157      fputts(T("&#xD;"), fp);
158      break;
159#else
160    case T('>'):
161      fputts(T("&gt;"), fp);
162      break;
163    case 9:
164    case 10:
165    case 13:
166      ftprintf(fp, T("&#%d;"), *s);
167      break;
168#endif
169    default:
170      puttc(*s, fp);
171      break;
172    }
173    s++;
174  }
175}
176
177/* Lexicographically comparing UTF-8 encoded attribute values,
178is equivalent to lexicographically comparing based on the character number. */
179
180static int
181attcmp(const void *att1, const void *att2) {
182  return tcscmp(*(const XML_Char *const *)att1, *(const XML_Char *const *)att2);
183}
184
185static void XMLCALL
186startElement(void *userData, const XML_Char *name, const XML_Char **atts) {
187  int nAtts;
188  const XML_Char **p;
189  FILE *fp = ((XmlwfUserData *)userData)->fp;
190  puttc(T('<'), fp);
191  fputts(name, fp);
192
193  p = atts;
194  while (*p)
195    ++p;
196  nAtts = (int)((p - atts) >> 1);
197  if (nAtts > 1)
198    qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp);
199  while (*atts) {
200    puttc(T(' '), fp);
201    fputts(*atts++, fp);
202    attributeValue(fp, *atts);
203    atts++;
204  }
205  puttc(T('>'), fp);
206}
207
208static void XMLCALL
209endElement(void *userData, const XML_Char *name) {
210  FILE *fp = ((XmlwfUserData *)userData)->fp;
211  puttc(T('<'), fp);
212  puttc(T('/'), fp);
213  fputts(name, fp);
214  puttc(T('>'), fp);
215}
216
217static int
218nsattcmp(const void *p1, const void *p2) {
219  const XML_Char *att1 = *(const XML_Char *const *)p1;
220  const XML_Char *att2 = *(const XML_Char *const *)p2;
221  int sep1 = (tcsrchr(att1, NSSEP) != 0);
222  int sep2 = (tcsrchr(att2, NSSEP) != 0);
223  if (sep1 != sep2)
224    return sep1 - sep2;
225  return tcscmp(att1, att2);
226}
227
228static void XMLCALL
229startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) {
230  int nAtts;
231  int nsi;
232  const XML_Char **p;
233  FILE *fp = ((XmlwfUserData *)userData)->fp;
234  const XML_Char *sep;
235  puttc(T('<'), fp);
236
237  sep = tcsrchr(name, NSSEP);
238  if (sep) {
239    fputts(T("n1:"), fp);
240    fputts(sep + 1, fp);
241    fputts(T(" xmlns:n1"), fp);
242    attributeValue(fp, name);
243    nsi = 2;
244  } else {
245    fputts(name, fp);
246    nsi = 1;
247  }
248
249  p = atts;
250  while (*p)
251    ++p;
252  nAtts = (int)((p - atts) >> 1);
253  if (nAtts > 1)
254    qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp);
255  while (*atts) {
256    name = *atts++;
257    sep = tcsrchr(name, NSSEP);
258    puttc(T(' '), fp);
259    if (sep) {
260      ftprintf(fp, T("n%d:"), nsi);
261      fputts(sep + 1, fp);
262    } else
263      fputts(name, fp);
264    attributeValue(fp, *atts);
265    if (sep) {
266      ftprintf(fp, T(" xmlns:n%d"), nsi++);
267      attributeValue(fp, name);
268    }
269    atts++;
270  }
271  puttc(T('>'), fp);
272}
273
274static void XMLCALL
275endElementNS(void *userData, const XML_Char *name) {
276  FILE *fp = ((XmlwfUserData *)userData)->fp;
277  const XML_Char *sep;
278  puttc(T('<'), fp);
279  puttc(T('/'), fp);
280  sep = tcsrchr(name, NSSEP);
281  if (sep) {
282    fputts(T("n1:"), fp);
283    fputts(sep + 1, fp);
284  } else
285    fputts(name, fp);
286  puttc(T('>'), fp);
287}
288
289#ifndef W3C14N
290
291static void XMLCALL
292processingInstruction(void *userData, const XML_Char *target,
293                      const XML_Char *data) {
294  FILE *fp = ((XmlwfUserData *)userData)->fp;
295  puttc(T('<'), fp);
296  puttc(T('?'), fp);
297  fputts(target, fp);
298  puttc(T(' '), fp);
299  fputts(data, fp);
300  puttc(T('?'), fp);
301  puttc(T('>'), fp);
302}
303
304static XML_Char *
305xcsdup(const XML_Char *s) {
306  XML_Char *result;
307  int count = 0;
308  int numBytes;
309
310  /* Get the length of the string, including terminator */
311  while (s[count++] != 0) {
312    /* Do nothing */
313  }
314  numBytes = count * sizeof(XML_Char);
315  result = malloc(numBytes);
316  if (result == NULL)
317    return NULL;
318  memcpy(result, s, numBytes);
319  return result;
320}
321
322static void XMLCALL
323startDoctypeDecl(void *userData, const XML_Char *doctypeName,
324                 const XML_Char *sysid, const XML_Char *publid,
325                 int has_internal_subset) {
326  XmlwfUserData *data = (XmlwfUserData *)userData;
327  UNUSED_P(sysid);
328  UNUSED_P(publid);
329  UNUSED_P(has_internal_subset);
330  data->currentDoctypeName = xcsdup(doctypeName);
331}
332
333static void
334freeNotations(XmlwfUserData *data) {
335  NotationList *notationListHead = data->notationListHead;
336
337  while (notationListHead != NULL) {
338    NotationList *next = notationListHead->next;
339    free((void *)notationListHead->notationName);
340    free((void *)notationListHead->systemId);
341    free((void *)notationListHead->publicId);
342    free(notationListHead);
343    notationListHead = next;
344  }
345  data->notationListHead = NULL;
346}
347
348static void
349cleanupUserData(XmlwfUserData *userData) {
350  free((void *)userData->currentDoctypeName);
351  userData->currentDoctypeName = NULL;
352  freeNotations(userData);
353}
354
355static int
356xcscmp(const XML_Char *xs, const XML_Char *xt) {
357  while (*xs != 0 && *xt != 0) {
358    if (*xs < *xt)
359      return -1;
360    if (*xs > *xt)
361      return 1;
362    xs++;
363    xt++;
364  }
365  if (*xs < *xt)
366    return -1;
367  if (*xs > *xt)
368    return 1;
369  return 0;
370}
371
372static int
373notationCmp(const void *a, const void *b) {
374  const NotationList *const n1 = *(const NotationList *const *)a;
375  const NotationList *const n2 = *(const NotationList *const *)b;
376
377  return xcscmp(n1->notationName, n2->notationName);
378}
379
380static void XMLCALL
381endDoctypeDecl(void *userData) {
382  XmlwfUserData *data = (XmlwfUserData *)userData;
383  NotationList **notations;
384  int notationCount = 0;
385  NotationList *p;
386  int i;
387
388  /* How many notations do we have? */
389  for (p = data->notationListHead; p != NULL; p = p->next)
390    notationCount++;
391  if (notationCount == 0) {
392    /* Nothing to report */
393    free((void *)data->currentDoctypeName);
394    data->currentDoctypeName = NULL;
395    return;
396  }
397
398  notations = malloc(notationCount * sizeof(NotationList *));
399  if (notations == NULL) {
400    fprintf(stderr, "Unable to sort notations");
401    freeNotations(data);
402    return;
403  }
404
405  for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) {
406    notations[i] = p;
407  }
408  qsort(notations, notationCount, sizeof(NotationList *), notationCmp);
409
410  /* Output the DOCTYPE header */
411  fputts(T("<!DOCTYPE "), data->fp);
412  fputts(data->currentDoctypeName, data->fp);
413  fputts(T(" [\n"), data->fp);
414
415  /* Now the NOTATIONs */
416  for (i = 0; i < notationCount; i++) {
417    fputts(T("<!NOTATION "), data->fp);
418    fputts(notations[i]->notationName, data->fp);
419    if (notations[i]->publicId != NULL) {
420      fputts(T(" PUBLIC '"), data->fp);
421      fputts(notations[i]->publicId, data->fp);
422      puttc(T('\''), data->fp);
423      if (notations[i]->systemId != NULL) {
424        puttc(T(' '), data->fp);
425        puttc(T('\''), data->fp);
426        fputts(notations[i]->systemId, data->fp);
427        puttc(T('\''), data->fp);
428      }
429    } else if (notations[i]->systemId != NULL) {
430      fputts(T(" SYSTEM '"), data->fp);
431      fputts(notations[i]->systemId, data->fp);
432      puttc(T('\''), data->fp);
433    }
434    puttc(T('>'), data->fp);
435    puttc(T('\n'), data->fp);
436  }
437
438  /* Finally end the DOCTYPE */
439  fputts(T("]>\n"), data->fp);
440
441  free(notations);
442  freeNotations(data);
443  free((void *)data->currentDoctypeName);
444  data->currentDoctypeName = NULL;
445}
446
447static void XMLCALL
448notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base,
449             const XML_Char *systemId, const XML_Char *publicId) {
450  XmlwfUserData *data = (XmlwfUserData *)userData;
451  NotationList *entry = malloc(sizeof(NotationList));
452  const char *errorMessage = "Unable to store NOTATION for output\n";
453
454  UNUSED_P(base);
455  if (entry == NULL) {
456    fputs(errorMessage, stderr);
457    return; /* Nothing we can really do about this */
458  }
459  entry->notationName = xcsdup(notationName);
460  if (entry->notationName == NULL) {
461    fputs(errorMessage, stderr);
462    free(entry);
463    return;
464  }
465  if (systemId != NULL) {
466    entry->systemId = xcsdup(systemId);
467    if (entry->systemId == NULL) {
468      fputs(errorMessage, stderr);
469      free((void *)entry->notationName);
470      free(entry);
471      return;
472    }
473  } else {
474    entry->systemId = NULL;
475  }
476  if (publicId != NULL) {
477    entry->publicId = xcsdup(publicId);
478    if (entry->publicId == NULL) {
479      fputs(errorMessage, stderr);
480      free((void *)entry->systemId); /* Safe if it's NULL */
481      free((void *)entry->notationName);
482      free(entry);
483      return;
484    }
485  } else {
486    entry->publicId = NULL;
487  }
488
489  entry->next = data->notationListHead;
490  data->notationListHead = entry;
491}
492
493#endif /* not W3C14N */
494
495static void XMLCALL
496defaultCharacterData(void *userData, const XML_Char *s, int len) {
497  UNUSED_P(s);
498  UNUSED_P(len);
499  XML_DefaultCurrent((XML_Parser)userData);
500}
501
502static void XMLCALL
503defaultStartElement(void *userData, const XML_Char *name,
504                    const XML_Char **atts) {
505  UNUSED_P(name);
506  UNUSED_P(atts);
507  XML_DefaultCurrent((XML_Parser)userData);
508}
509
510static void XMLCALL
511defaultEndElement(void *userData, const XML_Char *name) {
512  UNUSED_P(name);
513  XML_DefaultCurrent((XML_Parser)userData);
514}
515
516static void XMLCALL
517defaultProcessingInstruction(void *userData, const XML_Char *target,
518                             const XML_Char *data) {
519  UNUSED_P(target);
520  UNUSED_P(data);
521  XML_DefaultCurrent((XML_Parser)userData);
522}
523
524static void XMLCALL
525nopCharacterData(void *userData, const XML_Char *s, int len) {
526  UNUSED_P(userData);
527  UNUSED_P(s);
528  UNUSED_P(len);
529}
530
531static void XMLCALL
532nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
533  UNUSED_P(userData);
534  UNUSED_P(name);
535  UNUSED_P(atts);
536}
537
538static void XMLCALL
539nopEndElement(void *userData, const XML_Char *name) {
540  UNUSED_P(userData);
541  UNUSED_P(name);
542}
543
544static void XMLCALL
545nopProcessingInstruction(void *userData, const XML_Char *target,
546                         const XML_Char *data) {
547  UNUSED_P(userData);
548  UNUSED_P(target);
549  UNUSED_P(data);
550}
551
552static void XMLCALL
553markup(void *userData, const XML_Char *s, int len) {
554  FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp;
555  for (; len > 0; --len, ++s)
556    puttc(*s, fp);
557}
558
559static void
560metaLocation(XML_Parser parser) {
561  const XML_Char *uri = XML_GetBase(parser);
562  FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp;
563  if (uri)
564    ftprintf(fp, T(" uri=\"%s\""), uri);
565  ftprintf(fp,
566           T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"")
567               T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%")
568                   T(XML_FMT_INT_MOD) T("u\""),
569           XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser),
570           XML_GetCurrentLineNumber(parser),
571           XML_GetCurrentColumnNumber(parser));
572}
573
574static void
575metaStartDocument(void *userData) {
576  fputts(T("<document>\n"),
577         ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
578}
579
580static void
581metaEndDocument(void *userData) {
582  fputts(T("</document>\n"),
583         ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
584}
585
586static void XMLCALL
587metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
588  XML_Parser parser = (XML_Parser)userData;
589  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
590  FILE *fp = data->fp;
591  const XML_Char **specifiedAttsEnd
592      = atts + XML_GetSpecifiedAttributeCount(parser);
593  const XML_Char **idAttPtr;
594  int idAttIndex = XML_GetIdAttributeIndex(parser);
595  if (idAttIndex < 0)
596    idAttPtr = 0;
597  else
598    idAttPtr = atts + idAttIndex;
599
600  ftprintf(fp, T("<starttag name=\"%s\""), name);
601  metaLocation(parser);
602  if (*atts) {
603    fputts(T(">\n"), fp);
604    do {
605      ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]);
606      characterData(data, atts[1], (int)tcslen(atts[1]));
607      if (atts >= specifiedAttsEnd)
608        fputts(T("\" defaulted=\"yes\"/>\n"), fp);
609      else if (atts == idAttPtr)
610        fputts(T("\" id=\"yes\"/>\n"), fp);
611      else
612        fputts(T("\"/>\n"), fp);
613    } while (*(atts += 2));
614    fputts(T("</starttag>\n"), fp);
615  } else
616    fputts(T("/>\n"), fp);
617}
618
619static void XMLCALL
620metaEndElement(void *userData, const XML_Char *name) {
621  XML_Parser parser = (XML_Parser)userData;
622  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
623  FILE *fp = data->fp;
624  ftprintf(fp, T("<endtag name=\"%s\""), name);
625  metaLocation(parser);
626  fputts(T("/>\n"), fp);
627}
628
629static void XMLCALL
630metaProcessingInstruction(void *userData, const XML_Char *target,
631                          const XML_Char *data) {
632  XML_Parser parser = (XML_Parser)userData;
633  XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
634  FILE *fp = usrData->fp;
635  ftprintf(fp, T("<pi target=\"%s\" data=\""), target);
636  characterData(usrData, data, (int)tcslen(data));
637  puttc(T('"'), fp);
638  metaLocation(parser);
639  fputts(T("/>\n"), fp);
640}
641
642static void XMLCALL
643metaComment(void *userData, const XML_Char *data) {
644  XML_Parser parser = (XML_Parser)userData;
645  XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
646  FILE *fp = usrData->fp;
647  fputts(T("<comment data=\""), fp);
648  characterData(usrData, data, (int)tcslen(data));
649  puttc(T('"'), fp);
650  metaLocation(parser);
651  fputts(T("/>\n"), fp);
652}
653
654static void XMLCALL
655metaStartCdataSection(void *userData) {
656  XML_Parser parser = (XML_Parser)userData;
657  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
658  FILE *fp = data->fp;
659  fputts(T("<startcdata"), fp);
660  metaLocation(parser);
661  fputts(T("/>\n"), fp);
662}
663
664static void XMLCALL
665metaEndCdataSection(void *userData) {
666  XML_Parser parser = (XML_Parser)userData;
667  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
668  FILE *fp = data->fp;
669  fputts(T("<endcdata"), fp);
670  metaLocation(parser);
671  fputts(T("/>\n"), fp);
672}
673
674static void XMLCALL
675metaCharacterData(void *userData, const XML_Char *s, int len) {
676  XML_Parser parser = (XML_Parser)userData;
677  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
678  FILE *fp = data->fp;
679  fputts(T("<chars str=\""), fp);
680  characterData(data, s, len);
681  puttc(T('"'), fp);
682  metaLocation(parser);
683  fputts(T("/>\n"), fp);
684}
685
686static void XMLCALL
687metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName,
688                     const XML_Char *sysid, const XML_Char *pubid,
689                     int has_internal_subset) {
690  XML_Parser parser = (XML_Parser)userData;
691  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
692  FILE *fp = data->fp;
693  UNUSED_P(sysid);
694  UNUSED_P(pubid);
695  UNUSED_P(has_internal_subset);
696  ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName);
697  metaLocation(parser);
698  fputts(T("/>\n"), fp);
699}
700
701static void XMLCALL
702metaEndDoctypeDecl(void *userData) {
703  XML_Parser parser = (XML_Parser)userData;
704  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
705  FILE *fp = data->fp;
706  fputts(T("<enddoctype"), fp);
707  metaLocation(parser);
708  fputts(T("/>\n"), fp);
709}
710
711static void XMLCALL
712metaNotationDecl(void *userData, const XML_Char *notationName,
713                 const XML_Char *base, const XML_Char *systemId,
714                 const XML_Char *publicId) {
715  XML_Parser parser = (XML_Parser)userData;
716  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
717  FILE *fp = data->fp;
718  UNUSED_P(base);
719  ftprintf(fp, T("<notation name=\"%s\""), notationName);
720  if (publicId)
721    ftprintf(fp, T(" public=\"%s\""), publicId);
722  if (systemId) {
723    fputts(T(" system=\""), fp);
724    characterData(data, systemId, (int)tcslen(systemId));
725    puttc(T('"'), fp);
726  }
727  metaLocation(parser);
728  fputts(T("/>\n"), fp);
729}
730
731static void XMLCALL
732metaEntityDecl(void *userData, const XML_Char *entityName, int is_param,
733               const XML_Char *value, int value_length, const XML_Char *base,
734               const XML_Char *systemId, const XML_Char *publicId,
735               const XML_Char *notationName) {
736  XML_Parser parser = (XML_Parser)userData;
737  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
738  FILE *fp = data->fp;
739
740  UNUSED_P(is_param);
741  UNUSED_P(base);
742  if (value) {
743    ftprintf(fp, T("<entity name=\"%s\""), entityName);
744    metaLocation(parser);
745    puttc(T('>'), fp);
746    characterData(data, value, value_length);
747    fputts(T("</entity/>\n"), fp);
748  } else if (notationName) {
749    ftprintf(fp, T("<entity name=\"%s\""), entityName);
750    if (publicId)
751      ftprintf(fp, T(" public=\"%s\""), publicId);
752    fputts(T(" system=\""), fp);
753    characterData(data, systemId, (int)tcslen(systemId));
754    puttc(T('"'), fp);
755    ftprintf(fp, T(" notation=\"%s\""), notationName);
756    metaLocation(parser);
757    fputts(T("/>\n"), fp);
758  } else {
759    ftprintf(fp, T("<entity name=\"%s\""), entityName);
760    if (publicId)
761      ftprintf(fp, T(" public=\"%s\""), publicId);
762    fputts(T(" system=\""), fp);
763    characterData(data, systemId, (int)tcslen(systemId));
764    puttc(T('"'), fp);
765    metaLocation(parser);
766    fputts(T("/>\n"), fp);
767  }
768}
769
770static void XMLCALL
771metaStartNamespaceDecl(void *userData, const XML_Char *prefix,
772                       const XML_Char *uri) {
773  XML_Parser parser = (XML_Parser)userData;
774  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
775  FILE *fp = data->fp;
776  fputts(T("<startns"), fp);
777  if (prefix)
778    ftprintf(fp, T(" prefix=\"%s\""), prefix);
779  if (uri) {
780    fputts(T(" ns=\""), fp);
781    characterData(data, uri, (int)tcslen(uri));
782    fputts(T("\"/>\n"), fp);
783  } else
784    fputts(T("/>\n"), fp);
785}
786
787static void XMLCALL
788metaEndNamespaceDecl(void *userData, const XML_Char *prefix) {
789  XML_Parser parser = (XML_Parser)userData;
790  XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
791  FILE *fp = data->fp;
792  if (! prefix)
793    fputts(T("<endns/>\n"), fp);
794  else
795    ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix);
796}
797
798static int XMLCALL
799unknownEncodingConvert(void *data, const char *p) {
800  return codepageConvert(*(int *)data, p);
801}
802
803static int XMLCALL
804unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) {
805  int cp;
806  static const XML_Char prefixL[] = T("windows-");
807  static const XML_Char prefixU[] = T("WINDOWS-");
808  int i;
809
810  UNUSED_P(userData);
811  for (i = 0; prefixU[i]; i++)
812    if (name[i] != prefixU[i] && name[i] != prefixL[i])
813      return 0;
814
815  cp = 0;
816  for (; name[i]; i++) {
817    static const XML_Char digits[] = T("0123456789");
818    const XML_Char *s = tcschr(digits, name[i]);
819    if (! s)
820      return 0;
821    cp *= 10;
822    cp += (int)(s - digits);
823    if (cp >= 0x10000)
824      return 0;
825  }
826  if (! codepageMap(cp, info->map))
827    return 0;
828  info->convert = unknownEncodingConvert;
829  /* We could just cast the code page integer to a void *,
830  and avoid the use of release. */
831  info->release = free;
832  info->data = malloc(sizeof(int));
833  if (! info->data)
834    return 0;
835  *(int *)info->data = cp;
836  return 1;
837}
838
839static int XMLCALL
840notStandalone(void *userData) {
841  UNUSED_P(userData);
842  return 0;
843}
844
845static void
846showVersion(XML_Char *prog) {
847  XML_Char *s = prog;
848  XML_Char ch;
849  const XML_Feature *features = XML_GetFeatureList();
850  while ((ch = *s) != 0) {
851    if (ch == '/'
852#if defined(_WIN32)
853        || ch == '\\'
854#endif
855    )
856      prog = s + 1;
857    ++s;
858  }
859  ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion());
860  if (features != NULL && features[0].feature != XML_FEATURE_END) {
861    int i = 1;
862    ftprintf(stdout, T("%s"), features[0].name);
863    if (features[0].value)
864      ftprintf(stdout, T("=%ld"), features[0].value);
865    while (features[i].feature != XML_FEATURE_END) {
866      ftprintf(stdout, T(", %s"), features[i].name);
867      if (features[i].value)
868        ftprintf(stdout, T("=%ld"), features[i].value);
869      ++i;
870    }
871    ftprintf(stdout, T("\n"));
872  }
873}
874
/* Prints the help text to stderr, with PROG substituted into the three
   synopsis lines, and then terminates the process by calling exit(RC).
   This function never returns, which is declared to GCC-compatible
   compilers through the noreturn attribute. */
#if defined(__GNUC__)
__attribute__((noreturn))
#endif
static void
usage(const XML_Char *prog, int rc) {
  ftprintf(
      stderr,
      /* Generated with:
       * $ xmlwf/xmlwf_helpgen.sh
       * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of
       * xmlwf/xmlwf_helpgen.sh in here.
       */
      /* clang-format off */
      T("usage:\n")
      T("  %s [OPTIONS] [FILE ...]\n")
      T("  %s -h|--help\n")
      T("  %s -v|--version\n")
      T("\n")
      T("xmlwf - Determines if an XML document is well-formed\n")
      T("\n")
      T("positional arguments:\n")
      T("  FILE           file to process (default: STDIN)\n")
      T("\n")
      T("input control arguments:\n")
      T("  -s             print an error if the document is not [s]tandalone\n")
      T("  -n             enable [n]amespace processing\n")
      T("  -p             enable processing of external DTDs and [p]arameter entities\n")
      T("  -x             enable processing of e[x]ternal entities\n")
      T("  -e ENCODING    override any in-document [e]ncoding declaration\n")
      T("  -w             enable support for [W]indows code pages\n")
      T("  -r             disable memory-mapping and use [r]ead calls instead\n")
      T("  -g BYTES       buffer size to request per call pair to XML_[G]etBuffer and read (default: 8 KiB)\n")
      T("  -k             when processing multiple files, [k]eep processing after first file with error\n")
      T("\n")
      T("output control arguments:\n")
      T("  -d DIRECTORY   output [d]estination directory\n")
      T("  -c             write a [c]opy of input XML, not canonical XML\n")
      T("  -m             write [m]eta XML, not canonical XML\n")
      T("  -t             write no XML output for [t]iming of plain parsing\n")
      T("  -N             enable adding doctype and [n]otation declarations\n")
      T("\n")
      T("billion laughs attack protection:\n")
      T("  NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n")
      T("\n")
      T("  -a FACTOR      set maximum tolerated [a]mplification factor (default: 100.0)\n")
      T("  -b BYTES       set number of output [b]ytes needed to activate (default: 8 MiB)\n")
      T("\n")
      T("reparse deferral:\n")
      T("  -q             disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n")
      T("\n")
      T("info arguments:\n")
      T("  -h, --help     show this [h]elp message and exit\n")
      T("  -v, --version  show program's [v]ersion number and exit\n")
      T("\n")
      T("exit status:\n")
      T("  0              the input files are well-formed and the output (if requested) was written successfully\n")
      T("  1              could not allocate data structures, signals a serious problem with execution environment\n")
      T("  2              one or more input files were not well-formed\n")
      T("  3              could not create an output file\n")
      T("  4              command-line argument error\n")
      T("\n")
      T("xmlwf of libexpat is software libre, licensed under the MIT license.\n")
      T("Please report bugs at https://github.com/libexpat/libexpat/issues -- thank you!\n")
      , /* clang-format on */
      prog, prog, prog);
  exit(rc);
}
942
#if defined(__MINGW32__) && defined(XML_UNICODE)
/* Silence warning about missing prototype: declare wmain ahead of its
   definition for MinGW builds with XML_UNICODE. */
int wmain(int argc, XML_Char **argv);
#endif
947
/* Fetch the value of a command-line option that takes an argument.

   argv[i][j] is the option letter.  If text follows the letter in the
   same argument ("-dDIR"), the target is set to that text; otherwise the
   next argument ("-d DIR") is used, and usage() exits with
   XMLWF_EXIT_USAGE_ERROR when there is none.  Afterwards i refers to the
   argument following the value and j is reset to 0.

   i, j and the target are modified, so they must be plain lvalues.  The
   body is wrapped in do { } while (0) so that the macro followed by a
   semicolon forms exactly one statement, even inside if/else. */
#define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j)            \
  do {                                                                         \
    if ((argv)[i][(j) + 1] == T('\0')) {                                       \
      if (++(i) == (argc)) {                                                   \
        usage((argv)[0], XMLWF_EXIT_USAGE_ERROR);                              \
        /* usage called exit(..), never gets here */                           \
      }                                                                        \
      (constCharStarTarget) = (argv)[i];                                       \
    } else {                                                                   \
      (constCharStarTarget) = (argv)[i] + (j) + 1;                             \
    }                                                                          \
    (i)++;                                                                     \
    (j) = 0;                                                                   \
  } while (0)
962
963int
964tmain(int argc, XML_Char **argv) {
965  int i, j;
966  const XML_Char *outputDir = NULL;
967  const XML_Char *encoding = NULL;
968  unsigned processFlags = XML_MAP_FILE;
969  int windowsCodePages = 0;
970  int outputType = 0;
971  int useNamespaces = 0;
972  int requireStandalone = 0;
973  int requiresNotations = 0;
974  int continueOnError = 0;
975
976  float attackMaximumAmplification = -1.0f; /* signaling "not set" */
977  unsigned long long attackThresholdBytes = 0;
978  XML_Bool attackThresholdGiven = XML_FALSE;
979
980  XML_Bool disableDeferral = XML_FALSE;
981
982  int exitCode = XMLWF_EXIT_SUCCESS;
983  enum XML_ParamEntityParsing paramEntityParsing
984      = XML_PARAM_ENTITY_PARSING_NEVER;
985  int useStdin = 0;
986  XmlwfUserData userData = {NULL, NULL, NULL};
987
988#ifdef _MSC_VER
989  _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
990#endif
991
992  i = 1;
993  j = 0;
994  while (i < argc) {
995    if (j == 0) {
996      if (argv[i][0] != T('-'))
997        break;
998      if (argv[i][1] == T('-')) {
999        if (argv[i][2] == T('\0')) {
1000          i++;
1001          break;
1002        } else if (tcscmp(argv[i] + 2, T("help")) == 0) {
1003          usage(argv[0], XMLWF_EXIT_SUCCESS);
1004          // usage called exit(..), never gets here
1005        } else if (tcscmp(argv[i] + 2, T("version")) == 0) {
1006          showVersion(argv[0]);
1007          return XMLWF_EXIT_SUCCESS;
1008        }
1009      }
1010      j++;
1011    }
1012    switch (argv[i][j]) {
1013    case T('r'):
1014      processFlags &= ~XML_MAP_FILE;
1015      j++;
1016      break;
1017    case T('s'):
1018      requireStandalone = 1;
1019      j++;
1020      break;
1021    case T('n'):
1022      useNamespaces = 1;
1023      j++;
1024      break;
1025    case T('p'):
1026      paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS;
1027      /* fall through */
1028    case T('x'):
1029      processFlags |= XML_EXTERNAL_ENTITIES;
1030      j++;
1031      break;
1032    case T('w'):
1033      windowsCodePages = 1;
1034      j++;
1035      break;
1036    case T('m'):
1037      outputType = 'm';
1038      j++;
1039      break;
1040    case T('c'):
1041      outputType = 'c';
1042      useNamespaces = 0;
1043      j++;
1044      break;
1045    case T('t'):
1046      outputType = 't';
1047      j++;
1048      break;
1049    case T('N'):
1050      requiresNotations = 1;
1051      j++;
1052      break;
1053    case T('d'):
1054      XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j);
1055      break;
1056    case T('e'):
1057      XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j);
1058      break;
1059    case T('h'):
1060      usage(argv[0], XMLWF_EXIT_SUCCESS);
1061      // usage called exit(..), never gets here
1062    case T('v'):
1063      showVersion(argv[0]);
1064      return XMLWF_EXIT_SUCCESS;
1065    case T('g'): {
1066      const XML_Char *valueText = NULL;
1067      XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1068
1069      errno = 0;
1070      XML_Char *afterValueText = (XML_Char *)valueText;
1071      const long long read_size_bytes_candidate
1072          = tcstoull(valueText, &afterValueText, 10);
1073      if ((errno != 0) || (afterValueText[0] != T('\0'))
1074          || (read_size_bytes_candidate < 1)
1075          || (read_size_bytes_candidate > (INT_MAX / 2 + 1))) {
1076        // This prevents tperror(..) from reporting misleading "[..]: Success"
1077        errno = ERANGE;
1078        tperror(T("invalid buffer size") T(
1079            " (needs an integer from 1 to INT_MAX/2+1 i.e. 1,073,741,824 on most platforms)"));
1080        exit(XMLWF_EXIT_USAGE_ERROR);
1081      }
1082      g_read_size_bytes = (int)read_size_bytes_candidate;
1083      break;
1084    }
1085    case T('k'):
1086      continueOnError = 1;
1087      j++;
1088      break;
1089    case T('a'): {
1090      const XML_Char *valueText = NULL;
1091      XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1092
1093      errno = 0;
1094      XML_Char *afterValueText = NULL;
1095      attackMaximumAmplification = tcstof(valueText, &afterValueText);
1096      if ((errno != 0) || (afterValueText[0] != T('\0'))
1097          || isnan(attackMaximumAmplification)
1098          || (attackMaximumAmplification < 1.0f)) {
1099        // This prevents tperror(..) from reporting misleading "[..]: Success"
1100        errno = ERANGE;
1101        tperror(T("invalid amplification limit") T(
1102            " (needs a floating point number greater or equal than 1.0)"));
1103        exit(XMLWF_EXIT_USAGE_ERROR);
1104      }
1105#if XML_GE == 0
1106      ftprintf(stderr,
1107               T("Warning: Given amplification limit ignored")
1108                   T(", xmlwf has been compiled without DTD/GE support.\n"));
1109#endif
1110      break;
1111    }
1112    case T('b'): {
1113      const XML_Char *valueText = NULL;
1114      XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1115
1116      errno = 0;
1117      XML_Char *afterValueText = (XML_Char *)valueText;
1118      attackThresholdBytes = tcstoull(valueText, &afterValueText, 10);
1119      if ((errno != 0) || (afterValueText[0] != T('\0'))) {
1120        // This prevents tperror(..) from reporting misleading "[..]: Success"
1121        errno = ERANGE;
1122        tperror(T("invalid ignore threshold")
1123                    T(" (needs an integer from 0 to 2^64-1)"));
1124        exit(XMLWF_EXIT_USAGE_ERROR);
1125      }
1126      attackThresholdGiven = XML_TRUE;
1127#if XML_GE == 0
1128      ftprintf(stderr,
1129               T("Warning: Given attack threshold ignored")
1130                   T(", xmlwf has been compiled without DTD/GE support.\n"));
1131#endif
1132      break;
1133    }
1134    case T('q'): {
1135      disableDeferral = XML_TRUE;
1136      j++;
1137      break;
1138    }
1139    case T('\0'):
1140      if (j > 1) {
1141        i++;
1142        j = 0;
1143        break;
1144      }
1145      /* fall through */
1146    default:
1147      usage(argv[0], XMLWF_EXIT_USAGE_ERROR);
1148      // usage called exit(..), never gets here
1149    }
1150  }
1151  if (i == argc) {
1152    useStdin = 1;
1153    processFlags &= ~XML_MAP_FILE;
1154    i--;
1155  }
1156  for (; i < argc; i++) {
1157    XML_Char *outName = 0;
1158    int result;
1159    XML_Parser parser;
1160    if (useNamespaces)
1161      parser = XML_ParserCreateNS(encoding, NSSEP);
1162    else
1163      parser = XML_ParserCreate(encoding);
1164
1165    if (! parser) {
1166      tperror(T("Could not instantiate parser"));
1167      exit(XMLWF_EXIT_INTERNAL_ERROR);
1168    }
1169
1170    if (attackMaximumAmplification != -1.0f) {
1171#if XML_GE == 1
1172      XML_SetBillionLaughsAttackProtectionMaximumAmplification(
1173          parser, attackMaximumAmplification);
1174#endif
1175    }
1176    if (attackThresholdGiven) {
1177#if XML_GE == 1
1178      XML_SetBillionLaughsAttackProtectionActivationThreshold(
1179          parser, attackThresholdBytes);
1180#else
1181      (void)attackThresholdBytes; // silence -Wunused-but-set-variable
1182#endif
1183    }
1184
1185    if (disableDeferral) {
1186      const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE);
1187      if (! success) {
1188        // This prevents tperror(..) from reporting misleading "[..]: Success"
1189        errno = EINVAL;
1190        tperror(T("Failed to disable reparse deferral"));
1191        exit(XMLWF_EXIT_INTERNAL_ERROR);
1192      }
1193    }
1194
1195    if (requireStandalone)
1196      XML_SetNotStandaloneHandler(parser, notStandalone);
1197    XML_SetParamEntityParsing(parser, paramEntityParsing);
1198    if (outputType == 't') {
1199      /* This is for doing timings; this gives a more realistic estimate of
1200         the parsing time. */
1201      outputDir = 0;
1202      XML_SetElementHandler(parser, nopStartElement, nopEndElement);
1203      XML_SetCharacterDataHandler(parser, nopCharacterData);
1204      XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction);
1205    } else if (outputDir) {
1206      const XML_Char *delim = T("/");
1207      const XML_Char *file = useStdin ? T("STDIN") : argv[i];
1208      if (! useStdin) {
1209        /* Jump after last (back)slash */
1210        const XML_Char *lastDelim = tcsrchr(file, delim[0]);
1211        if (lastDelim)
1212          file = lastDelim + 1;
1213#if defined(_WIN32)
1214        else {
1215          const XML_Char *winDelim = T("\\");
1216          lastDelim = tcsrchr(file, winDelim[0]);
1217          if (lastDelim) {
1218            file = lastDelim + 1;
1219            delim = winDelim;
1220          }
1221        }
1222#endif
1223      }
1224      outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2)
1225                                   * sizeof(XML_Char));
1226      if (! outName) {
1227        tperror(T("Could not allocate memory"));
1228        exit(XMLWF_EXIT_INTERNAL_ERROR);
1229      }
1230      tcscpy(outName, outputDir);
1231      tcscat(outName, delim);
1232      tcscat(outName, file);
1233      userData.fp = tfopen(outName, T("wb"));
1234      if (! userData.fp) {
1235        tperror(outName);
1236        exitCode = XMLWF_EXIT_OUTPUT_ERROR;
1237        free(outName);
1238        XML_ParserFree(parser);
1239        if (continueOnError) {
1240          continue;
1241        } else {
1242          break;
1243        }
1244      }
1245      setvbuf(userData.fp, NULL, _IOFBF, 16384);
1246#ifdef XML_UNICODE
1247      puttc(0xFEFF, userData.fp);
1248#endif
1249      XML_SetUserData(parser, &userData);
1250      switch (outputType) {
1251      case 'm':
1252        XML_UseParserAsHandlerArg(parser);
1253        XML_SetElementHandler(parser, metaStartElement, metaEndElement);
1254        XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction);
1255        XML_SetCommentHandler(parser, metaComment);
1256        XML_SetCdataSectionHandler(parser, metaStartCdataSection,
1257                                   metaEndCdataSection);
1258        XML_SetCharacterDataHandler(parser, metaCharacterData);
1259        XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl,
1260                                  metaEndDoctypeDecl);
1261        XML_SetEntityDeclHandler(parser, metaEntityDecl);
1262        XML_SetNotationDeclHandler(parser, metaNotationDecl);
1263        XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl,
1264                                    metaEndNamespaceDecl);
1265        metaStartDocument(parser);
1266        break;
1267      case 'c':
1268        XML_UseParserAsHandlerArg(parser);
1269        XML_SetDefaultHandler(parser, markup);
1270        XML_SetElementHandler(parser, defaultStartElement, defaultEndElement);
1271        XML_SetCharacterDataHandler(parser, defaultCharacterData);
1272        XML_SetProcessingInstructionHandler(parser,
1273                                            defaultProcessingInstruction);
1274        break;
1275      default:
1276        if (useNamespaces)
1277          XML_SetElementHandler(parser, startElementNS, endElementNS);
1278        else
1279          XML_SetElementHandler(parser, startElement, endElement);
1280        XML_SetCharacterDataHandler(parser, characterData);
1281#ifndef W3C14N
1282        XML_SetProcessingInstructionHandler(parser, processingInstruction);
1283        if (requiresNotations) {
1284          XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl);
1285          XML_SetNotationDeclHandler(parser, notationDecl);
1286        }
1287#endif /* not W3C14N */
1288        break;
1289      }
1290    }
1291    if (windowsCodePages)
1292      XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
1293    result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags);
1294    if (outputDir) {
1295      if (outputType == 'm')
1296        metaEndDocument(parser);
1297      fclose(userData.fp);
1298      if (! result) {
1299        tremove(outName);
1300      }
1301      free(outName);
1302    }
1303    XML_ParserFree(parser);
1304    if (! result) {
1305      exitCode = XMLWF_EXIT_NOT_WELLFORMED;
1306      cleanupUserData(&userData);
1307      if (! continueOnError) {
1308        break;
1309      }
1310    }
1311  }
1312  return exitCode;
1313}
1314