1/*
2The contents of this file are subject to the Mozilla Public License
3Version 1.0 (the "License"); you may not use this file except in
4compliance with the License. You may obtain a copy of the License at
5http://www.mozilla.org/MPL/
6
7Software distributed under the License is distributed on an "AS IS"
8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9License for the specific language governing rights and limitations
10under the License.
11
12The Original Code is expat.
13
14The Initial Developer of the Original Code is James Clark.
15Portions created by James Clark are Copyright (C) 1998
16James Clark. All Rights Reserved.
17
18Contributor(s):
19*/
20
21#include <tcl.h>
22#include "xmlparse.h"
23#include "filemap.h"
24#include "codepage.h"
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <stddef.h>
29#include <string.h>
30#include <fcntl.h>
31
32#ifdef _MSC_VER
33#include <io.h>
34#endif
35
36#ifdef _POSIX_SOURCE
37#include <unistd.h>
38#endif
39
/* Guarantee that O_BINARY exists: reuse _O_BINARY where the platform
   defines it, otherwise define it as 0 so that OR-ing it into open()
   flags changes nothing. */
#if !defined(O_BINARY)
#  if defined(_O_BINARY)
#    define O_BINARY _O_BINARY
#  else
#    define O_BINARY 0
#  endif
#endif
47
48#ifdef _MSC_VER
49#include <crtdbg.h>
50#endif
51
/* Number of bytes requested per read() in processStream: 16 when _DEBUG
   is defined, 8 KiB otherwise. */
#if defined(_DEBUG)
#  define READ_SIZE 16
#else
#  define READ_SIZE (1024*8)
#endif
57
/* Character-width abstraction.  Each t-prefixed name (and T() for literals)
   maps to the wide-character routine when XML_UNICODE is defined and to the
   ordinary char routine otherwise, so the rest of the file is written once. */
#if defined(XML_UNICODE)
#  if !defined(XML_UNICODE_WCHAR_T)
#    error xmlwf requires a 16-bit Unicode-compatible wchar_t
#  endif
#  define T(x) L ## x
#  define ftprintf fwprintf
#  define tfopen _wfopen
#  define fputts fputws
#  define puttc putwc
#  define tcscmp wcscmp
#  define tcscpy wcscpy
#  define tcscat wcscat
#  define tcschr wcschr
#  define tcsrchr wcsrchr
#  define tcslen wcslen
#  define tperror _wperror
#  define topen _wopen
#  define tmain wmain
#  define tremove _wremove
#else /* not XML_UNICODE */
#  define T(x) x
#  define ftprintf fprintf
#  define tfopen fopen
#  define fputts fputs
#  define puttc putc
#  define tcscmp strcmp
#  define tcscpy strcpy
#  define tcscat strcat
#  define tcschr strchr
#  define tcsrchr strrchr
#  define tcslen strlen
#  define tperror perror
#  define topen open
#  define tmain main
#  define tremove remove
#endif /* not XML_UNICODE */
94
95static void characterData(void *userData, const XML_Char *s, size_t len)
96{
97  FILE *fp = userData;
98  for (; len > 0; --len, ++s) {
99    switch (*s) {
100    case T('&'):
101      fputts(T("&amp;"), fp);
102      break;
103    case T('<'):
104      fputts(T("&lt;"), fp);
105      break;
106    case T('>'):
107      fputts(T("&gt;"), fp);
108      break;
109    case T('"'):
110      fputts(T("&quot;"), fp);
111      break;
112    case 9:
113    case 10:
114    case 13:
115      ftprintf(fp, T("&#%d;"), *s);
116      break;
117    default:
118      puttc(*s, fp);
119      break;
120    }
121  }
122}
123
/* Lexicographically comparing UTF-8 encoded attribute values
is equivalent to lexicographically comparing based on the character number. */
126
127static int attcmp(const void *att1, const void *att2)
128{
129  return tcscmp(*(const XML_Char **)att1, *(const XML_Char **)att2);
130}
131
132static void startElement(void *userData, const XML_Char *name, const XML_Char **atts)
133{
134  size_t nAtts;
135  const XML_Char **p;
136  FILE *fp = userData;
137  puttc(T('<'), fp);
138  fputts(name, fp);
139
140  p = atts;
141  while (*p)
142    ++p;
143  nAtts = (p - atts) >> 1;
144  if (nAtts > 1)
145    qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp);
146  while (*atts) {
147    puttc(T(' '), fp);
148    fputts(*atts++, fp);
149    puttc(T('='), fp);
150    puttc(T('"'), fp);
151    characterData(userData, *atts, tcslen(*atts));
152    puttc(T('"'), fp);
153    atts++;
154  }
155  puttc(T('>'), fp);
156}
157
158static void endElement(void *userData, const XML_Char *name)
159{
160  FILE *fp = userData;
161  puttc(T('<'), fp);
162  puttc(T('/'), fp);
163  fputts(name, fp);
164  puttc(T('>'), fp);
165}
166
167static void processingInstruction(void *userData, const XML_Char *target, const XML_Char *data)
168{
169  FILE *fp = userData;
170  puttc(T('<'), fp);
171  puttc(T('?'), fp);
172  fputts(target, fp);
173  puttc(T(' '), fp);
174  fputts(data, fp);
175  puttc(T('?'), fp);
176  puttc(T('>'), fp);
177}
178
179static void defaultCharacterData(XML_Parser parser, const XML_Char *s, size_t len)
180{
181  XML_DefaultCurrent(parser);
182}
183
184static void defaultStartElement(XML_Parser parser, const XML_Char *name, const XML_Char **atts)
185{
186  XML_DefaultCurrent(parser);
187}
188
189static void defaultEndElement(XML_Parser parser, const XML_Char *name)
190{
191  XML_DefaultCurrent(parser);
192}
193
194static void defaultProcessingInstruction(XML_Parser parser, const XML_Char *target, const XML_Char *data)
195{
196  XML_DefaultCurrent(parser);
197}
198
199static void markup(XML_Parser parser, const XML_Char *s, int len)
200{
201  FILE *fp = XML_GetUserData(parser);
202  for (; len > 0; --len, ++s)
203    puttc(*s, fp);
204}
205
206static
207void metaLocation(XML_Parser parser)
208{
209  const XML_Char *uri = XML_GetBase(parser);
210  if (uri)
211    ftprintf(XML_GetUserData(parser), T(" uri=\"%s\""), uri);
212  ftprintf(XML_GetUserData(parser),
213           T(" byte=\"%ld\" line=\"%d\" col=\"%d\""),
214	   XML_GetCurrentByteIndex(parser),
215	   XML_GetCurrentLineNumber(parser),
216	   XML_GetCurrentColumnNumber(parser));
217}
218
219static
220void metaStartElement(XML_Parser parser, const XML_Char *name, const XML_Char **atts)
221{
222  FILE *fp = XML_GetUserData(parser);
223  ftprintf(fp, T("<starttag name=\"%s\""), name);
224  metaLocation(parser);
225  if (*atts) {
226    fputts(T(">\n"), fp);
227    do {
228      ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]);
229      characterData(fp, atts[1], tcslen(atts[1]));
230      fputts(T("\"/>\n"), fp);
231    } while (*(atts += 2));
232    fputts(T("</starttag>\n"), fp);
233  }
234  else
235    fputts(T("/>\n"), fp);
236}
237
238static
239void metaEndElement(XML_Parser parser, const XML_Char *name)
240{
241  FILE *fp = XML_GetUserData(parser);
242  ftprintf(fp, T("<endtag name=\"%s\""), name);
243  metaLocation(parser);
244  fputts(T("/>\n"), fp);
245}
246
247static
248void metaProcessingInstruction(XML_Parser parser, const XML_Char *target, const XML_Char *data)
249{
250  FILE *fp = XML_GetUserData(parser);
251  ftprintf(fp, T("<pi target=\"%s\" data=\""), target);
252  characterData(fp, data, tcslen(data));
253  puttc(T('"'), fp);
254  metaLocation(parser);
255  fputts(T("/>\n"), fp);
256}
257
258static
259void metaCharacterData(XML_Parser parser, const XML_Char *s, size_t len)
260{
261  FILE *fp = XML_GetUserData(parser);
262  fputts(T("<chars str=\""), fp);
263  characterData(fp, s, len);
264  puttc(T('"'), fp);
265  metaLocation(parser);
266  fputts(T("/>\n"), fp);
267}
268
269static
270void metaUnparsedEntityDecl(XML_Parser parser,
271			       const XML_Char *entityName,
272			       const XML_Char *base,
273			       const XML_Char *systemId,
274			       const XML_Char *publicId,
275			       const XML_Char *notationName)
276{
277  FILE *fp = XML_GetUserData(parser);
278  ftprintf(fp, T("<entity name=\"%s\""), entityName);
279  if (publicId)
280    ftprintf(fp, T(" public=\"%s\""), publicId);
281  fputts(T(" system=\""), fp);
282  characterData(fp, systemId, tcslen(systemId));
283  puttc(T('"'), fp);
284  ftprintf(fp, T(" notation=\"%s\""), notationName);
285  metaLocation(parser);
286  fputts(T("/>\n"), fp);
287}
288
289static
290void metaNotationDecl(XML_Parser parser,
291		      const XML_Char *notationName,
292		      const XML_Char *base,
293		      const XML_Char *systemId,
294		      const XML_Char *publicId)
295{
296  FILE *fp = XML_GetUserData(parser);
297  ftprintf(fp, T("<notation name=\"%s\""), notationName);
298  if (publicId)
299    ftprintf(fp, T(" public=\"%s\""), publicId);
300  if (systemId) {
301    fputts(T(" system=\""), fp);
302    characterData(fp, systemId, tcslen(systemId));
303    puttc(T('"'), fp);
304  }
305  metaLocation(parser);
306  fputts(T("/>\n"), fp);
307}
308
309typedef struct {
310  XML_Parser parser;
311  int *retPtr;
312} PROCESS_ARGS;
313
314static
315void reportError(XML_Parser parser, const XML_Char *filename)
316{
317  int code = XML_GetErrorCode(parser);
318  const XML_Char *message = XML_ErrorString(code);
319  if (message)
320    ftprintf(stdout, T("%s:%d:%d: %s\n"),
321	     filename,
322	     XML_GetErrorLineNumber(parser),
323	     XML_GetErrorColumnNumber(parser),
324	     message);
325  else
326    ftprintf(stderr, T("%s: (unknown message %d)\n"), filename, code);
327}
328
329static
330void processFile(const void *data, size_t size, const XML_Char *filename, void *args)
331{
332  XML_Parser parser = ((PROCESS_ARGS *)args)->parser;
333  int *retPtr = ((PROCESS_ARGS *)args)->retPtr;
334  if (!XML_Parse(parser, data, size, 1)) {
335    reportError(parser, filename);
336    *retPtr = 0;
337  }
338  else
339    *retPtr = 1;
340}
341
342static
343int isAsciiLetter(XML_Char c)
344{
345  return (T('a') <= c && c <= T('z')) || (T('A') <= c && c <= T('Z'));
346}
347
348static
349const XML_Char *resolveSystemId(const XML_Char *base, const XML_Char *systemId, XML_Char **toFree)
350{
351  XML_Char *s;
352  *toFree = 0;
353  if (!base
354      || *systemId == T('/')
355#ifdef WIN32
356      || *systemId == T('\\')
357      || (isAsciiLetter(systemId[0]) && systemId[1] == T(':'))
358#endif
359     )
360    return systemId;
361  *toFree = (XML_Char *)malloc((tcslen(base) + tcslen(systemId) + 2)*sizeof(XML_Char));
362  if (!*toFree)
363    return systemId;
364  tcscpy(*toFree, base);
365  s = *toFree;
366  if (tcsrchr(s, T('/')))
367    s = tcsrchr(s, T('/')) + 1;
368#ifdef WIN32
369  if (tcsrchr(s, T('\\')))
370    s = tcsrchr(s, T('\\')) + 1;
371#endif
372  tcscpy(s, systemId);
373  return *toFree;
374}
375
376static
377int externalEntityRefFilemap(XML_Parser parser,
378			     const XML_Char *openEntityNames,
379			     const XML_Char *base,
380			     const XML_Char *systemId,
381			     const XML_Char *publicId)
382{
383  int result;
384  XML_Char *s;
385  const XML_Char *filename;
386  XML_Parser entParser = XML_ExternalEntityParserCreate(parser, openEntityNames, 0);
387  PROCESS_ARGS args;
388  args.retPtr = &result;
389  args.parser = entParser;
390  filename = resolveSystemId(base, systemId, &s);
391  XML_SetBase(entParser, filename);
392  if (!filemap(filename, processFile, &args))
393    result = 0;
394  free(s);
395  XML_ParserFree(entParser);
396  return result;
397}
398
399static
400int processStream(const XML_Char *filename, XML_Parser parser)
401{
402  int fd = topen(filename, O_BINARY|O_RDONLY);
403  if (fd < 0) {
404    tperror(filename);
405    return 0;
406  }
407  for (;;) {
408    size_t nread;
409    char *buf = XML_GetBuffer(parser, READ_SIZE);
410    if (!buf) {
411      close(fd);
412      ftprintf(stderr, T("%s: out of memory\n"), filename);
413      return 0;
414    }
415    nread = read(fd, buf, READ_SIZE);
416    if (nread < 0) {
417      tperror(filename);
418      close(fd);
419      return 0;
420    }
421    if (!XML_ParseBuffer(parser, nread, nread == 0)) {
422      reportError(parser, filename);
423      close(fd);
424      return 0;
425    }
426    if (nread == 0) {
427      close(fd);
428      break;;
429    }
430  }
431  return 1;
432}
433
434static
435int externalEntityRefStream(XML_Parser parser,
436			    const XML_Char *openEntityNames,
437			    const XML_Char *base,
438			    const XML_Char *systemId,
439			    const XML_Char *publicId)
440{
441  XML_Char *s;
442  const XML_Char *filename;
443  int ret;
444  XML_Parser entParser = XML_ExternalEntityParserCreate(parser, openEntityNames, 0);
445  filename = resolveSystemId(base, systemId, &s);
446  XML_SetBase(entParser, filename);
447  ret = processStream(filename, entParser);
448  free(s);
449  XML_ParserFree(entParser);
450  return ret;
451}
452
/* Conversion callback stored by unknownEncoding: data points at the int
   code page number saved there, which is passed to codepageConvert together
   with p. */
static
int unknownEncodingConvert(void *data, const char *p)
{
  const int *codePage = data;

  return codepageConvert(*codePage, p);
}
458
459static
460int unknownEncoding(void *userData,
461		    const XML_Char *name,
462		    XML_Encoding *info)
463{
464  int cp;
465  static const XML_Char prefixL[] = T("windows-");
466  static const XML_Char prefixU[] = T("WINDOWS-");
467  int i;
468
469  for (i = 0; prefixU[i]; i++)
470    if (name[i] != prefixU[i] && name[i] != prefixL[i])
471      return 0;
472
473  cp = 0;
474  for (; name[i]; i++) {
475    static const XML_Char digits[] = T("0123456789");
476    const XML_Char *s = tcschr(digits, name[i]);
477    if (!s)
478      return 0;
479    cp *= 10;
480    cp += s - digits;
481    if (cp >= 0x10000)
482      return 0;
483  }
484  if (!codepageMap(cp, info->map))
485    return 0;
486  info->convert = unknownEncodingConvert;
487  /* We could just cast the code page integer to a void *,
488  and avoid the use of release. */
489  info->release = free;
490  info->data = malloc(sizeof(int));
491  if (!info->data)
492    return 0;
493  *(int *)info->data = cp;
494  return 1;
495}
496
497static
498void usage(const XML_Char *prog)
499{
500  ftprintf(stderr, T("usage: %s [-r] [-w] [-x] [-d output-dir] [-e encoding] file ...\n"), prog);
501  exit(1);
502}
503
504int tmain(int argc, XML_Char **argv)
505{
506  int i;
507  const XML_Char *outputDir = 0;
508  const XML_Char *encoding = 0;
509  int useFilemap = 1;
510  int processExternalEntities = 0;
511  int windowsCodePages = 0;
512  int outputType = 0;
513
514#ifdef _MSC_VER
515  _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF|_CRTDBG_LEAK_CHECK_DF);
516#endif
517
518  i = 1;
519  while (i < argc && argv[i][0] == T('-')) {
520    int j;
521    if (argv[i][1] == T('-') && argv[i][2] == T('\0')) {
522      i++;
523      break;
524    }
525    j = 1;
526    if (argv[i][j] == T('r')) {
527      useFilemap = 0;
528      j++;
529    }
530    if (argv[i][j] == T('x')) {
531      processExternalEntities = 1;
532      j++;
533    }
534    if (argv[i][j] == T('w')) {
535      windowsCodePages = 1;
536      j++;
537    }
538    if (argv[i][j] == T('m')) {
539      outputType = 'm';
540      j++;
541    }
542    if (argv[i][j] == T('c')) {
543      outputType = 'c';
544      j++;
545    }
546    if (argv[i][j] == T('d')) {
547      if (argv[i][j + 1] == T('\0')) {
548	if (++i == argc)
549	  usage(argv[0]);
550	outputDir = argv[i];
551      }
552      else
553	outputDir = argv[i] + j + 1;
554      i++;
555    }
556    else if (argv[i][j] == T('e')) {
557      if (argv[i][j + 1] == T('\0')) {
558	if (++i == argc)
559	  usage(argv[0]);
560	encoding = argv[i];
561      }
562      else
563	encoding = argv[i] + j + 1;
564      i++;
565    }
566    else if (argv[i][j] == T('\0') && j > 1)
567      i++;
568    else
569      usage(argv[0]);
570  }
571  if (i == argc)
572    usage(argv[0]);
573  for (; i < argc; i++) {
574    FILE *fp = 0;
575    XML_Char *outName = 0;
576    int result;
577    XML_Parser parser = XML_ParserCreate(encoding);
578    if (outputDir) {
579      const XML_Char *file = argv[i];
580      if (tcsrchr(file, T('/')))
581	file = tcsrchr(file, T('/')) + 1;
582#ifdef WIN32
583      if (tcsrchr(file, T('\\')))
584	file = tcsrchr(file, T('\\')) + 1;
585#endif
586      outName = malloc((tcslen(outputDir) + tcslen(file) + 2) * sizeof(XML_Char));
587      tcscpy(outName, outputDir);
588      tcscat(outName, T("/"));
589      tcscat(outName, file);
590      fp = tfopen(outName, T("wb"));
591      if (!fp) {
592	tperror(outName);
593	exit(1);
594      }
595#ifdef XML_UNICODE
596      puttc(0xFEFF, fp);
597#endif
598      XML_SetUserData(parser, fp);
599      switch (outputType) {
600      case 'm':
601	XML_UseParserAsHandlerArg(parser);
602	fputts(T("<document>\n"), fp);
603	XML_SetElementHandler(parser, metaStartElement, metaEndElement);
604	XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction);
605	XML_SetCharacterDataHandler(parser, metaCharacterData);
606	XML_SetUnparsedEntityDeclHandler(parser, metaUnparsedEntityDecl);
607	XML_SetNotationDeclHandler(parser, metaNotationDecl);
608	break;
609      case 'c':
610	XML_UseParserAsHandlerArg(parser);
611	XML_SetDefaultHandler(parser, markup);
612	XML_SetElementHandler(parser, defaultStartElement, defaultEndElement);
613	XML_SetCharacterDataHandler(parser, defaultCharacterData);
614	XML_SetProcessingInstructionHandler(parser, defaultProcessingInstruction);
615	break;
616      default:
617	XML_SetElementHandler(parser, startElement, endElement);
618	XML_SetCharacterDataHandler(parser, characterData);
619	XML_SetProcessingInstructionHandler(parser, processingInstruction);
620	break;
621      }
622    }
623    if (windowsCodePages)
624      XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
625    if (!XML_SetBase(parser, argv[i])) {
626      ftprintf(stderr, T("%s: out of memory"), argv[0]);
627      exit(1);
628    }
629    if (processExternalEntities)
630      XML_SetExternalEntityRefHandler(parser,
631	                              useFilemap
632				      ? externalEntityRefFilemap
633				      : externalEntityRefStream);
634    if (useFilemap) {
635      PROCESS_ARGS args;
636      args.retPtr = &result;
637      args.parser = parser;
638      if (!filemap(argv[i], processFile, &args))
639	result = 0;
640    }
641    else
642      result = processStream(argv[i], parser);
643    if (outputDir) {
644      if (outputType == 'm')
645	fputts(T("</document>\n"), fp);
646      fclose(fp);
647      if (!result)
648	tremove(outName);
649      free(outName);
650    }
651    XML_ParserFree(parser);
652  }
653  return 0;
654}
655