1355604Sdelphij/*
2355604Sdelphij                            __  __            _
3355604Sdelphij                         ___\ \/ /_ __   __ _| |_
4355604Sdelphij                        / _ \\  /| '_ \ / _` | __|
5355604Sdelphij                       |  __//  \| |_) | (_| | |_
6355604Sdelphij                        \___/_/\_\ .__/ \__,_|\__|
7355604Sdelphij                                 |_| XML parser
8355604Sdelphij
9355604Sdelphij   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10355604Sdelphij   Copyright (c) 2000-2017 Expat development team
11355604Sdelphij   Licensed under the MIT license:
12355604Sdelphij
13355604Sdelphij   Permission is  hereby granted,  free of charge,  to any  person obtaining
14355604Sdelphij   a  copy  of  this  software   and  associated  documentation  files  (the
15355604Sdelphij   "Software"),  to  deal in  the  Software  without restriction,  including
16355604Sdelphij   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17355604Sdelphij   distribute, sublicense, and/or sell copies of the Software, and to permit
18355604Sdelphij   persons  to whom  the Software  is  furnished to  do so,  subject to  the
19355604Sdelphij   following conditions:
20355604Sdelphij
21355604Sdelphij   The above copyright  notice and this permission notice  shall be included
22355604Sdelphij   in all copies or substantial portions of the Software.
23355604Sdelphij
24355604Sdelphij   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25355604Sdelphij   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26355604Sdelphij   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27355604Sdelphij   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28355604Sdelphij   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29355604Sdelphij   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30355604Sdelphij   USE OR OTHER DEALINGS IN THE SOFTWARE.
31355604Sdelphij*/
32355604Sdelphij
33104349Sphk#define CHARSET_MAX 41
34104349Sphk
/* Scans the next token of a MIME header value, starting at *pp.

   A token is one of:
     - an atom: a run of characters ended by whitespace, '(', ';', '/',
       '=', '"' or the end of the input;
     - a quoted string, returned including both '"' delimiters;
     - one of the single characters ';', '/' or '='.
   Whitespace and parenthesised comments (which may nest) in front of a
   token are skipped.  A backslash makes the scanner step over the
   character that follows it without interpreting it.

   On success the return value points at the first character of the token
   and *pp is left just past its last character, so the token occupies
   [result, *pp).  Returns 0 when no token can be produced: end of input,
   an unterminated quoted string or comment, a backslash at the very end
   of the input, or a ')' that is not inside a comment or quoted string. */
static const char *
getTok(const char **pp) {
  /* The order matters: every state above init is a comment, and each
     further increment is one more level of nesting (inComment is one
     level, inComment + 1 is two levels, and so on). */
  enum { inAtom, inString, init, inComment };
  int state = init;
  const char *tokStart = 0;
  for (;;) {
    switch (**pp) {
    case '\0':
      /* An atom that runs up to the end of the input is complete; *pp
         stays on the terminator so the next call reports end of input. */
      if (state == inAtom)
        return tokStart;
      return 0;
    case ' ':
    case '\r':
    case '\t':
    case '\n':
      if (state == inAtom)
        return tokStart;
      break;
    case '(':
      if (state == inAtom)
        return tokStart;
      /* Opens a comment, or nests one level deeper inside a comment. */
      if (state != inString)
        state++;
      break;
    case ')':
      if (state > init)
        --state; /* leaves one level of comment nesting */
      else if (state != inString)
        return 0; /* unbalanced ')' */
      break;
    case ';':
    case '/':
    case '=':
      if (state == inAtom)
        return tokStart;
      /* Single-character token: return it and step past it. */
      if (state == init)
        return (*pp)++;
      break;
    case '\\':
      /* Skip the escaped character; the increment after the switch then
         moves past it. */
      ++*pp;
      if (**pp == '\0')
        return 0;
      break;
    case '"':
      switch (state) {
      case inString:
        /* Closing quote: include it in the token. */
        ++*pp;
        return tokStart;
      case inAtom:
        return tokStart;
      case init:
        tokStart = *pp;
        state = inString;
        break;
      }
      break;
    default:
      if (state == init) {
        tokStart = *pp;
        state = inAtom;
      }
      break;
    }
    ++*pp;
  }
  /* not reached */
}
100104349Sphk
/* Compares the token [start, end) with key, ignoring the case of ASCII
   letters in the token.  key must be lowercase ASCII and NUL-terminated.

   Returns 1 when the token has exactly the length of key and every
   character matches, 0 otherwise.  A null start (no token) never
   matches. */

static int
matchkey(const char *start, const char *end, const char *key) {
  if (! start)
    return 0;
  for (; start != end; start++, key++) {
    /* Token is longer than key: stop before stepping past the key's
       terminator. */
    if (*key == '\0')
      return 0;
    if (*start == *key)
      continue;
    /* Only a lowercase letter in key has an uppercase form that may
       appear in the token instead. */
    if (*key < 'a' || *key > 'z' || *start != 'A' + (*key - 'a'))
      return 0;
  }
  return *key == '\0';
}
112104349Sphk
113104349Sphkvoid
114355604SdelphijgetXMLCharset(const char *buf, char *charset) {
115104349Sphk  const char *next, *p;
116104349Sphk
117104349Sphk  charset[0] = '\0';
118104349Sphk  next = buf;
119104349Sphk  p = getTok(&next);
120104349Sphk  if (matchkey(p, next, "text"))
121104349Sphk    strcpy(charset, "us-ascii");
122355604Sdelphij  else if (! matchkey(p, next, "application"))
123104349Sphk    return;
124104349Sphk  p = getTok(&next);
125355604Sdelphij  if (! p || *p != '/')
126104349Sphk    return;
127104349Sphk  p = getTok(&next);
128104349Sphk  if (matchkey(p, next, "xml"))
129104349Sphk    isXml = 1;
130104349Sphk  p = getTok(&next);
131104349Sphk  while (p) {
132104349Sphk    if (*p == ';') {
133104349Sphk      p = getTok(&next);
134104349Sphk      if (matchkey(p, next, "charset")) {
135104349Sphk        p = getTok(&next);
136104349Sphk        if (p && *p == '=') {
137104349Sphk          p = getTok(&next);
138104349Sphk          if (p) {
139104349Sphk            char *s = charset;
140104349Sphk            if (*p == '"') {
141104349Sphk              while (++p != next - 1) {
142104349Sphk                if (*p == '\\')
143104349Sphk                  ++p;
144104349Sphk                if (s == charset + CHARSET_MAX - 1) {
145104349Sphk                  charset[0] = '\0';
146104349Sphk                  break;
147104349Sphk                }
148104349Sphk                *s++ = *p;
149104349Sphk              }
150104349Sphk              *s++ = '\0';
151355604Sdelphij            } else {
152104349Sphk              if (next - p > CHARSET_MAX - 1)
153104349Sphk                break;
154104349Sphk              while (p != next)
155104349Sphk                *s++ = *p++;
156104349Sphk              *s = 0;
157104349Sphk              break;
158104349Sphk            }
159104349Sphk          }
160104349Sphk        }
161104349Sphk      }
162355604Sdelphij    } else
163355604Sdelphij      p = getTok(&next);
164104349Sphk  }
165104349Sphk}
166104349Sphk
167104349Sphkint
168355604Sdelphijmain(int argc, char **argv) {
169104349Sphk  char buf[CHARSET_MAX];
170104349Sphk  getXMLCharset(argv[1], buf);
171104349Sphk  printf("charset = \"%s\"\n", buf);
172104349Sphk  return 0;
173104349Sphk}
174