1/* entities.c -- recognize HTML ISO entities
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  CVS Info :
7
8    $Author: iccir $
9    $Date: 2007/01/30 23:46:51 $
10    $Revision: 1.3 $
11
12  Entity handling can be static because there are no config or
13  document-specific values.  Lookup table is 100% defined at
14  compile time.
15
16*/
17
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"
22
23struct _entity;
24typedef struct _entity entity;
25
26struct _entity
27{
28    ctmbstr name;
29    uint    versions;
30    uint    code;
31};
32
33
/* Table of named character entities: { name, versions, code }.
**
** "name" is stored without the leading '&'.  "versions" is a mask of
** VERS_* bits that callers test with '&'.  "code" is the numeric
** character code.
**
** lookup() and TY_(EntityName) scan this table linearly from the top
** and stop at the first match, so the row order is significant.  The
** final row, whose name is NULL, terminates both scans and must stay
** last.
*/
static const entity entities[] =
{
    /*
    ** Markup pre-defined character entities.
    ** These rows additionally carry the VERS_XML bit.
    */
    { "quot",    VERS_ALL|VERS_XML,    34 },
    { "amp",     VERS_ALL|VERS_XML,    38 },
    { "apos",    VERS_FROM40|VERS_XML, 39 },
    { "lt",      VERS_ALL|VERS_XML,    60 },
    { "gt",      VERS_ALL|VERS_XML,    62 },

    /*
    ** Latin-1 character entities (codes 160 through 255)
    */
    { "nbsp",     VERS_ALL,      160 },
    { "iexcl",    VERS_ALL,      161 },
    { "cent",     VERS_ALL,      162 },
    { "pound",    VERS_ALL,      163 },
    { "curren",   VERS_ALL,      164 },
    { "yen",      VERS_ALL,      165 },
    { "brvbar",   VERS_ALL,      166 },
    { "sect",     VERS_ALL,      167 },
    { "uml",      VERS_ALL,      168 },
    { "copy",     VERS_ALL,      169 },
    { "ordf",     VERS_ALL,      170 },
    { "laquo",    VERS_ALL,      171 },
    { "not",      VERS_ALL,      172 },
    { "shy",      VERS_ALL,      173 },
    { "reg",      VERS_ALL,      174 },
    { "macr",     VERS_ALL,      175 },
    { "deg",      VERS_ALL,      176 },
    { "plusmn",   VERS_ALL,      177 },
    { "sup2",     VERS_ALL,      178 },
    { "sup3",     VERS_ALL,      179 },
    { "acute",    VERS_ALL,      180 },
    { "micro",    VERS_ALL,      181 },
    { "para",     VERS_ALL,      182 },
    { "middot",   VERS_ALL,      183 },
    { "cedil",    VERS_ALL,      184 },
    { "sup1",     VERS_ALL,      185 },
    { "ordm",     VERS_ALL,      186 },
    { "raquo",    VERS_ALL,      187 },
    { "frac14",   VERS_ALL,      188 },
    { "frac12",   VERS_ALL,      189 },
    { "frac34",   VERS_ALL,      190 },
    { "iquest",   VERS_ALL,      191 },
    { "Agrave",   VERS_ALL,      192 },
    { "Aacute",   VERS_ALL,      193 },
    { "Acirc",    VERS_ALL,      194 },
    { "Atilde",   VERS_ALL,      195 },
    { "Auml",     VERS_ALL,      196 },
    { "Aring",    VERS_ALL,      197 },
    { "AElig",    VERS_ALL,      198 },
    { "Ccedil",   VERS_ALL,      199 },
    { "Egrave",   VERS_ALL,      200 },
    { "Eacute",   VERS_ALL,      201 },
    { "Ecirc",    VERS_ALL,      202 },
    { "Euml",     VERS_ALL,      203 },
    { "Igrave",   VERS_ALL,      204 },
    { "Iacute",   VERS_ALL,      205 },
    { "Icirc",    VERS_ALL,      206 },
    { "Iuml",     VERS_ALL,      207 },
    { "ETH",      VERS_ALL,      208 },
    { "Ntilde",   VERS_ALL,      209 },
    { "Ograve",   VERS_ALL,      210 },
    { "Oacute",   VERS_ALL,      211 },
    { "Ocirc",    VERS_ALL,      212 },
    { "Otilde",   VERS_ALL,      213 },
    { "Ouml",     VERS_ALL,      214 },
    { "times",    VERS_ALL,      215 },
    { "Oslash",   VERS_ALL,      216 },
    { "Ugrave",   VERS_ALL,      217 },
    { "Uacute",   VERS_ALL,      218 },
    { "Ucirc",    VERS_ALL,      219 },
    { "Uuml",     VERS_ALL,      220 },
    { "Yacute",   VERS_ALL,      221 },
    { "THORN",    VERS_ALL,      222 },
    { "szlig",    VERS_ALL,      223 },
    { "agrave",   VERS_ALL,      224 },
    { "aacute",   VERS_ALL,      225 },
    { "acirc",    VERS_ALL,      226 },
    { "atilde",   VERS_ALL,      227 },
    { "auml",     VERS_ALL,      228 },
    { "aring",    VERS_ALL,      229 },
    { "aelig",    VERS_ALL,      230 },
    { "ccedil",   VERS_ALL,      231 },
    { "egrave",   VERS_ALL,      232 },
    { "eacute",   VERS_ALL,      233 },
    { "ecirc",    VERS_ALL,      234 },
    { "euml",     VERS_ALL,      235 },
    { "igrave",   VERS_ALL,      236 },
    { "iacute",   VERS_ALL,      237 },
    { "icirc",    VERS_ALL,      238 },
    { "iuml",     VERS_ALL,      239 },
    { "eth",      VERS_ALL,      240 },
    { "ntilde",   VERS_ALL,      241 },
    { "ograve",   VERS_ALL,      242 },
    { "oacute",   VERS_ALL,      243 },
    { "ocirc",    VERS_ALL,      244 },
    { "otilde",   VERS_ALL,      245 },
    { "ouml",     VERS_ALL,      246 },
    { "divide",   VERS_ALL,      247 },
    { "oslash",   VERS_ALL,      248 },
    { "ugrave",   VERS_ALL,      249 },
    { "uacute",   VERS_ALL,      250 },
    { "ucirc",    VERS_ALL,      251 },
    { "uuml",     VERS_ALL,      252 },
    { "yacute",   VERS_ALL,      253 },
    { "thorn",    VERS_ALL,      254 },
    { "yuml",     VERS_ALL,      255 },

    /*
    ** Extended Entities defined in HTML 4: Symbols
    */
    { "fnof",     VERS_FROM40,   402 },
    { "Alpha",    VERS_FROM40,   913 },
    { "Beta",     VERS_FROM40,   914 },
    { "Gamma",    VERS_FROM40,   915 },
    { "Delta",    VERS_FROM40,   916 },
    { "Epsilon",  VERS_FROM40,   917 },
    { "Zeta",     VERS_FROM40,   918 },
    { "Eta",      VERS_FROM40,   919 },
    { "Theta",    VERS_FROM40,   920 },
    { "Iota",     VERS_FROM40,   921 },
    { "Kappa",    VERS_FROM40,   922 },
    { "Lambda",   VERS_FROM40,   923 },
    { "Mu",       VERS_FROM40,   924 },
    { "Nu",       VERS_FROM40,   925 },
    { "Xi",       VERS_FROM40,   926 },
    { "Omicron",  VERS_FROM40,   927 },
    { "Pi",       VERS_FROM40,   928 },
    { "Rho",      VERS_FROM40,   929 },
    { "Sigma",    VERS_FROM40,   931 },
    { "Tau",      VERS_FROM40,   932 },
    { "Upsilon",  VERS_FROM40,   933 },
    { "Phi",      VERS_FROM40,   934 },
    { "Chi",      VERS_FROM40,   935 },
    { "Psi",      VERS_FROM40,   936 },
    { "Omega",    VERS_FROM40,   937 },
    { "alpha",    VERS_FROM40,   945 },
    { "beta",     VERS_FROM40,   946 },
    { "gamma",    VERS_FROM40,   947 },
    { "delta",    VERS_FROM40,   948 },
    { "epsilon",  VERS_FROM40,   949 },
    { "zeta",     VERS_FROM40,   950 },
    { "eta",      VERS_FROM40,   951 },
    { "theta",    VERS_FROM40,   952 },
    { "iota",     VERS_FROM40,   953 },
    { "kappa",    VERS_FROM40,   954 },
    { "lambda",   VERS_FROM40,   955 },
    { "mu",       VERS_FROM40,   956 },
    { "nu",       VERS_FROM40,   957 },
    { "xi",       VERS_FROM40,   958 },
    { "omicron",  VERS_FROM40,   959 },
    { "pi",       VERS_FROM40,   960 },
    { "rho",      VERS_FROM40,   961 },
    { "sigmaf",   VERS_FROM40,   962 },
    { "sigma",    VERS_FROM40,   963 },
    { "tau",      VERS_FROM40,   964 },
    { "upsilon",  VERS_FROM40,   965 },
    { "phi",      VERS_FROM40,   966 },
    { "chi",      VERS_FROM40,   967 },
    { "psi",      VERS_FROM40,   968 },
    { "omega",    VERS_FROM40,   969 },
    { "thetasym", VERS_FROM40,   977 },
    { "upsih",    VERS_FROM40,   978 },
    { "piv",      VERS_FROM40,   982 },
    { "bull",     VERS_FROM40,  8226 },
    { "hellip",   VERS_FROM40,  8230 },
    { "prime",    VERS_FROM40,  8242 },
    { "Prime",    VERS_FROM40,  8243 },
    { "oline",    VERS_FROM40,  8254 },
    { "frasl",    VERS_FROM40,  8260 },
    { "weierp",   VERS_FROM40,  8472 },
    { "image",    VERS_FROM40,  8465 },
    { "real",     VERS_FROM40,  8476 },
    { "trade",    VERS_FROM40,  8482 },
    { "alefsym",  VERS_FROM40,  8501 },
    { "larr",     VERS_FROM40,  8592 },
    { "uarr",     VERS_FROM40,  8593 },
    { "rarr",     VERS_FROM40,  8594 },
    { "darr",     VERS_FROM40,  8595 },
    { "harr",     VERS_FROM40,  8596 },
    { "crarr",    VERS_FROM40,  8629 },
    { "lArr",     VERS_FROM40,  8656 },
    { "uArr",     VERS_FROM40,  8657 },
    { "rArr",     VERS_FROM40,  8658 },
    { "dArr",     VERS_FROM40,  8659 },
    { "hArr",     VERS_FROM40,  8660 },
    { "forall",   VERS_FROM40,  8704 },
    { "part",     VERS_FROM40,  8706 },
    { "exist",    VERS_FROM40,  8707 },
    { "empty",    VERS_FROM40,  8709 },
    { "nabla",    VERS_FROM40,  8711 },
    { "isin",     VERS_FROM40,  8712 },
    { "notin",    VERS_FROM40,  8713 },
    { "ni",       VERS_FROM40,  8715 },
    { "prod",     VERS_FROM40,  8719 },
    { "sum",      VERS_FROM40,  8721 },
    { "minus",    VERS_FROM40,  8722 },
    { "lowast",   VERS_FROM40,  8727 },
    { "radic",    VERS_FROM40,  8730 },
    { "prop",     VERS_FROM40,  8733 },
    { "infin",    VERS_FROM40,  8734 },
    { "ang",      VERS_FROM40,  8736 },
    { "and",      VERS_FROM40,  8743 },
    { "or",       VERS_FROM40,  8744 },
    { "cap",      VERS_FROM40,  8745 },
    { "cup",      VERS_FROM40,  8746 },
    { "int",      VERS_FROM40,  8747 },
    { "there4",   VERS_FROM40,  8756 },
    { "sim",      VERS_FROM40,  8764 },
    { "cong",     VERS_FROM40,  8773 },
    { "asymp",    VERS_FROM40,  8776 },
    { "ne",       VERS_FROM40,  8800 },
    { "equiv",    VERS_FROM40,  8801 },
    { "le",       VERS_FROM40,  8804 },
    { "ge",       VERS_FROM40,  8805 },
    { "sub",      VERS_FROM40,  8834 },
    { "sup",      VERS_FROM40,  8835 },
    { "nsub",     VERS_FROM40,  8836 },
    { "sube",     VERS_FROM40,  8838 },
    { "supe",     VERS_FROM40,  8839 },
    { "oplus",    VERS_FROM40,  8853 },
    { "otimes",   VERS_FROM40,  8855 },
    { "perp",     VERS_FROM40,  8869 },
    { "sdot",     VERS_FROM40,  8901 },
    { "lceil",    VERS_FROM40,  8968 },
    { "rceil",    VERS_FROM40,  8969 },
    { "lfloor",   VERS_FROM40,  8970 },
    { "rfloor",   VERS_FROM40,  8971 },
    { "lang",     VERS_FROM40,  9001 },
    { "rang",     VERS_FROM40,  9002 },
    { "loz",      VERS_FROM40,  9674 },
    { "spades",   VERS_FROM40,  9824 },
    { "clubs",    VERS_FROM40,  9827 },
    { "hearts",   VERS_FROM40,  9829 },
    { "diams",    VERS_FROM40,  9830 },

    /*
    ** Extended Entities defined in HTML 4: Special
    ** (minus the markup entities already listed at the top of the table)
    */
    { "OElig",    VERS_FROM40,   338 },
    { "oelig",    VERS_FROM40,   339 },
    { "Scaron",   VERS_FROM40,   352 },
    { "scaron",   VERS_FROM40,   353 },
    { "Yuml",     VERS_FROM40,   376 },
    { "circ",     VERS_FROM40,   710 },
    { "tilde",    VERS_FROM40,   732 },
    { "ensp",     VERS_FROM40,  8194 },
    { "emsp",     VERS_FROM40,  8195 },
    { "thinsp",   VERS_FROM40,  8201 },
    { "zwnj",     VERS_FROM40,  8204 },
    { "zwj",      VERS_FROM40,  8205 },
    { "lrm",      VERS_FROM40,  8206 },
    { "rlm",      VERS_FROM40,  8207 },
    { "ndash",    VERS_FROM40,  8211 },
    { "mdash",    VERS_FROM40,  8212 },
    { "lsquo",    VERS_FROM40,  8216 },
    { "rsquo",    VERS_FROM40,  8217 },
    { "sbquo",    VERS_FROM40,  8218 },
    { "ldquo",    VERS_FROM40,  8220 },
    { "rdquo",    VERS_FROM40,  8221 },
    { "bdquo",    VERS_FROM40,  8222 },
    { "dagger",   VERS_FROM40,  8224 },
    { "Dagger",   VERS_FROM40,  8225 },
    { "permil",   VERS_FROM40,  8240 },
    { "lsaquo",   VERS_FROM40,  8249 },
    { "rsaquo",   VERS_FROM40,  8250 },
    { "euro",     VERS_FROM40,  8364 },
    /* Sentinel row: a NULL name marks the end of the table. */
    { NULL,       0,               0 }
};
306
307
308/* Pure static implementation.  Trades off lookup speed
309** for faster setup time (well, none actually).
310** Optimization of comparing 1st character buys enough
311** speed that hash doesn't improve things without > 500
312** items in list.
313*/
314static const entity* lookup( ctmbstr s )
315{
316    tmbchar ch = (tmbchar)( s ? *s : 0 );
317    const entity *np;
318    for ( np = entities; ch && np && np->name; ++np )
319        if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 )
320            return np;
321    return NULL;
322}
323
#if 0
/* Disabled: everything up to the matching #endif is compiled out.
**
** Takes an entity string starting with "&" and returns its code.
** Returns zero on error (unknown name, name not valid for the given
** versions mask, or a missing/bad number).  TY_(EntityInfo) below
** handles the same numeric and named cases in the live code.
*/
uint EntityCode( ctmbstr name, uint versions )
{
    const entity* np;
    assert( name && name[0] == '&' );

    /* numeric entity: name = "&#" followed by number */
    if ( name[1] == '#' )
    {
        uint c = 0;  /* zero on missing/bad number */
        Bool isXml = ( (versions & VERS_XML) == VERS_XML );

        /* 'x' prefix denotes hexadecimal number format;
        ** an uppercase 'X' is accepted only when not in XML mode
        */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            sscanf( name+3, "%x", &c );
        else
            sscanf( name+2, "%u", &c );

        return (uint) c;
    }

    /* Named entity: name = "&" followed by a name */
    if ( NULL != (np = lookup(name+1)) )
    {
        /* Only recognize entity name if version supports it.  */
        if ( np->versions & versions )
            return np->code;
    }

    return 0;   /* zero signifies unknown entity name */
}
#endif
357
358Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
359{
360    const entity* np;
361    assert( name && name[0] == '&' );
362    assert( code != NULL );
363    assert( versions != NULL );
364
365    /* numeric entitity: name = "&#" followed by number */
366    if ( name[1] == '#' )
367    {
368        uint c = 0;  /* zero on missing/bad number */
369
370        /* 'x' prefix denotes hexadecimal number format */
371        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
372            sscanf( name+3, "%x", &c );
373        else
374            sscanf( name+2, "%u", &c );
375
376        *code = c;
377        *versions = VERS_ALL;
378        return yes;
379    }
380
381    /* Named entity: name ="&" followed by a name */
382    if ( NULL != (np = lookup(name+1)) )
383    {
384        *code = np->code;
385        *versions = np->versions;
386        return yes;
387    }
388
389    *code = 0;
390    *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
391    return no;
392}
393
394
395ctmbstr TY_(EntityName)( uint ch, uint versions )
396{
397    ctmbstr entnam = NULL;
398    const entity *ep;
399
400    for ( ep = entities; ep->name != NULL; ++ep )
401    {
402        if ( ep->code == ch )
403        {
404            if ( ep->versions & versions )
405                entnam = ep->name;
406            break; /* Found code. Stop search. */
407        }
408    }
409    return entnam;
410}
411
412/*
413 * local variables:
414 * mode: c
415 * indent-tabs-mode: nil
416 * c-basic-offset: 4
417 * eval: (c-set-offset 'substatement-open 0)
418 * end:
419 */
420