1/* entities.c -- recognize HTML ISO entities 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: iccir $ 9 $Date: 2007/01/30 23:46:51 $ 10 $Revision: 1.3 $ 11 12 Entity handling can be static because there are no config or 13 document-specific values. Lookup table is 100% defined at 14 compile time. 15 16*/ 17 18#include <stdio.h> 19#include "entities.h" 20#include "tidy-int.h" 21#include "tmbstr.h" 22 23struct _entity; 24typedef struct _entity entity; 25 26struct _entity 27{ 28 ctmbstr name; 29 uint versions; 30 uint code; 31}; 32 33 34static const entity entities[] = 35{ 36 /* 37 ** Markup pre-defined character entities 38 */ 39 { "quot", VERS_ALL|VERS_XML, 34 }, 40 { "amp", VERS_ALL|VERS_XML, 38 }, 41 { "apos", VERS_FROM40|VERS_XML, 39 }, 42 { "lt", VERS_ALL|VERS_XML, 60 }, 43 { "gt", VERS_ALL|VERS_XML, 62 }, 44 45 /* 46 ** Latin-1 character entities 47 */ 48 { "nbsp", VERS_ALL, 160 }, 49 { "iexcl", VERS_ALL, 161 }, 50 { "cent", VERS_ALL, 162 }, 51 { "pound", VERS_ALL, 163 }, 52 { "curren", VERS_ALL, 164 }, 53 { "yen", VERS_ALL, 165 }, 54 { "brvbar", VERS_ALL, 166 }, 55 { "sect", VERS_ALL, 167 }, 56 { "uml", VERS_ALL, 168 }, 57 { "copy", VERS_ALL, 169 }, 58 { "ordf", VERS_ALL, 170 }, 59 { "laquo", VERS_ALL, 171 }, 60 { "not", VERS_ALL, 172 }, 61 { "shy", VERS_ALL, 173 }, 62 { "reg", VERS_ALL, 174 }, 63 { "macr", VERS_ALL, 175 }, 64 { "deg", VERS_ALL, 176 }, 65 { "plusmn", VERS_ALL, 177 }, 66 { "sup2", VERS_ALL, 178 }, 67 { "sup3", VERS_ALL, 179 }, 68 { "acute", VERS_ALL, 180 }, 69 { "micro", VERS_ALL, 181 }, 70 { "para", VERS_ALL, 182 }, 71 { "middot", VERS_ALL, 183 }, 72 { "cedil", VERS_ALL, 184 }, 73 { "sup1", VERS_ALL, 185 }, 74 { "ordm", VERS_ALL, 186 }, 75 { "raquo", VERS_ALL, 187 }, 76 { "frac14", VERS_ALL, 188 }, 77 { "frac12", VERS_ALL, 189 }, 78 { "frac34", VERS_ALL, 190 }, 79 { "iquest", VERS_ALL, 191 }, 80 { "Agrave", VERS_ALL, 192 }, 81 { "Aacute", VERS_ALL, 193 }, 82 { "Acirc", VERS_ALL, 194 }, 83 { "Atilde", VERS_ALL, 195 }, 84 { "Auml", VERS_ALL, 196 }, 85 { "Aring", VERS_ALL, 197 }, 86 { "AElig", VERS_ALL, 198 }, 87 { "Ccedil", VERS_ALL, 199 }, 88 { "Egrave", VERS_ALL, 200 }, 89 { "Eacute", VERS_ALL, 201 }, 90 { "Ecirc", VERS_ALL, 202 }, 91 { "Euml", VERS_ALL, 203 }, 92 { "Igrave", VERS_ALL, 204 }, 93 { "Iacute", VERS_ALL, 205 }, 94 { "Icirc", VERS_ALL, 206 }, 95 { "Iuml", VERS_ALL, 207 }, 96 { "ETH", VERS_ALL, 208 }, 97 { "Ntilde", VERS_ALL, 209 }, 98 { "Ograve", VERS_ALL, 210 }, 99 { "Oacute", VERS_ALL, 211 }, 100 { "Ocirc", VERS_ALL, 212 }, 101 { "Otilde", VERS_ALL, 213 }, 102 { "Ouml", VERS_ALL, 214 }, 103 { "times", VERS_ALL, 215 }, 104 { "Oslash", VERS_ALL, 216 }, 105 { "Ugrave", VERS_ALL, 217 }, 106 { "Uacute", VERS_ALL, 218 }, 107 { "Ucirc", VERS_ALL, 219 }, 108 { "Uuml", VERS_ALL, 220 }, 109 { "Yacute", VERS_ALL, 221 }, 110 { "THORN", VERS_ALL, 222 }, 111 { "szlig", VERS_ALL, 223 }, 112 { "agrave", VERS_ALL, 224 }, 113 { "aacute", VERS_ALL, 225 }, 114 { "acirc", VERS_ALL, 226 }, 115 { "atilde", VERS_ALL, 227 }, 116 { "auml", VERS_ALL, 228 }, 117 { "aring", VERS_ALL, 229 }, 118 { "aelig", VERS_ALL, 230 }, 119 { "ccedil", VERS_ALL, 231 }, 120 { "egrave", VERS_ALL, 232 }, 121 { "eacute", VERS_ALL, 233 }, 122 { "ecirc", VERS_ALL, 234 }, 123 { "euml", VERS_ALL, 235 }, 124 { "igrave", VERS_ALL, 236 }, 125 { "iacute", VERS_ALL, 237 }, 126 { "icirc", VERS_ALL, 238 }, 127 { "iuml", VERS_ALL, 239 }, 128 { "eth", VERS_ALL, 240 }, 129 { "ntilde", VERS_ALL, 241 }, 130 { "ograve", VERS_ALL, 242 }, 131 { "oacute", VERS_ALL, 243 }, 132 { "ocirc", VERS_ALL, 244 }, 133 { "otilde", VERS_ALL, 245 }, 134 { "ouml", VERS_ALL, 246 }, 135 { "divide", VERS_ALL, 247 }, 136 { "oslash", VERS_ALL, 248 }, 137 { "ugrave", VERS_ALL, 249 }, 138 { "uacute", VERS_ALL, 250 }, 139 { "ucirc", VERS_ALL, 251 }, 140 { "uuml", VERS_ALL, 252 }, 141 { "yacute", VERS_ALL, 253 }, 142 { "thorn", VERS_ALL, 254 }, 143 { "yuml", VERS_ALL, 255 }, 144 145 /* 146 ** Extended Entities defined in HTML 4: Symbols 147 */ 148 { "fnof", VERS_FROM40, 402 }, 149 { "Alpha", VERS_FROM40, 913 }, 150 { "Beta", VERS_FROM40, 914 }, 151 { "Gamma", VERS_FROM40, 915 }, 152 { "Delta", VERS_FROM40, 916 }, 153 { "Epsilon", VERS_FROM40, 917 }, 154 { "Zeta", VERS_FROM40, 918 }, 155 { "Eta", VERS_FROM40, 919 }, 156 { "Theta", VERS_FROM40, 920 }, 157 { "Iota", VERS_FROM40, 921 }, 158 { "Kappa", VERS_FROM40, 922 }, 159 { "Lambda", VERS_FROM40, 923 }, 160 { "Mu", VERS_FROM40, 924 }, 161 { "Nu", VERS_FROM40, 925 }, 162 { "Xi", VERS_FROM40, 926 }, 163 { "Omicron", VERS_FROM40, 927 }, 164 { "Pi", VERS_FROM40, 928 }, 165 { "Rho", VERS_FROM40, 929 }, 166 { "Sigma", VERS_FROM40, 931 }, 167 { "Tau", VERS_FROM40, 932 }, 168 { "Upsilon", VERS_FROM40, 933 }, 169 { "Phi", VERS_FROM40, 934 }, 170 { "Chi", VERS_FROM40, 935 }, 171 { "Psi", VERS_FROM40, 936 }, 172 { "Omega", VERS_FROM40, 937 }, 173 { "alpha", VERS_FROM40, 945 }, 174 { "beta", VERS_FROM40, 946 }, 175 { "gamma", VERS_FROM40, 947 }, 176 { "delta", VERS_FROM40, 948 }, 177 { "epsilon", VERS_FROM40, 949 }, 178 { "zeta", VERS_FROM40, 950 }, 179 { "eta", VERS_FROM40, 951 }, 180 { "theta", VERS_FROM40, 952 }, 181 { "iota", VERS_FROM40, 953 }, 182 { "kappa", VERS_FROM40, 954 }, 183 { "lambda", VERS_FROM40, 955 }, 184 { "mu", VERS_FROM40, 956 }, 185 { "nu", VERS_FROM40, 957 }, 186 { "xi", VERS_FROM40, 958 }, 187 { "omicron", VERS_FROM40, 959 }, 188 { "pi", VERS_FROM40, 960 }, 189 { "rho", VERS_FROM40, 961 }, 190 { "sigmaf", VERS_FROM40, 962 }, 191 { "sigma", VERS_FROM40, 963 }, 192 { "tau", VERS_FROM40, 964 }, 193 { "upsilon", VERS_FROM40, 965 }, 194 { "phi", VERS_FROM40, 966 }, 195 { "chi", VERS_FROM40, 967 }, 196 { "psi", VERS_FROM40, 968 }, 197 { "omega", VERS_FROM40, 969 }, 198 { "thetasym", VERS_FROM40, 977 }, 199 { "upsih", VERS_FROM40, 978 }, 200 { "piv", VERS_FROM40, 982 }, 201 { "bull", VERS_FROM40, 8226 }, 202 { "hellip", VERS_FROM40, 8230 }, 203 { "prime", VERS_FROM40, 8242 }, 204 { "Prime", VERS_FROM40, 8243 }, 205 { "oline", VERS_FROM40, 8254 }, 206 { "frasl", VERS_FROM40, 8260 }, 207 { "weierp", VERS_FROM40, 8472 }, 208 { "image", VERS_FROM40, 8465 }, 209 { "real", VERS_FROM40, 8476 }, 210 { "trade", VERS_FROM40, 8482 }, 211 { "alefsym", VERS_FROM40, 8501 }, 212 { "larr", VERS_FROM40, 8592 }, 213 { "uarr", VERS_FROM40, 8593 }, 214 { "rarr", VERS_FROM40, 8594 }, 215 { "darr", VERS_FROM40, 8595 }, 216 { "harr", VERS_FROM40, 8596 }, 217 { "crarr", VERS_FROM40, 8629 }, 218 { "lArr", VERS_FROM40, 8656 }, 219 { "uArr", VERS_FROM40, 8657 }, 220 { "rArr", VERS_FROM40, 8658 }, 221 { "dArr", VERS_FROM40, 8659 }, 222 { "hArr", VERS_FROM40, 8660 }, 223 { "forall", VERS_FROM40, 8704 }, 224 { "part", VERS_FROM40, 8706 }, 225 { "exist", VERS_FROM40, 8707 }, 226 { "empty", VERS_FROM40, 8709 }, 227 { "nabla", VERS_FROM40, 8711 }, 228 { "isin", VERS_FROM40, 8712 }, 229 { "notin", VERS_FROM40, 8713 }, 230 { "ni", VERS_FROM40, 8715 }, 231 { "prod", VERS_FROM40, 8719 }, 232 { "sum", VERS_FROM40, 8721 }, 233 { "minus", VERS_FROM40, 8722 }, 234 { "lowast", VERS_FROM40, 8727 }, 235 { "radic", VERS_FROM40, 8730 }, 236 { "prop", VERS_FROM40, 8733 }, 237 { "infin", VERS_FROM40, 8734 }, 238 { "ang", VERS_FROM40, 8736 }, 239 { "and", VERS_FROM40, 8743 }, 240 { "or", VERS_FROM40, 8744 }, 241 { "cap", VERS_FROM40, 8745 }, 242 { "cup", VERS_FROM40, 8746 }, 243 { "int", VERS_FROM40, 8747 }, 244 { "there4", VERS_FROM40, 8756 }, 245 { "sim", VERS_FROM40, 8764 }, 246 { "cong", VERS_FROM40, 8773 }, 247 { "asymp", VERS_FROM40, 8776 }, 248 { "ne", VERS_FROM40, 8800 }, 249 { "equiv", VERS_FROM40, 8801 }, 250 { "le", VERS_FROM40, 8804 }, 251 { "ge", VERS_FROM40, 8805 }, 252 { "sub", VERS_FROM40, 8834 }, 253 { "sup", VERS_FROM40, 8835 }, 254 { "nsub", VERS_FROM40, 8836 }, 255 { "sube", VERS_FROM40, 8838 }, 256 { "supe", VERS_FROM40, 8839 }, 257 { "oplus", VERS_FROM40, 8853 }, 258 { "otimes", VERS_FROM40, 8855 }, 259 { "perp", VERS_FROM40, 8869 }, 260 { "sdot", VERS_FROM40, 8901 }, 261 { "lceil", VERS_FROM40, 8968 }, 262 { "rceil", VERS_FROM40, 8969 }, 263 { "lfloor", VERS_FROM40, 8970 }, 264 { "rfloor", VERS_FROM40, 8971 }, 265 { "lang", VERS_FROM40, 9001 }, 266 { "rang", VERS_FROM40, 9002 }, 267 { "loz", VERS_FROM40, 9674 }, 268 { "spades", VERS_FROM40, 9824 }, 269 { "clubs", VERS_FROM40, 9827 }, 270 { "hearts", VERS_FROM40, 9829 }, 271 { "diams", VERS_FROM40, 9830 }, 272 273 /* 274 ** Extended Entities defined in HTML 4: Special (less Markup at top) 275 */ 276 { "OElig", VERS_FROM40, 338 }, 277 { "oelig", VERS_FROM40, 339 }, 278 { "Scaron", VERS_FROM40, 352 }, 279 { "scaron", VERS_FROM40, 353 }, 280 { "Yuml", VERS_FROM40, 376 }, 281 { "circ", VERS_FROM40, 710 }, 282 { "tilde", VERS_FROM40, 732 }, 283 { "ensp", VERS_FROM40, 8194 }, 284 { "emsp", VERS_FROM40, 8195 }, 285 { "thinsp", VERS_FROM40, 8201 }, 286 { "zwnj", VERS_FROM40, 8204 }, 287 { "zwj", VERS_FROM40, 8205 }, 288 { "lrm", VERS_FROM40, 8206 }, 289 { "rlm", VERS_FROM40, 8207 }, 290 { "ndash", VERS_FROM40, 8211 }, 291 { "mdash", VERS_FROM40, 8212 }, 292 { "lsquo", VERS_FROM40, 8216 }, 293 { "rsquo", VERS_FROM40, 8217 }, 294 { "sbquo", VERS_FROM40, 8218 }, 295 { "ldquo", VERS_FROM40, 8220 }, 296 { "rdquo", VERS_FROM40, 8221 }, 297 { "bdquo", VERS_FROM40, 8222 }, 298 { "dagger", VERS_FROM40, 8224 }, 299 { "Dagger", VERS_FROM40, 8225 }, 300 { "permil", VERS_FROM40, 8240 }, 301 { "lsaquo", VERS_FROM40, 8249 }, 302 { "rsaquo", VERS_FROM40, 8250 }, 303 { "euro", VERS_FROM40, 8364 }, 304 { NULL, 0, 0 } 305}; 306 307 308/* Pure static implementation. Trades off lookup speed 309** for faster setup time (well, none actually). 310** Optimization of comparing 1st character buys enough 311** speed that hash doesn't improve things without > 500 312** items in list. 313*/ 314static const entity* lookup( ctmbstr s ) 315{ 316 tmbchar ch = (tmbchar)( s ? *s : 0 ); 317 const entity *np; 318 for ( np = entities; ch && np && np->name; ++np ) 319 if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 ) 320 return np; 321 return NULL; 322} 323 324#if 0 325/* entity starting with "&" returns zero on error */ 326uint EntityCode( ctmbstr name, uint versions ) 327{ 328 const entity* np; 329 assert( name && name[0] == '&' ); 330 331 /* numeric entitity: name = "&#" followed by number */ 332 if ( name[1] == '#' ) 333 { 334 uint c = 0; /* zero on missing/bad number */ 335 Bool isXml = ( (versions & VERS_XML) == VERS_XML ); 336 337 /* 'x' prefix denotes hexadecimal number format */ 338 if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) 339 sscanf( name+3, "%x", &c ); 340 else 341 sscanf( name+2, "%u", &c ); 342 343 return (uint) c; 344 } 345 346 /* Named entity: name ="&" followed by a name */ 347 if ( NULL != (np = lookup(name+1)) ) 348 { 349 /* Only recognize entity name if version supports it. */ 350 if ( np->versions & versions ) 351 return np->code; 352 } 353 354 return 0; /* zero signifies unknown entity name */ 355} 356#endif 357 358Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions ) 359{ 360 const entity* np; 361 assert( name && name[0] == '&' ); 362 assert( code != NULL ); 363 assert( versions != NULL ); 364 365 /* numeric entitity: name = "&#" followed by number */ 366 if ( name[1] == '#' ) 367 { 368 uint c = 0; /* zero on missing/bad number */ 369 370 /* 'x' prefix denotes hexadecimal number format */ 371 if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) 372 sscanf( name+3, "%x", &c ); 373 else 374 sscanf( name+2, "%u", &c ); 375 376 *code = c; 377 *versions = VERS_ALL; 378 return yes; 379 } 380 381 /* Named entity: name ="&" followed by a name */ 382 if ( NULL != (np = lookup(name+1)) ) 383 { 384 *code = np->code; 385 *versions = np->versions; 386 return yes; 387 } 388 389 *code = 0; 390 *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY ); 391 return no; 392} 393 394 395ctmbstr TY_(EntityName)( uint ch, uint versions ) 396{ 397 ctmbstr entnam = NULL; 398 const entity *ep; 399 400 for ( ep = entities; ep->name != NULL; ++ep ) 401 { 402 if ( ep->code == ch ) 403 { 404 if ( ep->versions & versions ) 405 entnam = ep->name; 406 break; /* Found code. Stop search. */ 407 } 408 } 409 return entnam; 410} 411 412/* 413 * local variables: 414 * mode: c 415 * indent-tabs-mode: nil 416 * c-basic-offset: 4 417 * eval: (c-set-offset 'substatement-open 0) 418 * end: 419 */ 420