1/* Licensed to the Apache Software Foundation (ASF) under one or more 2 * contributor license agreements. See the NOTICE file distributed with 3 * this work for additional information regarding copyright ownership. 4 * The ASF licenses this file to You under the Apache License, Version 2.0 5 * (the "License"); you may not use this file except in compliance with 6 * the License. You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "apr.h" 18#include "apr_file_io.h" 19#include "apr_strings.h" 20#include "apr_lib.h" 21 22#define APR_WANT_STRFUNC 23#include "apr_want.h" 24 25#define WANT_BASENAME_MATCH 26 27#include "httpd.h" 28#include "http_core.h" 29#include "http_config.h" 30#include "http_request.h" 31#include "http_log.h" 32 33/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996 34 * 35 * This module is transparent, and simple. It attempts to correct 36 * misspellings of URLs that users might have entered, namely by checking 37 * capitalizations. If it finds a match, it sends a redirect. 38 * 39 * Sep-1999 Hugo Haas <hugo@w3.org> 40 * o Added a CheckCaseOnly option to check only miscapitalized words. 41 * 42 * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De> 43 * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in 44 * speling_module). 45 * o Integrated tcsh's "spelling correction" routine which allows one 46 * misspelling (character insertion/omission/typo/transposition). 47 * Rewrote it to ignore case as well. This ought to catch the majority 48 * of misspelled requests. 49 * o Commented out the second pass where files' suffixes are stripped. 50 * Given the better hit rate of the first pass, this rather ugly 51 * (request index.html, receive index.db ?!?!) solution can be 52 * omitted. 53 * o wrote a "kind of" html page for mod_speling 54 * 55 * Activate it with "CheckSpelling On" 56 */ 57 58module AP_MODULE_DECLARE_DATA speling_module; 59 60typedef struct { 61 int enabled; 62 int case_only; 63} spconfig; 64 65/* 66 * Create a configuration specific to this module for a server or directory 67 * location, and fill it with the default settings. 68 * 69 * The API says that in the absence of a merge function, the record for the 70 * closest ancestor is used exclusively. That's what we want, so we don't 71 * bother to have such a function. 72 */ 73 74static void *mkconfig(apr_pool_t *p) 75{ 76 spconfig *cfg = apr_pcalloc(p, sizeof(spconfig)); 77 78 cfg->enabled = 0; 79 cfg->case_only = 0; 80 return cfg; 81} 82 83/* 84 * Respond to a callback to create configuration record for a server or 85 * vhost environment. 86 */ 87static void *create_mconfig_for_server(apr_pool_t *p, server_rec *s) 88{ 89 return mkconfig(p); 90} 91 92/* 93 * Respond to a callback to create a config record for a specific directory. 94 */ 95static void *create_mconfig_for_directory(apr_pool_t *p, char *dir) 96{ 97 return mkconfig(p); 98} 99 100/* 101 * Define the directives specific to this module. This structure is referenced 102 * later by the 'module' structure. 103 */ 104static const command_rec speling_cmds[] = 105{ 106 AP_INIT_FLAG("CheckSpelling", ap_set_flag_slot, 107 (void*)APR_OFFSETOF(spconfig, enabled), OR_OPTIONS, 108 "whether or not to fix miscapitalized/misspelled requests"), 109 AP_INIT_FLAG("CheckCaseOnly", ap_set_flag_slot, 110 (void*)APR_OFFSETOF(spconfig, case_only), OR_OPTIONS, 111 "whether or not to fix only miscapitalized requests"), 112 { NULL } 113}; 114 115typedef enum { 116 SP_IDENTICAL = 0, 117 SP_MISCAPITALIZED = 1, 118 SP_TRANSPOSITION = 2, 119 SP_MISSINGCHAR = 3, 120 SP_EXTRACHAR = 4, 121 SP_SIMPLETYPO = 5, 122 SP_VERYDIFFERENT = 6 123} sp_reason; 124 125static const char *sp_reason_str[] = 126{ 127 "identical", 128 "miscapitalized", 129 "transposed characters", 130 "character missing", 131 "extra character", 132 "mistyped character", 133 "common basename", 134}; 135 136typedef struct { 137 const char *name; 138 sp_reason quality; 139} misspelled_file; 140 141/* 142 * spdist() is taken from Kernighan & Pike, 143 * _The_UNIX_Programming_Environment_ 144 * and adapted somewhat to correspond better to psychological reality. 145 * (Note the changes to the return values) 146 * 147 * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4), 148 * page 363, the correct order for this is: 149 * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION 150 * thus, it was exactly backwards in the old version. -- PWP 151 * 152 * This routine was taken out of tcsh's spelling correction code 153 * (tcsh-6.07.04) and re-converted to apache data types ("char" type 154 * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case 155 * during comparisons, so is a "approximate strcasecmp()". 156 * NOTE that is still allows only _one_ real "typo", 157 * it does NOT try to correct multiple errors. 158 */ 159 160static sp_reason spdist(const char *s, const char *t) 161{ 162 for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) { 163 if (*t == '\0') { 164 return SP_MISCAPITALIZED; /* exact match (sans case) */ 165 } 166 } 167 if (*s) { 168 if (*t) { 169 if (s[1] && t[1] && apr_tolower(*s) == apr_tolower(t[1]) 170 && apr_tolower(*t) == apr_tolower(s[1]) 171 && strcasecmp(s + 2, t + 2) == 0) { 172 return SP_TRANSPOSITION; /* transposition */ 173 } 174 if (strcasecmp(s + 1, t + 1) == 0) { 175 return SP_SIMPLETYPO; /* 1 char mismatch */ 176 } 177 } 178 if (strcasecmp(s + 1, t) == 0) { 179 return SP_EXTRACHAR; /* extra character */ 180 } 181 } 182 if (*t && strcasecmp(s, t + 1) == 0) { 183 return SP_MISSINGCHAR; /* missing character */ 184 } 185 return SP_VERYDIFFERENT; /* distance too large to fix. */ 186} 187 188static int sort_by_quality(const void *left, const void *rite) 189{ 190 return (int) (((misspelled_file *) left)->quality) 191 - (int) (((misspelled_file *) rite)->quality); 192} 193 194static int check_speling(request_rec *r) 195{ 196 spconfig *cfg; 197 char *good, *bad, *postgood, *url; 198 apr_finfo_t dirent; 199 int filoc, dotloc, urlen, pglen; 200 apr_array_header_t *candidates = NULL; 201 apr_dir_t *dir; 202 203 cfg = ap_get_module_config(r->per_dir_config, &speling_module); 204 if (!cfg->enabled) { 205 return DECLINED; 206 } 207 208 /* We only want to worry about GETs */ 209 if (r->method_number != M_GET) { 210 return DECLINED; 211 } 212 213 /* We've already got a file of some kind or another */ 214 if (r->finfo.filetype != 0) { 215 return DECLINED; 216 } 217 218 /* Not a file request */ 219 if (r->proxyreq || !r->filename) { 220 return DECLINED; 221 } 222 223 /* This is a sub request - don't mess with it */ 224 if (r->main) { 225 return DECLINED; 226 } 227 228 /* 229 * The request should end up looking like this: 230 * r->uri: /correct-url/mispelling/more 231 * r->filename: /correct-file/mispelling r->path_info: /more 232 * 233 * So we do this in steps. First break r->filename into two pieces 234 */ 235 236 filoc = ap_rind(r->filename, '/'); 237 /* 238 * Don't do anything if the request doesn't contain a slash, or 239 * requests "/" 240 */ 241 if (filoc == -1 || strcmp(r->uri, "/") == 0) { 242 return DECLINED; 243 } 244 245 /* good = /correct-file */ 246 good = apr_pstrndup(r->pool, r->filename, filoc); 247 /* bad = mispelling */ 248 bad = apr_pstrdup(r->pool, r->filename + filoc + 1); 249 /* postgood = mispelling/more */ 250 postgood = apr_pstrcat(r->pool, bad, r->path_info, NULL); 251 252 urlen = strlen(r->uri); 253 pglen = strlen(postgood); 254 255 /* Check to see if the URL pieces add up */ 256 if (strcmp(postgood, r->uri + (urlen - pglen))) { 257 return DECLINED; 258 } 259 260 /* url = /correct-url */ 261 url = apr_pstrndup(r->pool, r->uri, (urlen - pglen)); 262 263 /* Now open the directory and do ourselves a check... */ 264 if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) { 265 /* Oops, not a directory... */ 266 return DECLINED; 267 } 268 269 candidates = apr_array_make(r->pool, 2, sizeof(misspelled_file)); 270 271 dotloc = ap_ind(bad, '.'); 272 if (dotloc == -1) { 273 dotloc = strlen(bad); 274 } 275 276 while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) { 277 sp_reason q; 278 279 /* 280 * If we end up with a "fixed" URL which is identical to the 281 * requested one, we must have found a broken symlink or some such. 282 * Do _not_ try to redirect this, it causes a loop! 283 */ 284 if (strcmp(bad, dirent.name) == 0) { 285 apr_dir_close(dir); 286 return OK; 287 } 288 289 /* 290 * miscapitalization errors are checked first (like, e.g., lower case 291 * file, upper case request) 292 */ 293 else if (strcasecmp(bad, dirent.name) == 0) { 294 misspelled_file *sp_new; 295 296 sp_new = (misspelled_file *) apr_array_push(candidates); 297 sp_new->name = apr_pstrdup(r->pool, dirent.name); 298 sp_new->quality = SP_MISCAPITALIZED; 299 } 300 301 /* 302 * simple typing errors are checked next (like, e.g., 303 * missing/extra/transposed char) 304 */ 305 else if ((cfg->case_only == 0) 306 && ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT)) { 307 misspelled_file *sp_new; 308 309 sp_new = (misspelled_file *) apr_array_push(candidates); 310 sp_new->name = apr_pstrdup(r->pool, dirent.name); 311 sp_new->quality = q; 312 } 313 314 /* 315 * The spdist() should have found the majority of the misspelled 316 * requests. It is of questionable use to continue looking for 317 * files with the same base name, but potentially of totally wrong 318 * type (index.html <-> index.db). 319 * I would propose to not set the WANT_BASENAME_MATCH define. 320 * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De> 321 * 322 * However, Alexei replied giving some reasons to add it anyway: 323 * > Oh, by the way, I remembered why having the 324 * > extension-stripping-and-matching stuff is a good idea: 325 * > 326 * > If you're using MultiViews, and have a file named foobar.html, 327 * > which you refer to as "foobar", and someone tried to access 328 * > "Foobar", mod_speling won't find it, because it won't find 329 * > anything matching that spelling. With the extension-munging, 330 * > it would locate "foobar.html". Not perfect, but I ran into 331 * > that problem when I first wrote the module. 332 */ 333 else { 334#ifdef WANT_BASENAME_MATCH 335 /* 336 * Okay... we didn't find anything. Now we take out the hard-core 337 * power tools. There are several cases here. Someone might have 338 * entered a wrong extension (.htm instead of .html or vice 339 * versa) or the document could be negotiated. At any rate, now 340 * we just compare stuff before the first dot. If it matches, we 341 * figure we got us a match. This can result in wrong things if 342 * there are files of different content types but the same prefix 343 * (e.g. foo.gif and foo.html) This code will pick the first one 344 * it finds. Better than a Not Found, though. 345 */ 346 int entloc = ap_ind(dirent.name, '.'); 347 if (entloc == -1) { 348 entloc = strlen(dirent.name); 349 } 350 351 if ((dotloc == entloc) 352 && !strncasecmp(bad, dirent.name, dotloc)) { 353 misspelled_file *sp_new; 354 355 sp_new = (misspelled_file *) apr_array_push(candidates); 356 sp_new->name = apr_pstrdup(r->pool, dirent.name); 357 sp_new->quality = SP_VERYDIFFERENT; 358 } 359#endif 360 } 361 } 362 apr_dir_close(dir); 363 364 if (candidates->nelts != 0) { 365 /* Wow... we found us a mispelling. Construct a fixed url */ 366 char *nuri; 367 const char *ref; 368 misspelled_file *variant = (misspelled_file *) candidates->elts; 369 int i; 370 371 ref = apr_table_get(r->headers_in, "Referer"); 372 373 qsort((void *) candidates->elts, candidates->nelts, 374 sizeof(misspelled_file), sort_by_quality); 375 376 /* 377 * Conditions for immediate redirection: 378 * a) the first candidate was not found by stripping the suffix 379 * AND b) there exists only one candidate OR the best match is not 380 * ambiguous 381 * then return a redirection right away. 382 */ 383 if (variant[0].quality != SP_VERYDIFFERENT 384 && (candidates->nelts == 1 385 || variant[0].quality != variant[1].quality)) { 386 387 nuri = ap_escape_uri(r->pool, apr_pstrcat(r->pool, url, 388 variant[0].name, 389 r->path_info, NULL)); 390 if (r->parsed_uri.query) 391 nuri = apr_pstrcat(r->pool, nuri, "?", r->parsed_uri.query, NULL); 392 393 apr_table_setn(r->headers_out, "Location", 394 ap_construct_url(r->pool, nuri, r)); 395 396 ap_log_rerror(APLOG_MARK, APLOG_INFO, APR_SUCCESS, 397 r, 398 ref ? "Fixed spelling: %s to %s from %s" 399 : "Fixed spelling: %s to %s", 400 r->uri, nuri, ref); 401 402 return HTTP_MOVED_PERMANENTLY; 403 } 404 /* 405 * Otherwise, a "[300] Multiple Choices" list with the variants is 406 * returned. 407 */ 408 else { 409 apr_pool_t *p; 410 apr_table_t *notes; 411 apr_pool_t *sub_pool; 412 apr_array_header_t *t; 413 apr_array_header_t *v; 414 415 416 if (r->main == NULL) { 417 p = r->pool; 418 notes = r->notes; 419 } 420 else { 421 p = r->main->pool; 422 notes = r->main->notes; 423 } 424 425 if (apr_pool_create(&sub_pool, p) != APR_SUCCESS) 426 return DECLINED; 427 428 t = apr_array_make(sub_pool, candidates->nelts * 8 + 8, 429 sizeof(char *)); 430 v = apr_array_make(sub_pool, candidates->nelts * 5, 431 sizeof(char *)); 432 433 /* Generate the response text. */ 434 435 *(const char **)apr_array_push(t) = 436 "The document name you requested (<code>"; 437 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri); 438 *(const char **)apr_array_push(t) = 439 "</code>) could not be found on this server.\n" 440 "However, we found documents with names similar " 441 "to the one you requested.<p>" 442 "Available documents:\n<ul>\n"; 443 444 for (i = 0; i < candidates->nelts; ++i) { 445 char *vuri; 446 const char *reason; 447 448 reason = sp_reason_str[(int) (variant[i].quality)]; 449 /* The format isn't very neat... */ 450 vuri = apr_pstrcat(sub_pool, url, variant[i].name, r->path_info, 451 (r->parsed_uri.query != NULL) ? "?" : "", 452 (r->parsed_uri.query != NULL) 453 ? r->parsed_uri.query : "", 454 NULL); 455 *(const char **)apr_array_push(v) = "\""; 456 *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri); 457 *(const char **)apr_array_push(v) = "\";\""; 458 *(const char **)apr_array_push(v) = reason; 459 *(const char **)apr_array_push(v) = "\""; 460 461 *(const char **)apr_array_push(t) = "<li><a href=\""; 462 *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri); 463 *(const char **)apr_array_push(t) = "\">"; 464 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri); 465 *(const char **)apr_array_push(t) = "</a> ("; 466 *(const char **)apr_array_push(t) = reason; 467 *(const char **)apr_array_push(t) = ")\n"; 468 469 /* 470 * when we have printed the "close matches" and there are 471 * more "distant matches" (matched by stripping the suffix), 472 * then we insert an additional separator text to suggest 473 * that the user LOOK CLOSELY whether these are really the 474 * files she wanted. 475 */ 476 if (i > 0 && i < candidates->nelts - 1 477 && variant[i].quality != SP_VERYDIFFERENT 478 && variant[i + 1].quality == SP_VERYDIFFERENT) { 479 *(const char **)apr_array_push(t) = 480 "</ul>\nFurthermore, the following related " 481 "documents were found:\n<ul>\n"; 482 } 483 } 484 *(const char **)apr_array_push(t) = "</ul>\n"; 485 486 /* If we know there was a referring page, add a note: */ 487 if (ref != NULL) { 488 *(const char **)apr_array_push(t) = 489 "Please consider informing the owner of the " 490 "<a href=\""; 491 *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, ref); 492 *(const char **)apr_array_push(t) = "\">referring page</a> " 493 "about the broken link.\n"; 494 } 495 496 497 /* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */ 498 apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0)); 499 500 apr_table_mergen(r->subprocess_env, "VARIANTS", 501 apr_array_pstrcat(p, v, ',')); 502 503 apr_pool_destroy(sub_pool); 504 505 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, 506 ref ? "Spelling fix: %s: %d candidates from %s" 507 : "Spelling fix: %s: %d candidates", 508 r->uri, candidates->nelts, ref); 509 510 return HTTP_MULTIPLE_CHOICES; 511 } 512 } 513 514 return OK; 515} 516 517static void register_hooks(apr_pool_t *p) 518{ 519 ap_hook_fixups(check_speling,NULL,NULL,APR_HOOK_LAST); 520} 521 522module AP_MODULE_DECLARE_DATA speling_module = 523{ 524 STANDARD20_MODULE_STUFF, 525 create_mconfig_for_directory, /* create per-dir config */ 526 NULL, /* merge per-dir config */ 527 create_mconfig_for_server, /* server config */ 528 NULL, /* merge server config */ 529 speling_cmds, /* command apr_table_t */ 530 register_hooks /* register hooks */ 531}; 532