1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "apr.h"
18#include "apr_file_io.h"
19#include "apr_strings.h"
20#include "apr_lib.h"
21
22#define APR_WANT_STRFUNC
23#include "apr_want.h"
24
25#define WANT_BASENAME_MATCH
26
27#include "httpd.h"
28#include "http_core.h"
29#include "http_config.h"
30#include "http_request.h"
31#include "http_log.h"
32
33/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996
34 *
35 * This module is transparent, and simple. It attempts to correct
36 * misspellings of URLs that users might have entered, namely by checking
37 * capitalizations. If it finds a match, it sends a redirect.
38 *
39 * Sep-1999 Hugo Haas <hugo@w3.org>
40 * o Added a CheckCaseOnly option to check only miscapitalized words.
41 *
42 * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
43 * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in
44 *   speling_module).
45 * o Integrated tcsh's "spelling correction" routine which allows one
46 *   misspelling (character insertion/omission/typo/transposition).
47 *   Rewrote it to ignore case as well. This ought to catch the majority
48 *   of misspelled requests.
49 * o Commented out the second pass where files' suffixes are stripped.
50 *   Given the better hit rate of the first pass, this rather ugly
51 *   (request index.html, receive index.db ?!?!) solution can be
52 *   omitted.
53 * o wrote a "kind of" html page for mod_speling
54 *
55 * Activate it with "CheckSpelling On"
56 */
57
58module AP_MODULE_DECLARE_DATA speling_module;
59
/* Configuration record for this module (one per server / directory). */
typedef struct {
    int enabled;    /* slot written by the "CheckSpelling" flag directive */
    int case_only;  /* slot written by the "CheckCaseOnly" flag directive */
} spconfig;
64
65/*
66 * Create a configuration specific to this module for a server or directory
67 * location, and fill it with the default settings.
68 *
69 * The API says that in the absence of a merge function, the record for the
70 * closest ancestor is used exclusively.  That's what we want, so we don't
71 * bother to have such a function.
72 */
73
74static void *mkconfig(apr_pool_t *p)
75{
76    spconfig *cfg = apr_pcalloc(p, sizeof(spconfig));
77
78    cfg->enabled = 0;
79    cfg->case_only = 0;
80    return cfg;
81}
82
83/*
84 * Respond to a callback to create configuration record for a server or
85 * vhost environment.
86 */
87static void *create_mconfig_for_server(apr_pool_t *p, server_rec *s)
88{
89    return mkconfig(p);
90}
91
92/*
93 * Respond to a callback to create a config record for a specific directory.
94 */
95static void *create_mconfig_for_directory(apr_pool_t *p, char *dir)
96{
97    return mkconfig(p);
98}
99
100/*
101 * Define the directives specific to this module.  This structure is referenced
102 * later by the 'module' structure.
103 */
104static const command_rec speling_cmds[] =
105{
106    AP_INIT_FLAG("CheckSpelling", ap_set_flag_slot,
107                  (void*)APR_OFFSETOF(spconfig, enabled), OR_OPTIONS,
108                 "whether or not to fix miscapitalized/misspelled requests"),
109    AP_INIT_FLAG("CheckCaseOnly", ap_set_flag_slot,
110                  (void*)APR_OFFSETOF(spconfig, case_only), OR_OPTIONS,
111                 "whether or not to fix only miscapitalized requests"),
112    { NULL }
113};
114
/*
 * Why a directory entry was accepted as a candidate.  Smaller values are
 * better matches (candidates are sorted ascending by this value), and the
 * numeric value is also used as the index into sp_reason_str[].
 */
typedef enum {
    SP_IDENTICAL      = 0,
    SP_MISCAPITALIZED = 1,  /* equal when case is ignored */
    SP_TRANSPOSITION  = 2,  /* two adjacent characters swapped */
    SP_MISSINGCHAR    = 3,  /* request lacks one character */
    SP_EXTRACHAR      = 4,  /* request has one character too many */
    SP_SIMPLETYPO     = 5,  /* exactly one character differs */
    SP_VERYDIFFERENT  = 6   /* too far apart for spdist(); also used to
                             * tag matches on the basename alone */
} sp_reason;
124
/*
 * Human-readable text for each sp_reason value, indexed by the enum's
 * numeric value.  Used in the "Multiple Choices" listing and in the
 * VARIANTS environment entry.
 */
static const char *sp_reason_str[] =
{
    "identical",                /* SP_IDENTICAL */
    "miscapitalized",           /* SP_MISCAPITALIZED */
    "transposed characters",    /* SP_TRANSPOSITION */
    "character missing",        /* SP_MISSINGCHAR */
    "extra character",          /* SP_EXTRACHAR */
    "mistyped character",       /* SP_SIMPLETYPO */
    "common basename",          /* SP_VERYDIFFERENT */
};
135
136typedef struct {
137    const char *name;
138    sp_reason quality;
139} misspelled_file;
140
141/*
142 * spdist() is taken from Kernighan & Pike,
143 *  _The_UNIX_Programming_Environment_
144 * and adapted somewhat to correspond better to psychological reality.
145 * (Note the changes to the return values)
146 *
147 * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
148 * page 363, the correct order for this is:
149 * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
150 * thus, it was exactly backwards in the old version. -- PWP
151 *
152 * This routine was taken out of tcsh's spelling correction code
153 * (tcsh-6.07.04) and re-converted to apache data types ("char" type
154 * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
155 * during comparisons, so is a "approximate strcasecmp()".
156 * NOTE that is still allows only _one_ real "typo",
157 * it does NOT try to correct multiple errors.
158 */
159
160static sp_reason spdist(const char *s, const char *t)
161{
162    for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) {
163        if (*t == '\0') {
164            return SP_MISCAPITALIZED;   /* exact match (sans case) */
165        }
166    }
167    if (*s) {
168        if (*t) {
169            if (s[1] && t[1] && apr_tolower(*s) == apr_tolower(t[1])
170                && apr_tolower(*t) == apr_tolower(s[1])
171                && strcasecmp(s + 2, t + 2) == 0) {
172                return SP_TRANSPOSITION;        /* transposition */
173            }
174            if (strcasecmp(s + 1, t + 1) == 0) {
175                return SP_SIMPLETYPO;   /* 1 char mismatch */
176            }
177        }
178        if (strcasecmp(s + 1, t) == 0) {
179            return SP_EXTRACHAR;        /* extra character */
180        }
181    }
182    if (*t && strcasecmp(s, t + 1) == 0) {
183        return SP_MISSINGCHAR;  /* missing character */
184    }
185    return SP_VERYDIFFERENT;    /* distance too large to fix. */
186}
187
188static int sort_by_quality(const void *left, const void *rite)
189{
190    return (int) (((misspelled_file *) left)->quality)
191        - (int) (((misspelled_file *) rite)->quality);
192}
193
194static int check_speling(request_rec *r)
195{
196    spconfig *cfg;
197    char *good, *bad, *postgood, *url;
198    apr_finfo_t dirent;
199    int filoc, dotloc, urlen, pglen;
200    apr_array_header_t *candidates = NULL;
201    apr_dir_t          *dir;
202
203    cfg = ap_get_module_config(r->per_dir_config, &speling_module);
204    if (!cfg->enabled) {
205        return DECLINED;
206    }
207
208    /* We only want to worry about GETs */
209    if (r->method_number != M_GET) {
210        return DECLINED;
211    }
212
213    /* We've already got a file of some kind or another */
214    if (r->finfo.filetype != 0) {
215        return DECLINED;
216    }
217
218    /* Not a file request */
219    if (r->proxyreq || !r->filename) {
220        return DECLINED;
221    }
222
223    /* This is a sub request - don't mess with it */
224    if (r->main) {
225        return DECLINED;
226    }
227
228    /*
229     * The request should end up looking like this:
230     * r->uri: /correct-url/mispelling/more
231     * r->filename: /correct-file/mispelling r->path_info: /more
232     *
233     * So we do this in steps. First break r->filename into two pieces
234     */
235
236    filoc = ap_rind(r->filename, '/');
237    /*
238     * Don't do anything if the request doesn't contain a slash, or
239     * requests "/"
240     */
241    if (filoc == -1 || strcmp(r->uri, "/") == 0) {
242        return DECLINED;
243    }
244
245    /* good = /correct-file */
246    good = apr_pstrndup(r->pool, r->filename, filoc);
247    /* bad = mispelling */
248    bad = apr_pstrdup(r->pool, r->filename + filoc + 1);
249    /* postgood = mispelling/more */
250    postgood = apr_pstrcat(r->pool, bad, r->path_info, NULL);
251
252    urlen = strlen(r->uri);
253    pglen = strlen(postgood);
254
255    /* Check to see if the URL pieces add up */
256    if (strcmp(postgood, r->uri + (urlen - pglen))) {
257        return DECLINED;
258    }
259
260    /* url = /correct-url */
261    url = apr_pstrndup(r->pool, r->uri, (urlen - pglen));
262
263    /* Now open the directory and do ourselves a check... */
264    if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) {
265        /* Oops, not a directory... */
266        return DECLINED;
267    }
268
269    candidates = apr_array_make(r->pool, 2, sizeof(misspelled_file));
270
271    dotloc = ap_ind(bad, '.');
272    if (dotloc == -1) {
273        dotloc = strlen(bad);
274    }
275
276    while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) {
277        sp_reason q;
278
279        /*
280         * If we end up with a "fixed" URL which is identical to the
281         * requested one, we must have found a broken symlink or some such.
282         * Do _not_ try to redirect this, it causes a loop!
283         */
284        if (strcmp(bad, dirent.name) == 0) {
285            apr_dir_close(dir);
286            return OK;
287        }
288
289        /*
290         * miscapitalization errors are checked first (like, e.g., lower case
291         * file, upper case request)
292         */
293        else if (strcasecmp(bad, dirent.name) == 0) {
294            misspelled_file *sp_new;
295
296            sp_new = (misspelled_file *) apr_array_push(candidates);
297            sp_new->name = apr_pstrdup(r->pool, dirent.name);
298            sp_new->quality = SP_MISCAPITALIZED;
299        }
300
301        /*
302         * simple typing errors are checked next (like, e.g.,
303         * missing/extra/transposed char)
304         */
305        else if ((cfg->case_only == 0)
306                 && ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT)) {
307            misspelled_file *sp_new;
308
309            sp_new = (misspelled_file *) apr_array_push(candidates);
310            sp_new->name = apr_pstrdup(r->pool, dirent.name);
311            sp_new->quality = q;
312        }
313
314        /*
315         * The spdist() should have found the majority of the misspelled
316         * requests.  It is of questionable use to continue looking for
317         * files with the same base name, but potentially of totally wrong
318         * type (index.html <-> index.db).
319         * I would propose to not set the WANT_BASENAME_MATCH define.
320         *      08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
321         *
322         * However, Alexei replied giving some reasons to add it anyway:
323         * > Oh, by the way, I remembered why having the
324         * > extension-stripping-and-matching stuff is a good idea:
325         * >
326         * > If you're using MultiViews, and have a file named foobar.html,
327         * > which you refer to as "foobar", and someone tried to access
328         * > "Foobar", mod_speling won't find it, because it won't find
329         * > anything matching that spelling. With the extension-munging,
330         * > it would locate "foobar.html". Not perfect, but I ran into
331         * > that problem when I first wrote the module.
332         */
333        else {
334#ifdef WANT_BASENAME_MATCH
335            /*
336             * Okay... we didn't find anything. Now we take out the hard-core
337             * power tools. There are several cases here. Someone might have
338             * entered a wrong extension (.htm instead of .html or vice
339             * versa) or the document could be negotiated. At any rate, now
340             * we just compare stuff before the first dot. If it matches, we
341             * figure we got us a match. This can result in wrong things if
342             * there are files of different content types but the same prefix
343             * (e.g. foo.gif and foo.html) This code will pick the first one
344             * it finds. Better than a Not Found, though.
345             */
346            int entloc = ap_ind(dirent.name, '.');
347            if (entloc == -1) {
348                entloc = strlen(dirent.name);
349            }
350
351            if ((dotloc == entloc)
352                && !strncasecmp(bad, dirent.name, dotloc)) {
353                misspelled_file *sp_new;
354
355                sp_new = (misspelled_file *) apr_array_push(candidates);
356                sp_new->name = apr_pstrdup(r->pool, dirent.name);
357                sp_new->quality = SP_VERYDIFFERENT;
358            }
359#endif
360        }
361    }
362    apr_dir_close(dir);
363
364    if (candidates->nelts != 0) {
365        /* Wow... we found us a mispelling. Construct a fixed url */
366        char *nuri;
367        const char *ref;
368        misspelled_file *variant = (misspelled_file *) candidates->elts;
369        int i;
370
371        ref = apr_table_get(r->headers_in, "Referer");
372
373        qsort((void *) candidates->elts, candidates->nelts,
374              sizeof(misspelled_file), sort_by_quality);
375
376        /*
377         * Conditions for immediate redirection:
378         *     a) the first candidate was not found by stripping the suffix
379         * AND b) there exists only one candidate OR the best match is not
380         *        ambiguous
381         * then return a redirection right away.
382         */
383        if (variant[0].quality != SP_VERYDIFFERENT
384            && (candidates->nelts == 1
385                || variant[0].quality != variant[1].quality)) {
386
387            nuri = ap_escape_uri(r->pool, apr_pstrcat(r->pool, url,
388                                                     variant[0].name,
389                                                     r->path_info, NULL));
390            if (r->parsed_uri.query)
391                nuri = apr_pstrcat(r->pool, nuri, "?", r->parsed_uri.query, NULL);
392
393            apr_table_setn(r->headers_out, "Location",
394                          ap_construct_url(r->pool, nuri, r));
395
396            ap_log_rerror(APLOG_MARK, APLOG_INFO, APR_SUCCESS,
397                          r,
398                          ref ? "Fixed spelling: %s to %s from %s"
399                              : "Fixed spelling: %s to %s",
400                          r->uri, nuri, ref);
401
402            return HTTP_MOVED_PERMANENTLY;
403        }
404        /*
405         * Otherwise, a "[300] Multiple Choices" list with the variants is
406         * returned.
407         */
408        else {
409            apr_pool_t *p;
410            apr_table_t *notes;
411            apr_pool_t *sub_pool;
412            apr_array_header_t *t;
413            apr_array_header_t *v;
414
415
416            if (r->main == NULL) {
417                p = r->pool;
418                notes = r->notes;
419            }
420            else {
421                p = r->main->pool;
422                notes = r->main->notes;
423            }
424
425            if (apr_pool_create(&sub_pool, p) != APR_SUCCESS)
426                return DECLINED;
427
428            t = apr_array_make(sub_pool, candidates->nelts * 8 + 8,
429                              sizeof(char *));
430            v = apr_array_make(sub_pool, candidates->nelts * 5,
431                              sizeof(char *));
432
433            /* Generate the response text. */
434
435            *(const char **)apr_array_push(t) =
436                          "The document name you requested (<code>";
437            *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri);
438            *(const char **)apr_array_push(t) =
439                           "</code>) could not be found on this server.\n"
440                           "However, we found documents with names similar "
441                           "to the one you requested.<p>"
442                           "Available documents:\n<ul>\n";
443
444            for (i = 0; i < candidates->nelts; ++i) {
445                char *vuri;
446                const char *reason;
447
448                reason = sp_reason_str[(int) (variant[i].quality)];
449                /* The format isn't very neat... */
450                vuri = apr_pstrcat(sub_pool, url, variant[i].name, r->path_info,
451                                  (r->parsed_uri.query != NULL) ? "?" : "",
452                                  (r->parsed_uri.query != NULL)
453                                      ? r->parsed_uri.query : "",
454                                  NULL);
455                *(const char **)apr_array_push(v) = "\"";
456                *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri);
457                *(const char **)apr_array_push(v) = "\";\"";
458                *(const char **)apr_array_push(v) = reason;
459                *(const char **)apr_array_push(v) = "\"";
460
461                *(const char **)apr_array_push(t) = "<li><a href=\"";
462                *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri);
463                *(const char **)apr_array_push(t) = "\">";
464                *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri);
465                *(const char **)apr_array_push(t) = "</a> (";
466                *(const char **)apr_array_push(t) = reason;
467                *(const char **)apr_array_push(t) = ")\n";
468
469                /*
470                 * when we have printed the "close matches" and there are
471                 * more "distant matches" (matched by stripping the suffix),
472                 * then we insert an additional separator text to suggest
473                 * that the user LOOK CLOSELY whether these are really the
474                 * files she wanted.
475                 */
476                if (i > 0 && i < candidates->nelts - 1
477                    && variant[i].quality != SP_VERYDIFFERENT
478                    && variant[i + 1].quality == SP_VERYDIFFERENT) {
479                    *(const char **)apr_array_push(t) =
480                                   "</ul>\nFurthermore, the following related "
481                                   "documents were found:\n<ul>\n";
482                }
483            }
484            *(const char **)apr_array_push(t) = "</ul>\n";
485
486            /* If we know there was a referring page, add a note: */
487            if (ref != NULL) {
488                *(const char **)apr_array_push(t) =
489                               "Please consider informing the owner of the "
490                               "<a href=\"";
491                *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, ref);
492                *(const char **)apr_array_push(t) = "\">referring page</a> "
493                               "about the broken link.\n";
494            }
495
496
497            /* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */
498            apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0));
499
500            apr_table_mergen(r->subprocess_env, "VARIANTS",
501                            apr_array_pstrcat(p, v, ','));
502
503            apr_pool_destroy(sub_pool);
504
505            ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
506                         ref ? "Spelling fix: %s: %d candidates from %s"
507                             : "Spelling fix: %s: %d candidates",
508                         r->uri, candidates->nelts, ref);
509
510            return HTTP_MULTIPLE_CHOICES;
511        }
512    }
513
514    return OK;
515}
516
517static void register_hooks(apr_pool_t *p)
518{
519    ap_hook_fixups(check_speling,NULL,NULL,APR_HOOK_LAST);
520}
521
522module AP_MODULE_DECLARE_DATA speling_module =
523{
524    STANDARD20_MODULE_STUFF,
525    create_mconfig_for_directory,  /* create per-dir config */
526    NULL,                          /* merge per-dir config */
527    create_mconfig_for_server,     /* server config */
528    NULL,                          /* merge server config */
529    speling_cmds,                  /* command apr_table_t */
530    register_hooks                 /* register hooks */
531};
532