1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * simple hokey charset recoding configuration module
19 *
20 * See mod_ebcdic and mod_charset for more thought-out examples.  This
21 * one is just so Jeff can learn how a module works and experiment with
22 * basic character set recoding configuration.
23 *
24 * !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
25 */
26
27#include "httpd.h"
28#include "http_config.h"
29#define CORE_PRIVATE
30#include "http_core.h"
31#include "http_log.h"
32#include "http_main.h"
33#include "http_protocol.h"
34#include "http_request.h"
35#include "util_charset.h"
36#include "apr_buckets.h"
37#include "util_filter.h"
38#include "apr_strings.h"
39#include "apr_lib.h"
40#include "apr_xlate.h"
41#define APR_WANT_STRFUNC
42#include "apr_want.h"
43
/* Sizes of the scratch buffers that receive translated text. */
#define OUTPUT_XLATE_BUF_SIZE (16*1024) /* size of translation buffer used on output */
#define INPUT_XLATE_BUF_SIZE  (8*1024)  /* size of translation buffer used on input */

#define XLATE_MIN_BUFF_LEFT 128  /* xlate_brigade() stops translating (so the
                                  * buffer can be flushed) once fewer than this
                                  * many bytes are left in the translation buffer
                                  */

#define FATTEST_CHAR  8          /* size of the hold area (charset_filter_ctx_t.buf)
                                  * for a character split between buckets; we
                                  * don't handle chars wider than this that
                                  * straddle two buckets
                                  */
54
/* extended error status codes; this is used in addition to an apr_status_t to
 * track errors in the translation filter; log_xlate_error() turns the code
 * into a log message
 */
typedef enum {
    EES_INIT = 0,   /* no error info yet; value must be 0 for easy init */
    EES_LIMIT,      /* built-in restriction encountered */
    EES_INCOMPLETE_CHAR, /* incomplete multi-byte char at end of content */
    EES_BUCKET_READ, /* apr_bucket_read() failed while fetching input data */
    EES_DOWNSTREAM, /* something bad happened in a filter below xlate */
    EES_BAD_INPUT   /* input data invalid */
} ees_t;
66
/* registered name of the output translation filter; also compared against
 * f->frec->name in chk_filter_chain() to tell which direction a filter
 * instance works in
 */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* registered name of input translation filter */
#define XLATEIN_FILTER_NAME  "XLATEIN"
71
/* Per-directory configuration.  Instances are made by
 * create_charset_dir_conf() and combined by merge_charset_dir_conf(); the
 * *_INIT enumerators and debug == -1 mean "not set in this container", which
 * is what lets the merge fall back to the enclosing container's value.
 */
typedef struct charset_dir_t {
    /** debug level; -1 means uninitialized, 0 means no debug */
    int debug;
    const char *charset_source; /* source encoding */
    const char *charset_default; /* how to ship on wire */
    /** module does ap_add_*_filter()? */
    enum {IA_INIT, IA_IMPADD, IA_NOIMPADD} implicit_add;
    /** treat all mimetypes as text? */
    enum {FX_INIT, FX_FORCE, FX_NOFORCE} force_xlate;
} charset_dir_t;
82
/* charset_filter_ctx_t is created for each filter instance; because the same
 * filter code is used for translating in both directions, we need this context
 * data to tell the filter which translation handle to use; it also can hold a
 * character which was split between buckets
 */
typedef struct charset_filter_ctx_t {
    apr_xlate_t *xlate;     /* translation handle from apr_xlate_open() */
    int is_sb;              /* single-byte translation? */
    charset_dir_t *dc;      /* directory config this instance was set up with */
    ees_t ees;              /* extended error status */
    apr_size_t saved;       /* number of bytes of a partial char held in buf */
    char buf[FATTEST_CHAR]; /* we want to be able to build a complete char here */
    int ran;                /* has filter instance run before? */
    int noop;               /* should we pass brigades through unchanged? */
    char *tmp;              /* buffer for input filtering
                             * (INPUT_XLATE_BUF_SIZE bytes, see find_code_page())
                             */
    apr_bucket_brigade *bb; /* input buckets we couldn't finish translating */
    apr_bucket_brigade *tmpbb; /* used for passing downstream; only created
                                * for the output context in find_code_page()
                                */
} charset_filter_ctx_t;
101
/* charset_req_t is available via r->request_config if any translation is
 * being performed; find_code_page() allocates it and stores it there
 */
typedef struct charset_req_t {
    charset_dir_t *dc;      /* directory config in effect for the request */
    /* preallocated filter contexts; input_ctx stays NULL unless the request
     * method is PUT or POST
     */
    charset_filter_ctx_t *output_ctx, *input_ctx;
} charset_req_t;
109
/* debug level definitions; a message is logged when the configured
 * charset_dir_t.debug value is >= the level attached to that message
 */
#define DBGLVL_GORY           9 /* gory details */
#define DBGLVL_FLOW           4 /* enough messages to see what happens on
                                 * each request */
#define DBGLVL_PMC            2 /* messages about possible misconfiguration */
115
/* Declared up front so the functions below can hand its address to
 * ap_get_module_config()/ap_set_module_config(); the initialized definition
 * is not in this part of the file (presumably further down).
 */
module AP_MODULE_DECLARE_DATA charset_lite_module;
117
118static void *create_charset_dir_conf(apr_pool_t *p,char *dummy)
119{
120    charset_dir_t *dc = (charset_dir_t *)apr_pcalloc(p,sizeof(charset_dir_t));
121
122    dc->debug = -1;
123    return dc;
124}
125
126static void *merge_charset_dir_conf(apr_pool_t *p, void *basev, void *overridesv)
127{
128    charset_dir_t *a = (charset_dir_t *)apr_pcalloc (p, sizeof(charset_dir_t));
129    charset_dir_t *base = (charset_dir_t *)basev,
130        *over = (charset_dir_t *)overridesv;
131
132    /* If it is defined in the current container, use it.  Otherwise, use the one
133     * from the enclosing container.
134     */
135
136    a->debug =
137        over->debug != -1 ? over->debug : base->debug;
138    a->charset_default =
139        over->charset_default ? over->charset_default : base->charset_default;
140    a->charset_source =
141        over->charset_source ? over->charset_source : base->charset_source;
142    a->implicit_add =
143        over->implicit_add != IA_INIT ? over->implicit_add : base->implicit_add;
144    a->force_xlate=
145        over->force_xlate != FX_INIT ? over->force_xlate : base->force_xlate;
146    return a;
147}
148
149/* CharsetSourceEnc charset
150 */
151static const char *add_charset_source(cmd_parms *cmd, void *in_dc,
152                                      const char *name)
153{
154    charset_dir_t *dc = in_dc;
155
156    dc->charset_source = name;
157    return NULL;
158}
159
160/* CharsetDefault charset
161 */
162static const char *add_charset_default(cmd_parms *cmd, void *in_dc,
163                                       const char *name)
164{
165    charset_dir_t *dc = in_dc;
166
167    dc->charset_default = name;
168    return NULL;
169}
170
171/* CharsetOptions optionflag...
172 */
173static const char *add_charset_options(cmd_parms *cmd, void *in_dc,
174                                       const char *flag)
175{
176    charset_dir_t *dc = in_dc;
177
178    if (!strcasecmp(flag, "ImplicitAdd")) {
179        dc->implicit_add = IA_IMPADD;
180    }
181    else if (!strcasecmp(flag, "NoImplicitAdd")) {
182        dc->implicit_add = IA_NOIMPADD;
183    }
184    else if (!strcasecmp(flag, "TranslateAllMimeTypes")) {
185        dc->force_xlate = FX_FORCE;
186    }
187    else if (!strcasecmp(flag, "NoTranslateAllMimeTypes")) {
188        dc->force_xlate = FX_NOFORCE;
189    }
190    else if (!strncasecmp(flag, "DebugLevel=", 11)) {
191        dc->debug = atoi(flag + 11);
192    }
193    else {
194        return apr_pstrcat(cmd->temp_pool,
195                           "Invalid CharsetOptions option: ",
196                           flag,
197                           NULL);
198    }
199
200    return NULL;
201}
202
203/* find_code_page() is a fixup hook that checks if the module is
204 * configured and the input or output potentially need to be translated.
205 * If so, context is initialized for the filters.
206 */
207static int find_code_page(request_rec *r)
208{
209    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
210                                             &charset_lite_module);
211    charset_req_t *reqinfo;
212    charset_filter_ctx_t *input_ctx, *output_ctx;
213    apr_status_t rv;
214
215    if (dc->debug >= DBGLVL_FLOW) {
216        ap_log_rerror(APLOG_MARK,APLOG_DEBUG, 0, r,
217                      "uri: %s file: %s method: %d "
218                      "imt: %s flags: %s%s%s %s->%s",
219                      r->uri,
220                      r->filename ? r->filename : "(none)",
221                      r->method_number,
222                      r->content_type ? r->content_type : "(unknown)",
223                      r->main     ? "S" : "",    /* S if subrequest */
224                      r->prev     ? "R" : "",    /* R if redirect */
225                      r->proxyreq ? "P" : "",    /* P if proxy */
226                      dc->charset_source, dc->charset_default);
227    }
228
229    /* If we don't have a full directory configuration, bail out.
230     */
231    if (!dc->charset_source || !dc->charset_default) {
232        if (dc->debug >= DBGLVL_PMC) {
233            ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
234                          "incomplete configuration: src %s, dst %s",
235                          dc->charset_source ? dc->charset_source : "unspecified",
236                          dc->charset_default ? dc->charset_default : "unspecified");
237        }
238        return DECLINED;
239    }
240
241    /* catch proxy requests */
242    if (r->proxyreq) {
243        return DECLINED;
244    }
245
246    /* mod_rewrite indicators */
247    if (r->filename
248        && (!strncmp(r->filename, "redirect:", 9)
249            || !strncmp(r->filename, "gone:", 5)
250            || !strncmp(r->filename, "passthrough:", 12)
251            || !strncmp(r->filename, "forbidden:", 10))) {
252        return DECLINED;
253    }
254
255    /* no translation when server and network charsets are set to the same value */
256    if (!strcasecmp(dc->charset_source, dc->charset_default)) {
257        return DECLINED;
258    }
259
260    /* Get storage for the request data and the output filter context.
261     * We rarely need the input filter context, so allocate that separately.
262     */
263    reqinfo = (charset_req_t *)apr_pcalloc(r->pool,
264                                           sizeof(charset_req_t) +
265                                           sizeof(charset_filter_ctx_t));
266    output_ctx = (charset_filter_ctx_t *)(reqinfo + 1);
267
268    reqinfo->dc = dc;
269    output_ctx->dc = dc;
270    output_ctx->tmpbb = apr_brigade_create(r->pool,
271                                           r->connection->bucket_alloc);
272    ap_set_module_config(r->request_config, &charset_lite_module, reqinfo);
273
274    reqinfo->output_ctx = output_ctx;
275
276    switch (r->method_number) {
277    case M_PUT:
278    case M_POST:
279        /* Set up input translation.  Note: A request body can be included
280         * with the OPTIONS method, but for now we don't set up translation
281         * of it.
282         */
283        input_ctx = apr_pcalloc(r->pool, sizeof(charset_filter_ctx_t));
284        input_ctx->bb = apr_brigade_create(r->pool,
285                                           r->connection->bucket_alloc);
286        input_ctx->tmp = apr_palloc(r->pool, INPUT_XLATE_BUF_SIZE);
287        input_ctx->dc = dc;
288        reqinfo->input_ctx = input_ctx;
289        rv = apr_xlate_open(&input_ctx->xlate, dc->charset_source,
290                            dc->charset_default, r->pool);
291        if (rv != APR_SUCCESS) {
292            ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
293                          "can't open translation %s->%s",
294                          dc->charset_default, dc->charset_source);
295            return HTTP_INTERNAL_SERVER_ERROR;
296        }
297        if (apr_xlate_sb_get(input_ctx->xlate, &input_ctx->is_sb) != APR_SUCCESS) {
298            input_ctx->is_sb = 0;
299        }
300    }
301
302    return DECLINED;
303}
304
305static int configured_in_list(request_rec *r, const char *filter_name,
306                              struct ap_filter_t *filter_list)
307{
308    struct ap_filter_t *filter = filter_list;
309
310    while (filter) {
311        if (!strcasecmp(filter_name, filter->frec->name)) {
312            return 1;
313        }
314        filter = filter->next;
315    }
316    return 0;
317}
318
319static int configured_on_input(request_rec *r, const char *filter_name)
320{
321    return configured_in_list(r, filter_name, r->input_filters);
322}
323
324static int configured_on_output(request_rec *r, const char *filter_name)
325{
326    return configured_in_list(r, filter_name, r->output_filters);
327}
328
329/* xlate_insert_filter() is a filter hook which decides whether or not
330 * to insert a translation filter for the current request.
331 */
332static void xlate_insert_filter(request_rec *r)
333{
334    /* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
335    charset_req_t *reqinfo = ap_get_module_config(r->request_config,
336                                                  &charset_lite_module);
337    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
338                                             &charset_lite_module);
339
340    if (dc && (dc->implicit_add == IA_NOIMPADD)) {
341        if (dc->debug >= DBGLVL_GORY) {
342            ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
343                          "xlate output filter not added implicitly because "
344                          "CharsetOptions included 'NoImplicitAdd'");
345        }
346        return;
347    }
348
349    if (reqinfo) {
350        if (reqinfo->output_ctx && !configured_on_output(r, XLATEOUT_FILTER_NAME)) {
351            ap_add_output_filter(XLATEOUT_FILTER_NAME, reqinfo->output_ctx, r,
352                                 r->connection);
353        }
354        else if (dc->debug >= DBGLVL_FLOW) {
355            ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
356                          "xlate output filter not added implicitly because %s",
357                          !reqinfo->output_ctx ?
358                          "no output configuration available" :
359                          "another module added the filter");
360        }
361
362        if (reqinfo->input_ctx && !configured_on_input(r, XLATEIN_FILTER_NAME)) {
363            ap_add_input_filter(XLATEIN_FILTER_NAME, reqinfo->input_ctx, r,
364                                r->connection);
365        }
366        else if (dc->debug >= DBGLVL_FLOW) {
367            ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
368                          "xlate input filter not added implicitly because %s",
369                          !reqinfo->input_ctx ?
370                          "no input configuration available" :
371                          "another module added the filter");
372        }
373    }
374}
375
/* Known shortcomings:
 *
 * bucket handling:
 *   A new EOS bucket is created when one is seen coming down the stream, even
 *   though the EOS bucket passed as input could simply be forwarded.  This
 *   will be fixed when xlate_out_filter() starts using the more generic
 *   xlate_brigade().
 *
 * translation mechanics:
 *   Characters that straddle more than two buckets are not handled; an error
 *   is generated instead.
 */
387
388static apr_status_t send_bucket_downstream(ap_filter_t *f, apr_bucket *b)
389{
390    charset_filter_ctx_t *ctx = f->ctx;
391    apr_status_t rv;
392
393    APR_BRIGADE_INSERT_TAIL(ctx->tmpbb, b);
394    rv = ap_pass_brigade(f->next, ctx->tmpbb);
395    if (rv != APR_SUCCESS) {
396        ctx->ees = EES_DOWNSTREAM;
397    }
398    apr_brigade_cleanup(ctx->tmpbb);
399    return rv;
400}
401
402/* send_downstream() is passed the translated data; it puts it in a single-
403 * bucket brigade and passes the brigade to the next filter
404 */
405static apr_status_t send_downstream(ap_filter_t *f, const char *tmp, apr_size_t len)
406{
407    request_rec *r = f->r;
408    conn_rec *c = r->connection;
409    apr_bucket *b;
410
411    b = apr_bucket_transient_create(tmp, len, c->bucket_alloc);
412    return send_bucket_downstream(f, b);
413}
414
415static apr_status_t send_eos(ap_filter_t *f)
416{
417    request_rec *r = f->r;
418    conn_rec *c = r->connection;
419    apr_bucket_brigade *bb;
420    apr_bucket *b;
421    charset_filter_ctx_t *ctx = f->ctx;
422    apr_status_t rv;
423
424    bb = apr_brigade_create(r->pool, c->bucket_alloc);
425    b = apr_bucket_eos_create(c->bucket_alloc);
426    APR_BRIGADE_INSERT_TAIL(bb, b);
427    rv = ap_pass_brigade(f->next, bb);
428    if (rv != APR_SUCCESS) {
429        ctx->ees = EES_DOWNSTREAM;
430    }
431    return rv;
432}
433
434static apr_status_t set_aside_partial_char(charset_filter_ctx_t *ctx,
435                                           const char *partial,
436                                           apr_size_t partial_len)
437{
438    apr_status_t rv;
439
440    if (sizeof(ctx->buf) > partial_len) {
441        ctx->saved = partial_len;
442        memcpy(ctx->buf, partial, partial_len);
443        rv = APR_SUCCESS;
444    }
445    else {
446        rv = APR_INCOMPLETE;
447        ctx->ees = EES_LIMIT; /* we don't handle chars this wide which straddle
448                               * buckets
449                               */
450    }
451    return rv;
452}
453
454static apr_status_t finish_partial_char(charset_filter_ctx_t *ctx,
455                                        /* input buffer: */
456                                        const char **cur_str,
457                                        apr_size_t *cur_len,
458                                        /* output buffer: */
459                                        char **out_str,
460                                        apr_size_t *out_len)
461{
462    apr_status_t rv;
463    apr_size_t tmp_input_len;
464
465    /* Keep adding bytes from the input string to the saved string until we
466     *    1) finish the input char
467     *    2) get an error
468     * or 3) run out of bytes to add
469     */
470
471    do {
472        ctx->buf[ctx->saved] = **cur_str;
473        ++ctx->saved;
474        ++*cur_str;
475        --*cur_len;
476        tmp_input_len = ctx->saved;
477        rv = apr_xlate_conv_buffer(ctx->xlate,
478                                   ctx->buf,
479                                   &tmp_input_len,
480                                   *out_str,
481                                   out_len);
482    } while (rv == APR_INCOMPLETE && *cur_len);
483
484    if (rv == APR_SUCCESS) {
485        ctx->saved = 0;
486    }
487    else {
488        ctx->ees = EES_LIMIT; /* code isn't smart enough to handle chars
489                               * straddling more than two buckets
490                               */
491    }
492
493    return rv;
494}
495
496static void log_xlate_error(ap_filter_t *f, apr_status_t rv)
497{
498    charset_filter_ctx_t *ctx = f->ctx;
499    const char *msg;
500    char msgbuf[100];
501    int cur;
502
503    switch(ctx->ees) {
504    case EES_LIMIT:
505        rv = 0;
506        msg = "xlate filter - a built-in restriction was encountered";
507        break;
508    case EES_BAD_INPUT:
509        rv = 0;
510        msg = "xlate filter - an input character was invalid";
511        break;
512    case EES_BUCKET_READ:
513        rv = 0;
514        msg = "xlate filter - bucket read routine failed";
515        break;
516    case EES_INCOMPLETE_CHAR:
517        rv = 0;
518        strcpy(msgbuf, "xlate filter - incomplete char at end of input - ");
519        cur = 0;
520        while ((apr_size_t)cur < ctx->saved) {
521            apr_snprintf(msgbuf + strlen(msgbuf), sizeof(msgbuf) - strlen(msgbuf),
522                         "%02X", (unsigned)ctx->buf[cur]);
523            ++cur;
524        }
525        msg = msgbuf;
526        break;
527    case EES_DOWNSTREAM:
528        msg = "xlate filter - an error occurred in a lower filter";
529        break;
530    default:
531        msg = "xlate filter - returning error";
532    }
533    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
534                  "%s", msg);
535}
536
537/* chk_filter_chain() is called once per filter instance; it tries to
538 * determine if the current filter instance should be disabled because
539 * its translation is incompatible with the translation of an existing
540 * instance of the translate filter
541 *
542 * Example bad scenario:
543 *
544 *   configured filter chain for the request:
545 *     INCLUDES XLATEOUT(8859-1->UTS-16)
546 *   configured filter chain for the subrequest:
547 *     XLATEOUT(8859-1->UTS-16)
548 *
549 *   When the subrequest is processed, the filter chain will be
550 *     XLATEOUT(8859-1->UTS-16) XLATEOUT(8859-1->UTS-16)
551 *   This makes no sense, so the instance of XLATEOUT added for the
552 *   subrequest will be noop-ed.
553 *
554 * Example good scenario:
555 *
556 *   configured filter chain for the request:
557 *     INCLUDES XLATEOUT(8859-1->UTS-16)
558 *   configured filter chain for the subrequest:
559 *     XLATEOUT(IBM-1047->8859-1)
560 *
561 *   When the subrequest is processed, the filter chain will be
562 *     XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTS-16)
563 *   This makes sense, so the instance of XLATEOUT added for the
564 *   subrequest will be left alone and it will translate from
565 *   IBM-1047->8859-1.
566 */
567static void chk_filter_chain(ap_filter_t *f)
568{
569    ap_filter_t *curf;
570    charset_filter_ctx_t *curctx, *last_xlate_ctx = NULL,
571        *ctx = f->ctx;
572    int debug = ctx->dc->debug;
573    int output = !strcasecmp(f->frec->name, XLATEOUT_FILTER_NAME);
574
575    if (ctx->noop) {
576        return;
577    }
578
579    /* walk the filter chain; see if it makes sense for our filter to
580     * do any translation
581     */
582    curf = output ? f->r->output_filters : f->r->input_filters;
583    while (curf) {
584        if (!strcasecmp(curf->frec->name, f->frec->name) &&
585            curf->ctx) {
586            curctx = (charset_filter_ctx_t *)curf->ctx;
587            if (!last_xlate_ctx) {
588                last_xlate_ctx = curctx;
589            }
590            else {
591                if (strcmp(last_xlate_ctx->dc->charset_default,
592                           curctx->dc->charset_source)) {
593                    /* incompatible translation
594                     * if our filter instance is incompatible with an instance
595                     * already in place, noop our instance
596                     * Notes:
597                     * . We are only willing to noop our own instance.
598                     * . It is possible to noop another instance which has not
599                     *   yet run, but this is not currently implemented.
600                     *   Hopefully it will not be needed.
601                     * . It is not possible to noop an instance which has
602                     *   already run.
603                     */
604                    if (last_xlate_ctx == f->ctx) {
605                        last_xlate_ctx->noop = 1;
606                        if (debug >= DBGLVL_PMC) {
607                            const char *symbol = output ? "->" : "<-";
608
609                            ap_log_rerror(APLOG_MARK, APLOG_DEBUG,
610                                          0, f->r,
611                                          "%s %s - disabling "
612                                          "translation %s%s%s; existing "
613                                          "translation %s%s%s",
614                                          f->r->uri ? "uri" : "file",
615                                          f->r->uri ? f->r->uri : f->r->filename,
616                                          last_xlate_ctx->dc->charset_source,
617                                          symbol,
618                                          last_xlate_ctx->dc->charset_default,
619                                          curctx->dc->charset_source,
620                                          symbol,
621                                          curctx->dc->charset_default);
622                        }
623                    }
624                    else {
625                        const char *symbol = output ? "->" : "<-";
626
627                        ap_log_rerror(APLOG_MARK, APLOG_ERR,
628                                      0, f->r,
629                                      "chk_filter_chain() - can't disable "
630                                      "translation %s%s%s; existing "
631                                      "translation %s%s%s",
632                                      last_xlate_ctx->dc->charset_source,
633                                      symbol,
634                                      last_xlate_ctx->dc->charset_default,
635                                      curctx->dc->charset_source,
636                                      symbol,
637                                      curctx->dc->charset_default);
638                    }
639                    break;
640                }
641            }
642        }
643        curf = curf->next;
644    }
645}
646
647/* xlate_brigade() is used to filter request and response bodies
648 *
649 * we'll stop when one of the following occurs:
650 * . we run out of buckets
651 * . we run out of space in the output buffer
652 * . we hit an error or metadata
653 *
654 * inputs:
655 *   bb:               brigade to process
656 *   buffer:           storage to hold the translated characters
657 *   buffer_avail:     size of buffer
658 *   (and a few more uninteresting parms)
659 *
660 * outputs:
661 *   return value:     APR_SUCCESS or some error code
662 *   bb:               we've removed any buckets representing the
663 *                     translated characters; the eos bucket, if
664 *                     present, will be left in the brigade
665 *   buffer:           filled in with translated characters
666 *   buffer_avail:     updated with the bytes remaining
667 *   hit_eos:          did we hit an EOS bucket?
668 */
669static apr_status_t xlate_brigade(charset_filter_ctx_t *ctx,
670                                  apr_bucket_brigade *bb,
671                                  char *buffer,
672                                  apr_size_t *buffer_avail,
673                                  int *hit_eos)
674{
675    apr_bucket *b = NULL; /* set to NULL only to quiet some gcc */
676    apr_bucket *consumed_bucket;
677    const char *bucket;
678    apr_size_t bytes_in_bucket; /* total bytes read from current bucket */
679    apr_size_t bucket_avail;    /* bytes left in current bucket */
680    apr_status_t rv = APR_SUCCESS;
681
682    *hit_eos = 0;
683    bucket_avail = 0;
684    consumed_bucket = NULL;
685    while (1) {
686        if (!bucket_avail) { /* no bytes left to process in the current bucket... */
687            if (consumed_bucket) {
688                apr_bucket_delete(consumed_bucket);
689                consumed_bucket = NULL;
690            }
691            b = APR_BRIGADE_FIRST(bb);
692            if (b == APR_BRIGADE_SENTINEL(bb) ||
693                APR_BUCKET_IS_METADATA(b)) {
694                break;
695            }
696            rv = apr_bucket_read(b, &bucket, &bytes_in_bucket, APR_BLOCK_READ);
697            if (rv != APR_SUCCESS) {
698                ctx->ees = EES_BUCKET_READ;
699                break;
700            }
701            bucket_avail = bytes_in_bucket;
702            consumed_bucket = b;   /* for axing when we're done reading it */
703        }
704        if (bucket_avail) {
705            /* We've got data, so translate it. */
706            if (ctx->saved) {
707                /* Rats... we need to finish a partial character from the previous
708                 * bucket.
709                 *
710                 * Strangely, finish_partial_char() increments the input buffer
711                 * pointer but does not increment the output buffer pointer.
712                 */
713                apr_size_t old_buffer_avail = *buffer_avail;
714                rv = finish_partial_char(ctx,
715                                         &bucket, &bucket_avail,
716                                         &buffer, buffer_avail);
717                buffer += old_buffer_avail - *buffer_avail;
718            }
719            else {
720                apr_size_t old_buffer_avail = *buffer_avail;
721                apr_size_t old_bucket_avail = bucket_avail;
722                rv = apr_xlate_conv_buffer(ctx->xlate,
723                                           bucket, &bucket_avail,
724                                           buffer,
725                                           buffer_avail);
726                buffer  += old_buffer_avail - *buffer_avail;
727                bucket  += old_bucket_avail - bucket_avail;
728
729                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
730                    /* We need to save the final byte(s) for next time; we can't
731                     * convert it until we look at the next bucket.
732                     */
733                    rv = set_aside_partial_char(ctx, bucket, bucket_avail);
734                    bucket_avail = 0;
735                }
736            }
737            if (rv != APR_SUCCESS) {
738                /* bad input byte or partial char too big to store */
739                break;
740            }
741            if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
742                /* if any data remains in the current bucket, split there */
743                if (bucket_avail) {
744                    apr_bucket_split(b, bytes_in_bucket - bucket_avail);
745                }
746                apr_bucket_delete(b);
747                break;
748            }
749        }
750    }
751
752    if (!APR_BRIGADE_EMPTY(bb)) {
753        b = APR_BRIGADE_FIRST(bb);
754        if (APR_BUCKET_IS_EOS(b)) {
755            /* Leave the eos bucket in the brigade for reporting to
756             * subsequent filters.
757             */
758            *hit_eos = 1;
759            if (ctx->saved) {
760                /* Oops... we have a partial char from the previous bucket
761                 * that won't be completed because there's no more data.
762                 */
763                rv = APR_INCOMPLETE;
764                ctx->ees = EES_INCOMPLETE_CHAR;
765            }
766        }
767    }
768
769    return rv;
770}
771
772/* xlate_out_filter() handles (almost) arbitrary conversions from one charset
773 * to another...
774 * translation is determined in the fixup hook (find_code_page), which is
775 * where the filter's context data is set up... the context data gives us
776 * the translation handle
777 */
778static apr_status_t xlate_out_filter(ap_filter_t *f, apr_bucket_brigade *bb)
779{
780    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
781                                                  &charset_lite_module);
782    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
783                                             &charset_lite_module);
784    charset_filter_ctx_t *ctx = f->ctx;
785    apr_bucket *dptr, *consumed_bucket;
786    const char *cur_str;
787    apr_size_t cur_len, cur_avail;
788    char tmp[OUTPUT_XLATE_BUF_SIZE];
789    apr_size_t space_avail;
790    int done;
791    apr_status_t rv = APR_SUCCESS;
792
793    if (!ctx) {
794        /* this is SetOutputFilter path; grab the preallocated context,
795         * if any; note that if we decided not to do anything in an earlier
796         * handler, we won't even have a reqinfo
797         */
798        if (reqinfo) {
799            ctx = f->ctx = reqinfo->output_ctx;
800            reqinfo->output_ctx = NULL; /* prevent SNAFU if user coded us twice
801                                         * in the filter chain; we can't have two
802                                         * instances using the same context
803                                         */
804        }
805        if (!ctx) {                   /* no idea how to translate; don't do anything */
806            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
807            ctx->dc = dc;
808            ctx->noop = 1;
809        }
810    }
811
812    /* Check the mime type to see if translation should be performed.
813     */
814    if (!ctx->noop && ctx->xlate == NULL) {
815        const char *mime_type = f->r->content_type ? f->r->content_type : ap_default_type(f->r);
816
817        if (strncasecmp(mime_type, "text/", 5) == 0 ||
818#if APR_CHARSET_EBCDIC
819        /* On an EBCDIC machine, be willing to translate mod_autoindex-
820         * generated output.  Otherwise, it doesn't look too cool.
821         *
822         * XXX This isn't a perfect fix because this doesn't trigger us
823         * to convert from the charset of the source code to ASCII.  The
824         * general solution seems to be to allow a generator to set an
825         * indicator in the r specifying that the body is coded in the
826         * implementation character set (i.e., the charset of the source
827         * code).  This would get several different types of documents
828         * translated properly: mod_autoindex output, mod_status output,
829         * mod_info output, hard-coded error documents, etc.
830         */
831            strcmp(mime_type, DIR_MAGIC_TYPE) == 0 ||
832#endif
833            strncasecmp(mime_type, "message/", 8) == 0 ||
834            dc->force_xlate == FX_FORCE) {
835
836            rv = apr_xlate_open(&ctx->xlate,
837                                dc->charset_default, dc->charset_source, f->r->pool);
838            if (rv != APR_SUCCESS) {
839                ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
840                              "can't open translation %s->%s",
841                              dc->charset_source, dc->charset_default);
842                ctx->noop = 1;
843            }
844            else {
845                if (apr_xlate_sb_get(ctx->xlate, &ctx->is_sb) != APR_SUCCESS) {
846                    ctx->is_sb = 0;
847                }
848            }
849        }
850        else {
851            ctx->noop = 1;
852            if (dc->debug >= DBGLVL_GORY) {
853                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
854                              "mime type is %s; no translation selected",
855                              mime_type);
856            }
857        }
858    }
859
860    if (dc->debug >= DBGLVL_GORY) {
861        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
862                      "xlate_out_filter() - "
863                      "charset_source: %s charset_default: %s",
864                      dc && dc->charset_source ? dc->charset_source : "(none)",
865                      dc && dc->charset_default ? dc->charset_default : "(none)");
866    }
867
868    if (!ctx->ran) {  /* filter never ran before */
869        chk_filter_chain(f);
870        ctx->ran = 1;
871        if (!ctx->noop && !ctx->is_sb) {
872            /* We're not converting between two single-byte charsets, so unset
873             * Content-Length since it is unlikely to remain the same.
874             */
875            apr_table_unset(f->r->headers_out, "Content-Length");
876        }
877    }
878
879    if (ctx->noop) {
880        return ap_pass_brigade(f->next, bb);
881    }
882
883    dptr = APR_BRIGADE_FIRST(bb);
884    done = 0;
885    cur_len = 0;
886    space_avail = sizeof(tmp);
887    consumed_bucket = NULL;
888    while (!done) {
889        if (!cur_len) { /* no bytes left to process in the current bucket... */
890            if (consumed_bucket) {
891                apr_bucket_delete(consumed_bucket);
892                consumed_bucket = NULL;
893            }
894            if (dptr == APR_BRIGADE_SENTINEL(bb)) {
895                done = 1;
896                break;
897            }
898            if (APR_BUCKET_IS_EOS(dptr)) {
899                done = 1;
900                cur_len = -1; /* XXX yuck, but that tells us to send
901                                 * eos down; when we minimize our bb construction
902                                 * we'll fix this crap */
903                if (ctx->saved) {
904                    /* Oops... we have a partial char from the previous bucket
905                     * that won't be completed because there's no more data.
906                     */
907                    rv = APR_INCOMPLETE;
908                    ctx->ees = EES_INCOMPLETE_CHAR;
909                }
910                break;
911            }
912            if (APR_BUCKET_IS_METADATA(dptr)) {
913                apr_bucket *metadata_bucket;
914                metadata_bucket = dptr;
915                dptr = APR_BUCKET_NEXT(dptr);
916                APR_BUCKET_REMOVE(metadata_bucket);
917                rv = send_bucket_downstream(f, metadata_bucket);
918                if (rv != APR_SUCCESS) {
919                    done = 1;
920                }
921                continue;
922            }
923            rv = apr_bucket_read(dptr, &cur_str, &cur_len, APR_BLOCK_READ);
924            if (rv != APR_SUCCESS) {
925                done = 1;
926                ctx->ees = EES_BUCKET_READ;
927                break;
928            }
929            consumed_bucket = dptr; /* for axing when we're done reading it */
930            dptr = APR_BUCKET_NEXT(dptr); /* get ready for when we access the
931                                          * next bucket */
932        }
933        /* Try to fill up our tmp buffer with translated data. */
934        cur_avail = cur_len;
935
936        if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
937            if (ctx->saved) {
938                /* Rats... we need to finish a partial character from the previous
939                 * bucket.
940                 */
941                char *tmp_tmp;
942
943                tmp_tmp = tmp + sizeof(tmp) - space_avail;
944                rv = finish_partial_char(ctx,
945                                         &cur_str, &cur_len,
946                                         &tmp_tmp, &space_avail);
947            }
948            else {
949                rv = apr_xlate_conv_buffer(ctx->xlate,
950                                           cur_str, &cur_avail,
951                                           tmp + sizeof(tmp) - space_avail, &space_avail);
952
953                /* Update input ptr and len after consuming some bytes */
954                cur_str += cur_len - cur_avail;
955                cur_len = cur_avail;
956
957                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
958                    /* We need to save the final byte(s) for next time; we can't
959                     * convert it until we look at the next bucket.
960                     */
961                    rv = set_aside_partial_char(ctx, cur_str, cur_len);
962                    cur_len = 0;
963                }
964            }
965        }
966
967        if (rv != APR_SUCCESS) {
968            /* bad input byte or partial char too big to store */
969            done = 1;
970        }
971
972        if (space_avail < XLATE_MIN_BUFF_LEFT) {
973            /* It is time to flush, as there is not enough space left in the
974             * current output buffer to bother with converting more data.
975             */
976            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
977            if (rv != APR_SUCCESS) {
978                done = 1;
979            }
980
981            /* tmp is now empty */
982            space_avail = sizeof(tmp);
983        }
984    }
985
986    if (rv == APR_SUCCESS) {
987        if (space_avail < sizeof(tmp)) { /* gotta write out what we converted */
988            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
989        }
990    }
991    if (rv == APR_SUCCESS) {
992        if (cur_len == -1) {
993            rv = send_eos(f);
994        }
995    }
996    else {
997        log_xlate_error(f, rv);
998    }
999
1000    return rv;
1001}
1002
1003static int xlate_in_filter(ap_filter_t *f, apr_bucket_brigade *bb,
1004                           ap_input_mode_t mode, apr_read_type_e block,
1005                           apr_off_t readbytes)
1006{
1007    apr_status_t rv;
1008    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
1009                                                  &charset_lite_module);
1010    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
1011                                             &charset_lite_module);
1012    charset_filter_ctx_t *ctx = f->ctx;
1013    apr_size_t buffer_size;
1014    int hit_eos;
1015
1016    if (!ctx) {
1017        /* this is SetInputFilter path; grab the preallocated context,
1018         * if any; note that if we decided not to do anything in an earlier
1019         * handler, we won't even have a reqinfo
1020         */
1021        if (reqinfo) {
1022            ctx = f->ctx = reqinfo->input_ctx;
1023            reqinfo->input_ctx = NULL; /* prevent SNAFU if user coded us twice
1024                                        * in the filter chain; we can't have two
1025                                        * instances using the same context
1026                                        */
1027        }
1028        if (!ctx) {                   /* no idea how to translate; don't do anything */
1029            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
1030            ctx->dc = dc;
1031            ctx->noop = 1;
1032        }
1033    }
1034
1035    if (dc->debug >= DBGLVL_GORY) {
1036        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1037                     "xlate_in_filter() - "
1038                     "charset_source: %s charset_default: %s",
1039                     dc && dc->charset_source ? dc->charset_source : "(none)",
1040                     dc && dc->charset_default ? dc->charset_default : "(none)");
1041    }
1042
1043    if (!ctx->ran) {  /* filter never ran before */
1044        chk_filter_chain(f);
1045        ctx->ran = 1;
1046        if (!ctx->noop && !ctx->is_sb
1047            && apr_table_get(f->r->headers_in, "Content-Length")) {
1048            /* A Content-Length header is present, but it won't be valid after
1049             * conversion because we're not converting between two single-byte
1050             * charsets.  This will affect most CGI scripts and may affect
1051             * some modules.
1052             * Content-Length can't be unset here because that would break
1053             * being able to read the request body.
1054             * Processing of chunked request bodies is not impacted by this
1055             * filter since the the length was not declared anyway.
1056             */
1057            if (dc->debug >= DBGLVL_PMC) {
1058                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1059                              "Request body length may change, resulting in "
1060                              "misprocessing by some modules or scripts");
1061            }
1062        }
1063    }
1064
1065    if (ctx->noop) {
1066        return ap_get_brigade(f->next, bb, mode, block, readbytes);
1067    }
1068
1069    if (APR_BRIGADE_EMPTY(ctx->bb)) {
1070        if ((rv = ap_get_brigade(f->next, bb, mode, block,
1071                                 readbytes)) != APR_SUCCESS) {
1072            return rv;
1073        }
1074    }
1075    else {
1076        APR_BRIGADE_PREPEND(bb, ctx->bb); /* first use the leftovers */
1077    }
1078
1079    buffer_size = INPUT_XLATE_BUF_SIZE;
1080    rv = xlate_brigade(ctx, bb, ctx->tmp, &buffer_size, &hit_eos);
1081    if (rv == APR_SUCCESS) {
1082        if (!hit_eos) {
1083            /* move anything leftover into our context for next time;
1084             * we don't currently "set aside" since the data came from
1085             * down below, but I suspect that for long-term we need to
1086             * do that
1087             */
1088            APR_BRIGADE_CONCAT(ctx->bb, bb);
1089        }
1090        if (buffer_size < INPUT_XLATE_BUF_SIZE) { /* do we have output? */
1091            apr_bucket *e;
1092
1093            e = apr_bucket_heap_create(ctx->tmp,
1094                                       INPUT_XLATE_BUF_SIZE - buffer_size,
1095                                       NULL, f->r->connection->bucket_alloc);
1096            /* make sure we insert at the head, because there may be
1097             * an eos bucket already there, and the eos bucket should
1098             * come after the data
1099             */
1100            APR_BRIGADE_INSERT_HEAD(bb, e);
1101        }
1102        else {
1103            /* XXX need to get some more data... what if the last brigade
1104             * we got had only the first byte of a multibyte char?  we need
1105             * to grab more data from the network instead of returning an
1106             * empty brigade
1107             */
1108        }
1109        /* If we have any metadata at the head of ctx->bb, go ahead and move it
1110         * onto the end of bb to be returned to our caller.
1111         */
1112        if (!APR_BRIGADE_EMPTY(ctx->bb)) {
1113            apr_bucket *b = APR_BRIGADE_FIRST(ctx->bb);
1114            while (b != APR_BRIGADE_SENTINEL(ctx->bb)
1115                   && APR_BUCKET_IS_METADATA(b)) {
1116                APR_BUCKET_REMOVE(b);
1117                APR_BRIGADE_INSERT_TAIL(bb, b);
1118                b = APR_BRIGADE_FIRST(ctx->bb);
1119            }
1120        }
1121    }
1122    else {
1123        log_xlate_error(f, rv);
1124    }
1125
1126    return rv;
1127}
1128
1129static const command_rec cmds[] =
1130{
1131    AP_INIT_TAKE1("CharsetSourceEnc",
1132                  add_charset_source,
1133                  NULL,
1134                  OR_FILEINFO,
1135                  "source (html,cgi,ssi) file charset"),
1136    AP_INIT_TAKE1("CharsetDefault",
1137                  add_charset_default,
1138                  NULL,
1139                  OR_FILEINFO,
1140                  "name of default charset"),
1141    AP_INIT_ITERATE("CharsetOptions",
1142                    add_charset_options,
1143                    NULL,
1144                    OR_FILEINFO,
1145                    "valid options: ImplicitAdd, NoImplicitAdd, TranslateAllMimeTypes, "
1146                    "NoTranslateAllMimeTypes, DebugLevel=n"),
1147    {NULL}
1148};
1149
1150static void charset_register_hooks(apr_pool_t *p)
1151{
1152    ap_hook_fixups(find_code_page, NULL, NULL, APR_HOOK_MIDDLE);
1153    ap_hook_insert_filter(xlate_insert_filter, NULL, NULL, APR_HOOK_REALLY_LAST);
1154    ap_register_output_filter(XLATEOUT_FILTER_NAME, xlate_out_filter, NULL,
1155                              AP_FTYPE_RESOURCE);
1156    ap_register_input_filter(XLATEIN_FILTER_NAME, xlate_in_filter, NULL,
1157                             AP_FTYPE_RESOURCE);
1158}
1159
1160module AP_MODULE_DECLARE_DATA charset_lite_module =
1161{
1162    STANDARD20_MODULE_STUFF,
1163    create_charset_dir_conf,
1164    merge_charset_dir_conf,
1165    NULL,
1166    NULL,
1167    cmds,
1168    charset_register_hooks
1169};
1170
1171