1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * simple hokey charset recoding configuration module
19 *
20 * See mod_ebcdic and mod_charset for more thought-out examples.  This
21 * one is just so Jeff can learn how a module works and experiment with
22 * basic character set recoding configuration.
23 *
24 * !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
25 */
26
27#include "httpd.h"
28#include "http_config.h"
29
30#include "http_core.h"
31#include "http_log.h"
32#include "http_main.h"
33#include "http_protocol.h"
34#include "http_request.h"
35#include "util_charset.h"
36#include "apr_buckets.h"
37#include "util_filter.h"
38#include "apr_strings.h"
39#include "apr_lib.h"
40#include "apr_xlate.h"
41#define APR_WANT_STRFUNC
42#include "apr_want.h"
43
/* Sizes, in bytes, of the scratch buffers that receive translated data:
 * the output filter declares a local array of OUTPUT_XLATE_BUF_SIZE bytes,
 * and find_code_page() allocates INPUT_XLATE_BUF_SIZE bytes for each input
 * filter context.
 */
#define OUTPUT_XLATE_BUF_SIZE (16*1024) /* size of translation buffer used on output */
#define INPUT_XLATE_BUF_SIZE  (8*1024)  /* size of translation buffer used on input */

/* xlate_brigade() stops consuming buckets as soon as fewer than this many
 * bytes of the translation buffer remain free.
 */
#define XLATE_MIN_BUFF_LEFT 128  /* flush once there is no more than this much
                                  * space left in the translation buffer
                                  */

/* Capacity of the per-context buffer used to rebuild a character that was
 * split across buckets; set_aside_partial_char() only accepts partial
 * characters strictly shorter than this.
 */
#define FATTEST_CHAR  8          /* we don't handle chars wider than this that straddle
                                  * two buckets
                                  */
54
/* extended error status codes; this is used in addition to an apr_status_t to
 * track errors in the translation filter
 *
 * The value is stored in charset_filter_ctx_t.ees by whichever helper detects
 * the problem, and log_xlate_error() later turns it into a log message.
 */
typedef enum {
    EES_INIT = 0,   /* no error info yet; value must be 0 for easy init */
    EES_LIMIT,      /* built-in restriction encountered */
    EES_INCOMPLETE_CHAR, /* incomplete multi-byte char at end of content */
    EES_BUCKET_READ, /* apr_bucket_read() failed on an input bucket */
    EES_DOWNSTREAM, /* something bad happened in a filter below xlate */
    EES_BAD_INPUT   /* input data invalid */
} ees_t;
66
/* registered name of the output translation filter; the code in this file
 * compares filter names case-insensitively (strcasecmp)
 */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* registered name of the input translation filter */
#define XLATEIN_FILTER_NAME  "XLATEIN"
71
/* Per-directory configuration of this module.
 *
 * An instance is created zero-filled by create_charset_dir_conf(), so a NULL
 * charset pointer or an *_INIT enum value means "not set in this container";
 * merge_charset_dir_conf() then inherits the value from the enclosing
 * container.
 */
typedef struct charset_dir_t {
    const char *charset_source; /* source encoding (CharsetSourceEnc) */
    const char *charset_default; /* how to ship on wire (CharsetDefault) */
    /** module does ap_add_*_filter()?  (CharsetOptions [No]ImplicitAdd) */
    enum {IA_INIT, IA_IMPADD, IA_NOIMPADD} implicit_add;
    /** treat all mimetypes as text?  (CharsetOptions [No]TranslateAllMimeTypes) */
    enum {FX_INIT, FX_FORCE, FX_NOFORCE} force_xlate;
} charset_dir_t;
80
/* charset_filter_ctx_t is created for each filter instance; because the same
 * filter code is used for translating in both directions, we need this context
 * data to tell the filter which translation handle to use; it also can hold a
 * character which was split between buckets
 *
 * Contexts are allocated zero-filled (apr_pcalloc), so every field starts
 * out as 0/NULL until explicitly set.
 */
typedef struct charset_filter_ctx_t {
    apr_xlate_t *xlate;     /* translation handle from apr_xlate_open() */
    int is_sb;              /* single-byte translation? */
    charset_dir_t *dc;      /* directory configuration this instance uses */
    ees_t ees;              /* extended error status */
    apr_size_t saved;       /* number of bytes of a partial char held in buf */
    char buf[FATTEST_CHAR]; /* we want to be able to build a complete char here */
    int ran;                /* has filter instance run before? */
    int noop;               /* should we pass brigades through unchanged? */
    char *tmp;              /* buffer for input filtering */
    apr_bucket_brigade *bb; /* input buckets we couldn't finish translating */
    apr_bucket_brigade *tmpbb; /* used for passing downstream */
} charset_filter_ctx_t;
99
/* charset_req_t is available via r->request_config if any translation is
 * being performed
 *
 * find_code_page() allocates it together with the output filter context in
 * one block; the input filter context is allocated separately and only for
 * PUT and POST requests, so input_ctx may be NULL.
 */
typedef struct charset_req_t {
    charset_dir_t *dc;      /* directory configuration in effect for the request */
    charset_filter_ctx_t *output_ctx, *input_ctx; /* per-direction filter contexts */
} charset_req_t;
107
/* Declared up front so that the ap_get_module_config() and
 * ap_set_module_config() calls in this file can name the module record.
 */
module AP_MODULE_DECLARE_DATA charset_lite_module;
109
110static void *create_charset_dir_conf(apr_pool_t *p,char *dummy)
111{
112    charset_dir_t *dc = (charset_dir_t *)apr_pcalloc(p,sizeof(charset_dir_t));
113
114    return dc;
115}
116
117static void *merge_charset_dir_conf(apr_pool_t *p, void *basev, void *overridesv)
118{
119    charset_dir_t *a = (charset_dir_t *)apr_pcalloc (p, sizeof(charset_dir_t));
120    charset_dir_t *base = (charset_dir_t *)basev,
121        *over = (charset_dir_t *)overridesv;
122
123    /* If it is defined in the current container, use it.  Otherwise, use the one
124     * from the enclosing container.
125     */
126
127    a->charset_default =
128        over->charset_default ? over->charset_default : base->charset_default;
129    a->charset_source =
130        over->charset_source ? over->charset_source : base->charset_source;
131    a->implicit_add =
132        over->implicit_add != IA_INIT ? over->implicit_add : base->implicit_add;
133    a->force_xlate=
134        over->force_xlate != FX_INIT ? over->force_xlate : base->force_xlate;
135    return a;
136}
137
138/* CharsetSourceEnc charset
139 */
140static const char *add_charset_source(cmd_parms *cmd, void *in_dc,
141                                      const char *name)
142{
143    charset_dir_t *dc = in_dc;
144
145    dc->charset_source = name;
146    return NULL;
147}
148
149/* CharsetDefault charset
150 */
151static const char *add_charset_default(cmd_parms *cmd, void *in_dc,
152                                       const char *name)
153{
154    charset_dir_t *dc = in_dc;
155
156    dc->charset_default = name;
157    return NULL;
158}
159
160/* CharsetOptions optionflag...
161 */
162static const char *add_charset_options(cmd_parms *cmd, void *in_dc,
163                                       const char *flag)
164{
165    charset_dir_t *dc = in_dc;
166
167    if (!strcasecmp(flag, "ImplicitAdd")) {
168        dc->implicit_add = IA_IMPADD;
169    }
170    else if (!strcasecmp(flag, "NoImplicitAdd")) {
171        dc->implicit_add = IA_NOIMPADD;
172    }
173    else if (!strcasecmp(flag, "TranslateAllMimeTypes")) {
174        dc->force_xlate = FX_FORCE;
175    }
176    else if (!strcasecmp(flag, "NoTranslateAllMimeTypes")) {
177        dc->force_xlate = FX_NOFORCE;
178    }
179    else {
180        return apr_pstrcat(cmd->temp_pool,
181                           "Invalid CharsetOptions option: ",
182                           flag,
183                           NULL);
184    }
185
186    return NULL;
187}
188
189/* find_code_page() is a fixup hook that checks if the module is
190 * configured and the input or output potentially need to be translated.
191 * If so, context is initialized for the filters.
192 */
193static int find_code_page(request_rec *r)
194{
195    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
196                                             &charset_lite_module);
197    charset_req_t *reqinfo;
198    charset_filter_ctx_t *input_ctx, *output_ctx;
199    apr_status_t rv;
200
201    ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
202                  "uri: %s file: %s method: %d "
203                  "imt: %s flags: %s%s%s %s->%s",
204                  r->uri,
205                  r->filename ? r->filename : "(none)",
206                  r->method_number,
207                  r->content_type ? r->content_type : "(unknown)",
208                  r->main     ? "S" : "",    /* S if subrequest */
209                  r->prev     ? "R" : "",    /* R if redirect */
210                  r->proxyreq ? "P" : "",    /* P if proxy */
211                  dc->charset_source, dc->charset_default);
212
213    /* If we don't have a full directory configuration, bail out.
214     */
215    if (!dc->charset_source || !dc->charset_default) {
216        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01448)
217                      "incomplete configuration: src %s, dst %s",
218                      dc->charset_source ? dc->charset_source : "unspecified",
219                      dc->charset_default ? dc->charset_default : "unspecified");
220        return DECLINED;
221    }
222
223    /* catch proxy requests */
224    if (r->proxyreq) {
225        return DECLINED;
226    }
227
228    /* mod_rewrite indicators */
229    if (r->filename
230        && (!strncmp(r->filename, "redirect:", 9)
231            || !strncmp(r->filename, "gone:", 5)
232            || !strncmp(r->filename, "passthrough:", 12)
233            || !strncmp(r->filename, "forbidden:", 10))) {
234        return DECLINED;
235    }
236
237    /* no translation when server and network charsets are set to the same value */
238    if (!strcasecmp(dc->charset_source, dc->charset_default)) {
239        return DECLINED;
240    }
241
242    /* Get storage for the request data and the output filter context.
243     * We rarely need the input filter context, so allocate that separately.
244     */
245    reqinfo = (charset_req_t *)apr_pcalloc(r->pool,
246                                           sizeof(charset_req_t) +
247                                           sizeof(charset_filter_ctx_t));
248    output_ctx = (charset_filter_ctx_t *)(reqinfo + 1);
249
250    reqinfo->dc = dc;
251    output_ctx->dc = dc;
252    output_ctx->tmpbb = apr_brigade_create(r->pool,
253                                           r->connection->bucket_alloc);
254    ap_set_module_config(r->request_config, &charset_lite_module, reqinfo);
255
256    reqinfo->output_ctx = output_ctx;
257
258    switch (r->method_number) {
259    case M_PUT:
260    case M_POST:
261        /* Set up input translation.  Note: A request body can be included
262         * with the OPTIONS method, but for now we don't set up translation
263         * of it.
264         */
265        input_ctx = apr_pcalloc(r->pool, sizeof(charset_filter_ctx_t));
266        input_ctx->bb = apr_brigade_create(r->pool,
267                                           r->connection->bucket_alloc);
268        input_ctx->tmp = apr_palloc(r->pool, INPUT_XLATE_BUF_SIZE);
269        input_ctx->dc = dc;
270        reqinfo->input_ctx = input_ctx;
271        rv = apr_xlate_open(&input_ctx->xlate, dc->charset_source,
272                            dc->charset_default, r->pool);
273        if (rv != APR_SUCCESS) {
274            ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01449)
275                          "can't open translation %s->%s",
276                          dc->charset_default, dc->charset_source);
277            return HTTP_INTERNAL_SERVER_ERROR;
278        }
279        if (apr_xlate_sb_get(input_ctx->xlate, &input_ctx->is_sb) != APR_SUCCESS) {
280            input_ctx->is_sb = 0;
281        }
282    }
283
284    return DECLINED;
285}
286
287static int configured_in_list(request_rec *r, const char *filter_name,
288                              struct ap_filter_t *filter_list)
289{
290    struct ap_filter_t *filter = filter_list;
291
292    while (filter) {
293        if (!strcasecmp(filter_name, filter->frec->name)) {
294            return 1;
295        }
296        filter = filter->next;
297    }
298    return 0;
299}
300
301static int configured_on_input(request_rec *r, const char *filter_name)
302{
303    return configured_in_list(r, filter_name, r->input_filters);
304}
305
306static int configured_on_output(request_rec *r, const char *filter_name)
307{
308    return configured_in_list(r, filter_name, r->output_filters);
309}
310
311/* xlate_insert_filter() is a filter hook which decides whether or not
312 * to insert a translation filter for the current request.
313 */
314static void xlate_insert_filter(request_rec *r)
315{
316    /* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
317    charset_req_t *reqinfo = ap_get_module_config(r->request_config,
318                                                  &charset_lite_module);
319    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
320                                             &charset_lite_module);
321
322    if (dc && (dc->implicit_add == IA_NOIMPADD)) {
323        ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, r,
324                      "xlate output filter not added implicitly because "
325                      "CharsetOptions included 'NoImplicitAdd'");
326        return;
327    }
328
329    if (reqinfo) {
330        if (reqinfo->output_ctx && !configured_on_output(r, XLATEOUT_FILTER_NAME)) {
331            ap_add_output_filter(XLATEOUT_FILTER_NAME, reqinfo->output_ctx, r,
332                                 r->connection);
333        }
334        ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
335                      "xlate output filter not added implicitly because %s",
336                      !reqinfo->output_ctx ?
337                      "no output configuration available" :
338                      "another module added the filter");
339
340        if (reqinfo->input_ctx && !configured_on_input(r, XLATEIN_FILTER_NAME)) {
341            ap_add_input_filter(XLATEIN_FILTER_NAME, reqinfo->input_ctx, r,
342                                r->connection);
343        }
344        ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
345                      "xlate input filter not added implicitly because %s",
346                      !reqinfo->input_ctx ?
347                      "no input configuration available" :
348                      "another module added the filter");
349    }
350}
351
352/* stuff that sucks that I know of:
353 *
354 * bucket handling:
355 *  why create an eos bucket when we see it come down the stream?  just send the one
356 *  passed as input...  news flash: this will be fixed when xlate_out_filter() starts
357 *  using the more generic xlate_brigade()
358 *
359 * translation mechanics:
360 *   we don't handle characters that straddle more than two buckets; an error
361 *   will be generated
362 */
363
364static apr_status_t send_bucket_downstream(ap_filter_t *f, apr_bucket *b)
365{
366    charset_filter_ctx_t *ctx = f->ctx;
367    apr_status_t rv;
368
369    APR_BRIGADE_INSERT_TAIL(ctx->tmpbb, b);
370    rv = ap_pass_brigade(f->next, ctx->tmpbb);
371    if (rv != APR_SUCCESS) {
372        ctx->ees = EES_DOWNSTREAM;
373    }
374    apr_brigade_cleanup(ctx->tmpbb);
375    return rv;
376}
377
378/* send_downstream() is passed the translated data; it puts it in a single-
379 * bucket brigade and passes the brigade to the next filter
380 */
381static apr_status_t send_downstream(ap_filter_t *f, const char *tmp, apr_size_t len)
382{
383    request_rec *r = f->r;
384    conn_rec *c = r->connection;
385    apr_bucket *b;
386
387    b = apr_bucket_transient_create(tmp, len, c->bucket_alloc);
388    return send_bucket_downstream(f, b);
389}
390
391static apr_status_t send_eos(ap_filter_t *f)
392{
393    request_rec *r = f->r;
394    conn_rec *c = r->connection;
395    apr_bucket_brigade *bb;
396    apr_bucket *b;
397    charset_filter_ctx_t *ctx = f->ctx;
398    apr_status_t rv;
399
400    bb = apr_brigade_create(r->pool, c->bucket_alloc);
401    b = apr_bucket_eos_create(c->bucket_alloc);
402    APR_BRIGADE_INSERT_TAIL(bb, b);
403    rv = ap_pass_brigade(f->next, bb);
404    if (rv != APR_SUCCESS) {
405        ctx->ees = EES_DOWNSTREAM;
406    }
407    return rv;
408}
409
410static apr_status_t set_aside_partial_char(charset_filter_ctx_t *ctx,
411                                           const char *partial,
412                                           apr_size_t partial_len)
413{
414    apr_status_t rv;
415
416    if (sizeof(ctx->buf) > partial_len) {
417        ctx->saved = partial_len;
418        memcpy(ctx->buf, partial, partial_len);
419        rv = APR_SUCCESS;
420    }
421    else {
422        rv = APR_INCOMPLETE;
423        ctx->ees = EES_LIMIT; /* we don't handle chars this wide which straddle
424                               * buckets
425                               */
426    }
427    return rv;
428}
429
430static apr_status_t finish_partial_char(charset_filter_ctx_t *ctx,
431                                        /* input buffer: */
432                                        const char **cur_str,
433                                        apr_size_t *cur_len,
434                                        /* output buffer: */
435                                        char **out_str,
436                                        apr_size_t *out_len)
437{
438    apr_status_t rv;
439    apr_size_t tmp_input_len;
440
441    /* Keep adding bytes from the input string to the saved string until we
442     *    1) finish the input char
443     *    2) get an error
444     * or 3) run out of bytes to add
445     */
446
447    do {
448        ctx->buf[ctx->saved] = **cur_str;
449        ++ctx->saved;
450        ++*cur_str;
451        --*cur_len;
452        tmp_input_len = ctx->saved;
453        rv = apr_xlate_conv_buffer(ctx->xlate,
454                                   ctx->buf,
455                                   &tmp_input_len,
456                                   *out_str,
457                                   out_len);
458    } while (rv == APR_INCOMPLETE && *cur_len);
459
460    if (rv == APR_SUCCESS) {
461        ctx->saved = 0;
462    }
463    else {
464        ctx->ees = EES_LIMIT; /* code isn't smart enough to handle chars
465                               * straddling more than two buckets
466                               */
467    }
468
469    return rv;
470}
471
472static void log_xlate_error(ap_filter_t *f, apr_status_t rv)
473{
474    charset_filter_ctx_t *ctx = f->ctx;
475    const char *msg;
476    char msgbuf[100];
477    apr_size_t len;
478
479    switch(ctx->ees) {
480    case EES_LIMIT:
481        rv = 0;
482        msg = APLOGNO(02193) "xlate filter - a built-in restriction was encountered";
483        break;
484    case EES_BAD_INPUT:
485        rv = 0;
486        msg = APLOGNO(02194) "xlate filter - an input character was invalid";
487        break;
488    case EES_BUCKET_READ:
489        rv = 0;
490        msg = APLOGNO(02195) "xlate filter - bucket read routine failed";
491        break;
492    case EES_INCOMPLETE_CHAR:
493        rv = 0;
494        strcpy(msgbuf, APLOGNO(02196) "xlate filter - incomplete char at end of input - ");
495        len = ctx->saved;
496
497        /* We must ensure not to process more than what would fit in the
498         * remaining of the destination buffer, including terminating NULL */
499        if (len > (sizeof(msgbuf) - strlen(msgbuf) - 1) / 2)
500            len = (sizeof(msgbuf) - strlen(msgbuf) - 1) / 2;
501
502        ap_bin2hex(ctx->buf, len, msgbuf + strlen(msgbuf));
503        msg = msgbuf;
504        break;
505    case EES_DOWNSTREAM:
506        msg = APLOGNO(02197) "xlate filter - an error occurred in a lower filter";
507        break;
508    default:
509        msg = APLOGNO(02198) "xlate filter - returning error";
510    }
511    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, "%s", msg);
512}
513
514/* chk_filter_chain() is called once per filter instance; it tries to
515 * determine if the current filter instance should be disabled because
516 * its translation is incompatible with the translation of an existing
517 * instance of the translate filter
518 *
519 * Example bad scenario:
520 *
521 *   configured filter chain for the request:
522 *     INCLUDES XLATEOUT(8859-1->UTS-16)
523 *   configured filter chain for the subrequest:
524 *     XLATEOUT(8859-1->UTS-16)
525 *
526 *   When the subrequest is processed, the filter chain will be
527 *     XLATEOUT(8859-1->UTS-16) XLATEOUT(8859-1->UTS-16)
528 *   This makes no sense, so the instance of XLATEOUT added for the
529 *   subrequest will be noop-ed.
530 *
531 * Example good scenario:
532 *
533 *   configured filter chain for the request:
534 *     INCLUDES XLATEOUT(8859-1->UTS-16)
535 *   configured filter chain for the subrequest:
536 *     XLATEOUT(IBM-1047->8859-1)
537 *
538 *   When the subrequest is processed, the filter chain will be
539 *     XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTS-16)
540 *   This makes sense, so the instance of XLATEOUT added for the
541 *   subrequest will be left alone and it will translate from
542 *   IBM-1047->8859-1.
543 */
544static void chk_filter_chain(ap_filter_t *f)
545{
546    ap_filter_t *curf;
547    charset_filter_ctx_t *curctx, *last_xlate_ctx = NULL,
548        *ctx = f->ctx;
549    int output = !strcasecmp(f->frec->name, XLATEOUT_FILTER_NAME);
550
551    if (ctx->noop) {
552        return;
553    }
554
555    /* walk the filter chain; see if it makes sense for our filter to
556     * do any translation
557     */
558    curf = output ? f->r->output_filters : f->r->input_filters;
559    while (curf) {
560        if (!strcasecmp(curf->frec->name, f->frec->name) &&
561            curf->ctx) {
562            curctx = (charset_filter_ctx_t *)curf->ctx;
563            if (!last_xlate_ctx) {
564                last_xlate_ctx = curctx;
565            }
566            else {
567                if (strcmp(last_xlate_ctx->dc->charset_default,
568                           curctx->dc->charset_source)) {
569                    /* incompatible translation
570                     * if our filter instance is incompatible with an instance
571                     * already in place, noop our instance
572                     * Notes:
573                     * . We are only willing to noop our own instance.
574                     * . It is possible to noop another instance which has not
575                     *   yet run, but this is not currently implemented.
576                     *   Hopefully it will not be needed.
577                     * . It is not possible to noop an instance which has
578                     *   already run.
579                     */
580                    if (last_xlate_ctx == f->ctx) {
581                        last_xlate_ctx->noop = 1;
582                        if (APLOGrtrace1(f->r)) {
583                            const char *symbol = output ? "->" : "<-";
584
585                            ap_log_rerror(APLOG_MARK, APLOG_DEBUG,
586                                          0, f->r, APLOGNO(01451)
587                                          "%s %s - disabling "
588                                          "translation %s%s%s; existing "
589                                          "translation %s%s%s",
590                                          f->r->uri ? "uri" : "file",
591                                          f->r->uri ? f->r->uri : f->r->filename,
592                                          last_xlate_ctx->dc->charset_source,
593                                          symbol,
594                                          last_xlate_ctx->dc->charset_default,
595                                          curctx->dc->charset_source,
596                                          symbol,
597                                          curctx->dc->charset_default);
598                        }
599                    }
600                    else {
601                        const char *symbol = output ? "->" : "<-";
602
603                        ap_log_rerror(APLOG_MARK, APLOG_ERR,
604                                      0, f->r, APLOGNO(01452)
605                                      "chk_filter_chain() - can't disable "
606                                      "translation %s%s%s; existing "
607                                      "translation %s%s%s",
608                                      last_xlate_ctx->dc->charset_source,
609                                      symbol,
610                                      last_xlate_ctx->dc->charset_default,
611                                      curctx->dc->charset_source,
612                                      symbol,
613                                      curctx->dc->charset_default);
614                    }
615                    break;
616                }
617            }
618        }
619        curf = curf->next;
620    }
621}
622
/* xlate_brigade() is used to filter request and response bodies
 *
 * Buckets are taken from the front of the brigade one at a time, their
 * contents are translated into the caller's buffer, and each bucket is
 * deleted once it has been fully consumed.
 *
 * we'll stop when one of the following occurs:
 * . we run out of buckets
 * . we run out of space in the output buffer (fewer than
 *   XLATE_MIN_BUFF_LEFT bytes remain)
 * . we hit an error or metadata (any metadata bucket stops processing,
 *   but only an EOS bucket is reported through hit_eos)
 *
 * inputs:
 *   ctx:              filter context; supplies the translation handle and
 *                     holds any partial character carried between buckets
 *   bb:               brigade to process
 *   buffer:           storage to hold the translated characters
 *   buffer_avail:     size of buffer
 *
 * outputs:
 *   return value:     APR_SUCCESS or some error code; on error, ctx->ees
 *                     is set here or by the helper that detected the
 *                     problem, except when apr_xlate_conv_buffer() itself
 *                     fails with something other than APR_INCOMPLETE
 *   bb:               we've removed any buckets representing the
 *                     translated characters; the eos bucket, if
 *                     present, will be left in the brigade
 *   buffer:           filled in with translated characters
 *   buffer_avail:     updated with the bytes remaining
 *   hit_eos:          did we hit an EOS bucket?
 */
static apr_status_t xlate_brigade(charset_filter_ctx_t *ctx,
                                  apr_bucket_brigade *bb,
                                  char *buffer,
                                  apr_size_t *buffer_avail,
                                  int *hit_eos)
{
    apr_bucket *b = NULL; /* set to NULL only to quiet some gcc */
    apr_bucket *consumed_bucket; /* bucket whose data we are working through */
    const char *bucket;          /* read position within the current bucket's data */
    apr_size_t bytes_in_bucket; /* total bytes read from current bucket */
    apr_size_t bucket_avail;    /* bytes left in current bucket */
    apr_status_t rv = APR_SUCCESS;

    *hit_eos = 0;
    bucket_avail = 0;
    consumed_bucket = NULL;
    while (1) {
        if (!bucket_avail) { /* no bytes left to process in the current bucket... */
            /* ...so drop the bucket we just finished, if any, and fetch the
             * next one from the front of the brigade
             */
            if (consumed_bucket) {
                apr_bucket_delete(consumed_bucket);
                consumed_bucket = NULL;
            }
            b = APR_BRIGADE_FIRST(bb);
            if (b == APR_BRIGADE_SENTINEL(bb) ||
                APR_BUCKET_IS_METADATA(b)) {
                /* brigade exhausted, or a metadata bucket is next; either
                 * way there is no more data we can translate right now
                 */
                break;
            }
            rv = apr_bucket_read(b, &bucket, &bytes_in_bucket, APR_BLOCK_READ);
            if (rv != APR_SUCCESS) {
                ctx->ees = EES_BUCKET_READ;
                break;
            }
            bucket_avail = bytes_in_bucket;
            consumed_bucket = b;   /* for axing when we're done reading it */
        }
        /* a zero-length bucket leaves bucket_avail at 0; it is skipped here
         * and deleted on the next pass through the loop
         */
        if (bucket_avail) {
            /* We've got data, so translate it. */
            if (ctx->saved) {
                /* Rats... we need to finish a partial character from the previous
                 * bucket.
                 *
                 * Strangely, finish_partial_char() increments the input buffer
                 * pointer but does not increment the output buffer pointer.
                 */
                apr_size_t old_buffer_avail = *buffer_avail;
                rv = finish_partial_char(ctx,
                                         &bucket, &bucket_avail,
                                         &buffer, buffer_avail);
                /* advance the output pointer by the bytes just produced */
                buffer += old_buffer_avail - *buffer_avail;
            }
            else {
                apr_size_t old_buffer_avail = *buffer_avail;
                apr_size_t old_bucket_avail = bucket_avail;
                rv = apr_xlate_conv_buffer(ctx->xlate,
                                           bucket, &bucket_avail,
                                           buffer,
                                           buffer_avail);
                /* the converter only updates the two counts; move both
                 * pointers forward by what was produced and consumed
                 */
                buffer  += old_buffer_avail - *buffer_avail;
                bucket  += old_bucket_avail - bucket_avail;

                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
                    /* We need to save the final byte(s) for next time; we can't
                     * convert it until we look at the next bucket.
                     */
                    rv = set_aside_partial_char(ctx, bucket, bucket_avail);
                    /* the leftover bytes now live in ctx->buf (or the call
                     * failed), so this bucket counts as fully consumed
                     */
                    bucket_avail = 0;
                }
            }
            if (rv != APR_SUCCESS) {
                /* bad input byte or partial char too big to store */
                break;
            }
            if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
                /* if any data remains in the current bucket, split there */
                if (bucket_avail) {
                    apr_bucket_split(b, bytes_in_bucket - bucket_avail);
                }
                /* delete the part we have already translated; anything
                 * split off above stays in the brigade for the next call
                 */
                apr_bucket_delete(b);
                break;
            }
        }
    }

    /* Whatever stopped the loop, check whether EOS is now at the front of
     * the brigade.
     */
    if (!APR_BRIGADE_EMPTY(bb)) {
        b = APR_BRIGADE_FIRST(bb);
        if (APR_BUCKET_IS_EOS(b)) {
            /* Leave the eos bucket in the brigade for reporting to
             * subsequent filters.
             */
            *hit_eos = 1;
            if (ctx->saved) {
                /* Oops... we have a partial char from the previous bucket
                 * that won't be completed because there's no more data.
                 */
                rv = APR_INCOMPLETE;
                ctx->ees = EES_INCOMPLETE_CHAR;
            }
        }
    }

    return rv;
}
747
748/* xlate_out_filter() handles (almost) arbitrary conversions from one charset
749 * to another...
750 * translation is determined in the fixup hook (find_code_page), which is
751 * where the filter's context data is set up... the context data gives us
752 * the translation handle
753 */
754static apr_status_t xlate_out_filter(ap_filter_t *f, apr_bucket_brigade *bb)
755{
756    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
757                                                  &charset_lite_module);
758    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
759                                             &charset_lite_module);
760    charset_filter_ctx_t *ctx = f->ctx;
761    apr_bucket *dptr, *consumed_bucket;
762    const char *cur_str;
763    apr_size_t cur_len, cur_avail;
764    char tmp[OUTPUT_XLATE_BUF_SIZE];
765    apr_size_t space_avail;
766    int done;
767    apr_status_t rv = APR_SUCCESS;
768
769    if (!ctx) {
770        /* this is SetOutputFilter path; grab the preallocated context,
771         * if any; note that if we decided not to do anything in an earlier
772         * handler, we won't even have a reqinfo
773         */
774        if (reqinfo) {
775            ctx = f->ctx = reqinfo->output_ctx;
776            reqinfo->output_ctx = NULL; /* prevent SNAFU if user coded us twice
777                                         * in the filter chain; we can't have two
778                                         * instances using the same context
779                                         */
780        }
781        if (!ctx) {                   /* no idea how to translate; don't do anything */
782            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
783            ctx->dc = dc;
784            ctx->noop = 1;
785        }
786    }
787
788    /* Check the mime type to see if translation should be performed.
789     */
790    if (!ctx->noop && ctx->xlate == NULL) {
791        const char *mime_type = f->r->content_type;
792
793        if (mime_type && (strncasecmp(mime_type, "text/", 5) == 0 ||
794#if APR_CHARSET_EBCDIC
795        /* On an EBCDIC machine, be willing to translate mod_autoindex-
796         * generated output.  Otherwise, it doesn't look too cool.
797         *
798         * XXX This isn't a perfect fix because this doesn't trigger us
799         * to convert from the charset of the source code to ASCII.  The
800         * general solution seems to be to allow a generator to set an
801         * indicator in the r specifying that the body is coded in the
802         * implementation character set (i.e., the charset of the source
803         * code).  This would get several different types of documents
804         * translated properly: mod_autoindex output, mod_status output,
805         * mod_info output, hard-coded error documents, etc.
806         */
807            strcmp(mime_type, DIR_MAGIC_TYPE) == 0 ||
808#endif
809            strncasecmp(mime_type, "message/", 8) == 0 ||
810            dc->force_xlate == FX_FORCE)) {
811
812            rv = apr_xlate_open(&ctx->xlate,
813                                dc->charset_default, dc->charset_source, f->r->pool);
814            if (rv != APR_SUCCESS) {
815                ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01453)
816                              "can't open translation %s->%s",
817                              dc->charset_source, dc->charset_default);
818                ctx->noop = 1;
819            }
820            else {
821                if (apr_xlate_sb_get(ctx->xlate, &ctx->is_sb) != APR_SUCCESS) {
822                    ctx->is_sb = 0;
823                }
824            }
825        }
826        else {
827            ctx->noop = 1;
828            if (mime_type) {
829                ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
830                              "mime type is %s; no translation selected",
831                              mime_type);
832            }
833        }
834    }
835
836    ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
837                  "xlate_out_filter() - "
838                  "charset_source: %s charset_default: %s",
839                  dc && dc->charset_source ? dc->charset_source : "(none)",
840                  dc && dc->charset_default ? dc->charset_default : "(none)");
841
842    if (!ctx->ran) {  /* filter never ran before */
843        chk_filter_chain(f);
844        ctx->ran = 1;
845        if (!ctx->noop && !ctx->is_sb) {
846            /* We're not converting between two single-byte charsets, so unset
847             * Content-Length since it is unlikely to remain the same.
848             */
849            apr_table_unset(f->r->headers_out, "Content-Length");
850        }
851    }
852
853    if (ctx->noop) {
854        return ap_pass_brigade(f->next, bb);
855    }
856
857    dptr = APR_BRIGADE_FIRST(bb);
858    done = 0;
859    cur_len = 0;
860    space_avail = sizeof(tmp);
861    consumed_bucket = NULL;
862    while (!done) {
863        if (!cur_len) { /* no bytes left to process in the current bucket... */
864            if (consumed_bucket) {
865                apr_bucket_delete(consumed_bucket);
866                consumed_bucket = NULL;
867            }
868            if (dptr == APR_BRIGADE_SENTINEL(bb)) {
869                break;
870            }
871            if (APR_BUCKET_IS_EOS(dptr)) {
872                cur_len = -1; /* XXX yuck, but that tells us to send
873                                 * eos down; when we minimize our bb construction
874                                 * we'll fix this crap */
875                if (ctx->saved) {
876                    /* Oops... we have a partial char from the previous bucket
877                     * that won't be completed because there's no more data.
878                     */
879                    rv = APR_INCOMPLETE;
880                    ctx->ees = EES_INCOMPLETE_CHAR;
881                }
882                break;
883            }
884            if (APR_BUCKET_IS_METADATA(dptr)) {
885                apr_bucket *metadata_bucket;
886                metadata_bucket = dptr;
887                dptr = APR_BUCKET_NEXT(dptr);
888                APR_BUCKET_REMOVE(metadata_bucket);
889                rv = send_bucket_downstream(f, metadata_bucket);
890                if (rv != APR_SUCCESS) {
891                    done = 1;
892                }
893                continue;
894            }
895            rv = apr_bucket_read(dptr, &cur_str, &cur_len, APR_BLOCK_READ);
896            if (rv != APR_SUCCESS) {
897                ctx->ees = EES_BUCKET_READ;
898                break;
899            }
900            consumed_bucket = dptr; /* for axing when we're done reading it */
901            dptr = APR_BUCKET_NEXT(dptr); /* get ready for when we access the
902                                          * next bucket */
903        }
904        /* Try to fill up our tmp buffer with translated data. */
905        cur_avail = cur_len;
906
907        if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
908            if (ctx->saved) {
909                /* Rats... we need to finish a partial character from the previous
910                 * bucket.
911                 */
912                char *tmp_tmp;
913
914                tmp_tmp = tmp + sizeof(tmp) - space_avail;
915                rv = finish_partial_char(ctx,
916                                         &cur_str, &cur_len,
917                                         &tmp_tmp, &space_avail);
918            }
919            else {
920                rv = apr_xlate_conv_buffer(ctx->xlate,
921                                           cur_str, &cur_avail,
922                                           tmp + sizeof(tmp) - space_avail, &space_avail);
923
924                /* Update input ptr and len after consuming some bytes */
925                cur_str += cur_len - cur_avail;
926                cur_len = cur_avail;
927
928                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
929                    /* We need to save the final byte(s) for next time; we can't
930                     * convert it until we look at the next bucket.
931                     */
932                    rv = set_aside_partial_char(ctx, cur_str, cur_len);
933                    cur_len = 0;
934                }
935            }
936        }
937
938        if (rv != APR_SUCCESS) {
939            /* bad input byte or partial char too big to store */
940            done = 1;
941        }
942
943        if (space_avail < XLATE_MIN_BUFF_LEFT) {
944            /* It is time to flush, as there is not enough space left in the
945             * current output buffer to bother with converting more data.
946             */
947            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
948            if (rv != APR_SUCCESS) {
949                done = 1;
950            }
951
952            /* tmp is now empty */
953            space_avail = sizeof(tmp);
954        }
955    }
956
957    if (rv == APR_SUCCESS) {
958        if (space_avail < sizeof(tmp)) { /* gotta write out what we converted */
959            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
960        }
961    }
962    if (rv == APR_SUCCESS) {
963        if (cur_len == -1) {
964            rv = send_eos(f);
965        }
966    }
967    else {
968        log_xlate_error(f, rv);
969    }
970
971    return rv;
972}
973
974static apr_status_t xlate_in_filter(ap_filter_t *f, apr_bucket_brigade *bb,
975                                    ap_input_mode_t mode, apr_read_type_e block,
976                                    apr_off_t readbytes)
977{
978    apr_status_t rv;
979    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
980                                                  &charset_lite_module);
981    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
982                                             &charset_lite_module);
983    charset_filter_ctx_t *ctx = f->ctx;
984    apr_size_t buffer_size;
985    int hit_eos;
986
987    if (!ctx) {
988        /* this is SetInputFilter path; grab the preallocated context,
989         * if any; note that if we decided not to do anything in an earlier
990         * handler, we won't even have a reqinfo
991         */
992        if (reqinfo) {
993            ctx = f->ctx = reqinfo->input_ctx;
994            reqinfo->input_ctx = NULL; /* prevent SNAFU if user coded us twice
995                                        * in the filter chain; we can't have two
996                                        * instances using the same context
997                                        */
998        }
999        if (!ctx) {                   /* no idea how to translate; don't do anything */
1000            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
1001            ctx->dc = dc;
1002            ctx->noop = 1;
1003        }
1004    }
1005
1006    ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
1007                 "xlate_in_filter() - "
1008                 "charset_source: %s charset_default: %s",
1009                 dc && dc->charset_source ? dc->charset_source : "(none)",
1010                 dc && dc->charset_default ? dc->charset_default : "(none)");
1011
1012    if (!ctx->ran) {  /* filter never ran before */
1013        chk_filter_chain(f);
1014        ctx->ran = 1;
1015        if (!ctx->noop && !ctx->is_sb
1016            && apr_table_get(f->r->headers_in, "Content-Length")) {
1017            /* A Content-Length header is present, but it won't be valid after
1018             * conversion because we're not converting between two single-byte
1019             * charsets.  This will affect most CGI scripts and may affect
1020             * some modules.
1021             * Content-Length can't be unset here because that would break
1022             * being able to read the request body.
1023             * Processing of chunked request bodies is not impacted by this
1024             * filter since the the length was not declared anyway.
1025             */
1026            ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r,
1027                          "Request body length may change, resulting in "
1028                          "misprocessing by some modules or scripts");
1029        }
1030    }
1031
1032    if (ctx->noop) {
1033        return ap_get_brigade(f->next, bb, mode, block, readbytes);
1034    }
1035
1036    if (APR_BRIGADE_EMPTY(ctx->bb)) {
1037        if ((rv = ap_get_brigade(f->next, bb, mode, block,
1038                                 readbytes)) != APR_SUCCESS) {
1039            return rv;
1040        }
1041    }
1042    else {
1043        APR_BRIGADE_PREPEND(bb, ctx->bb); /* first use the leftovers */
1044    }
1045
1046    buffer_size = INPUT_XLATE_BUF_SIZE;
1047    rv = xlate_brigade(ctx, bb, ctx->tmp, &buffer_size, &hit_eos);
1048    if (rv == APR_SUCCESS) {
1049        if (!hit_eos) {
1050            /* move anything leftover into our context for next time;
1051             * we don't currently "set aside" since the data came from
1052             * down below, but I suspect that for long-term we need to
1053             * do that
1054             */
1055            APR_BRIGADE_CONCAT(ctx->bb, bb);
1056        }
1057        if (buffer_size < INPUT_XLATE_BUF_SIZE) { /* do we have output? */
1058            apr_bucket *e;
1059
1060            e = apr_bucket_heap_create(ctx->tmp,
1061                                       INPUT_XLATE_BUF_SIZE - buffer_size,
1062                                       NULL, f->r->connection->bucket_alloc);
1063            /* make sure we insert at the head, because there may be
1064             * an eos bucket already there, and the eos bucket should
1065             * come after the data
1066             */
1067            APR_BRIGADE_INSERT_HEAD(bb, e);
1068        }
1069        else {
1070            /* XXX need to get some more data... what if the last brigade
1071             * we got had only the first byte of a multibyte char?  we need
1072             * to grab more data from the network instead of returning an
1073             * empty brigade
1074             */
1075        }
1076        /* If we have any metadata at the head of ctx->bb, go ahead and move it
1077         * onto the end of bb to be returned to our caller.
1078         */
1079        if (!APR_BRIGADE_EMPTY(ctx->bb)) {
1080            apr_bucket *b = APR_BRIGADE_FIRST(ctx->bb);
1081            while (b != APR_BRIGADE_SENTINEL(ctx->bb)
1082                   && APR_BUCKET_IS_METADATA(b)) {
1083                APR_BUCKET_REMOVE(b);
1084                APR_BRIGADE_INSERT_TAIL(bb, b);
1085                b = APR_BRIGADE_FIRST(ctx->bb);
1086            }
1087        }
1088    }
1089    else {
1090        log_xlate_error(f, rv);
1091    }
1092
1093    return rv;
1094}
1095
1096static const command_rec cmds[] =
1097{
1098    AP_INIT_TAKE1("CharsetSourceEnc",
1099                  add_charset_source,
1100                  NULL,
1101                  OR_FILEINFO,
1102                  "source (html,cgi,ssi) file charset"),
1103    AP_INIT_TAKE1("CharsetDefault",
1104                  add_charset_default,
1105                  NULL,
1106                  OR_FILEINFO,
1107                  "name of default charset"),
1108    AP_INIT_ITERATE("CharsetOptions",
1109                    add_charset_options,
1110                    NULL,
1111                    OR_FILEINFO,
1112                    "valid options: ImplicitAdd, NoImplicitAdd, TranslateAllMimeTypes, "
1113                    "NoTranslateAllMimeTypes"),
1114    {NULL}
1115};
1116
1117static void charset_register_hooks(apr_pool_t *p)
1118{
1119    ap_hook_fixups(find_code_page, NULL, NULL, APR_HOOK_MIDDLE);
1120    ap_hook_insert_filter(xlate_insert_filter, NULL, NULL, APR_HOOK_REALLY_LAST);
1121    ap_register_output_filter(XLATEOUT_FILTER_NAME, xlate_out_filter, NULL,
1122                              AP_FTYPE_RESOURCE);
1123    ap_register_input_filter(XLATEIN_FILTER_NAME, xlate_in_filter, NULL,
1124                             AP_FTYPE_RESOURCE);
1125}
1126
1127AP_DECLARE_MODULE(charset_lite) =
1128{
1129    STANDARD20_MODULE_STUFF,
1130    create_charset_dir_conf,
1131    merge_charset_dir_conf,
1132    NULL,
1133    NULL,
1134    cmds,
1135    charset_register_hooks
1136};
1137
1138