1/*      Copyright (c) 2007-11, WebThing Ltd
2 *      Copyright (c) 2011-, The Apache Software Foundation
3 *
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements.  See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License.  You may obtain a copy of the License at
10 *
11 *     http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20#if defined(WIN32)
21#define XML2ENC_DECLARE_EXPORT
22#endif
23
24#include <ctype.h>
25
26/* libxml2 */
27#include <libxml/encoding.h>
28
29#include "http_protocol.h"
30#include "http_config.h"
31#include "http_log.h"
32#include "apr_strings.h"
33#include "apr_xlate.h"
34
35#include "apr_optional.h"
36#include "mod_xml2enc.h"
37
38module AP_MODULE_DECLARE_DATA xml2enc_module;
39
40#define BUFLEN 8192
41#define BUF_MIN 4096
42#define APR_BRIGADE_DO(b,bb) for (b = APR_BRIGADE_FIRST(bb); \
43                                  b != APR_BRIGADE_SENTINEL(bb); \
44                                  b = APR_BUCKET_NEXT(b))
45
46#define ENC_INITIALISED 0x100
47#define ENC_SEEN_EOS 0x200
48#define ENC_SKIPTO ENCIO_SKIPTO
49
50#define HAVE_ENCODING(enc) \
51        (((enc)!=XML_CHAR_ENCODING_NONE)&&((enc)!=XML_CHAR_ENCODING_ERROR))
52
/*
 * XXX: Check all those ap_assert()s and replace those that should not happen
 * XXX: with AP_DEBUG_ASSERT and those that may happen with proper error
 * XXX: handling.
 */
58typedef struct {
59    xmlCharEncoding xml2enc;
60    char* buf;
61    apr_size_t bytes;
62    apr_xlate_t* convset;
63    unsigned int flags;
64    apr_off_t bblen;
65    apr_bucket_brigade* bbnext;
66    apr_bucket_brigade* bbsave;
67    const char* encoding;
68} xml2ctx;
69
70typedef struct {
71    const char* default_charset;
72    xmlCharEncoding default_encoding;
73    apr_array_header_t* skipto;
74} xml2cfg;
75
/* One element of xml2cfg.skipto: a start-element name from xml2StartParse. */
typedef struct {
    const char *val;
} tattr;
79
80static ap_regex_t* seek_meta_ctype;
81static ap_regex_t* seek_charset;
82
83static apr_status_t xml2enc_filter(request_rec* r, const char* enc,
84                                   unsigned int mode)
85{
86    /* set up a ready-initialised ctx to convert to enc, and insert filter */
87    apr_xlate_t* convset;
88    apr_status_t rv;
89    unsigned int flags = (mode ^ ENCIO);
90    if ((mode & ENCIO) == ENCIO_OUTPUT) {
91        rv = apr_xlate_open(&convset, enc, "UTF-8", r->pool);
92        flags |= ENC_INITIALISED;
93    }
94    else if ((mode & ENCIO) == ENCIO_INPUT) {
95        rv = apr_xlate_open(&convset, "UTF-8", enc, r->pool);
96        flags |= ENC_INITIALISED;
97    }
98    else if ((mode & ENCIO) == ENCIO_INPUT_CHECKS) {
99        convset = NULL;
100        rv = APR_SUCCESS; /* we'll initialise later by sniffing */
101    }
102    else {
103        rv = APR_EGENERAL;
104        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01426)
105                      "xml2enc: bad mode %x", mode);
106    }
107    if (rv == APR_SUCCESS) {
108        xml2ctx* ctx = apr_pcalloc(r->pool, sizeof(xml2ctx));
109        ctx->flags = flags;
110        if (flags & ENC_INITIALISED) {
111            ctx->convset = convset;
112            ctx->bblen = BUFLEN;
113            ctx->buf = apr_palloc(r->pool, (apr_size_t)ctx->bblen);
114        }
115        ap_add_output_filter("xml2enc", ctx, r, r->connection);
116    }
117    else {
118        ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01427)
119                      "xml2enc: Charset %s not supported.", enc) ;
120    }
121    return rv;
122}
123
124/* This needs to operate only when we're using htmlParser */
125/* Different modules may apply different rules here.  Ho, hum.  */
126static void fix_skipto(request_rec* r, xml2ctx* ctx)
127{
128    apr_status_t rv;
129    xml2cfg* cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
130    if ((cfg->skipto != NULL) && (ctx->flags | ENC_SKIPTO)) {
131        int found = 0;
132        char* p = ap_strchr(ctx->buf, '<');
133        tattr* starts = (tattr*) cfg->skipto->elts;
134        while (!found && p && *p) {
135            int i;
136            for (i = 0; i < cfg->skipto->nelts; ++i) {
137                if (!strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
138                    /* found a starting element. Strip all that comes before. */
139                    apr_bucket* b;
140                    apr_bucket* bstart;
141                    rv = apr_brigade_partition(ctx->bbsave, (p-ctx->buf),
142                                               &bstart);
143                    ap_assert(rv == APR_SUCCESS);
144                    while (b = APR_BRIGADE_FIRST(ctx->bbsave), b != bstart) {
145                        APR_BUCKET_REMOVE(b);
146                        apr_bucket_destroy(b);
147                    }
148                    ctx->bytes -= (p-ctx->buf);
149                    ctx->buf = p ;
150                    found = 1;
151                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01428)
152                                  "Skipped to first <%s> element",
153                                  starts[i].val) ;
154                    break;
155                }
156            }
157            p = ap_strchr(p+1, '<');
158        }
159        if (p == NULL) {
160            ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01429)
161                          "Failed to find start of recognised HTML!");
162        }
163    }
164}
165static void sniff_encoding(request_rec* r, xml2ctx* ctx)
166{
167    xml2cfg* cfg = NULL; /* initialise to shut compiler warnings up */
168    char* p ;
169    apr_bucket* cutb;
170    apr_bucket* cute;
171    apr_bucket* b;
172    ap_regmatch_t match[2] ;
173    apr_status_t rv;
174    const char* ctype = r->content_type;
175
176    if (ctype) {
177        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01430)
178                      "Content-Type is %s", ctype) ;
179
180        /* If we've got it in the HTTP headers, there's nothing to do */
181        if (ctype && (p = ap_strcasestr(ctype, "charset=") , p != NULL)) {
182            p += 8 ;
183            if (ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ),
184                ctx->encoding) {
185                ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01431)
186                              "Got charset %s from HTTP headers", ctx->encoding) ;
187                ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
188            }
189        }
190    }
191
192    /* to sniff, first we look for BOM */
193    if (ctx->xml2enc == XML_CHAR_ENCODING_NONE) {
194        ctx->xml2enc = xmlDetectCharEncoding((const xmlChar*)ctx->buf,
195                                             ctx->bytes);
196        if (HAVE_ENCODING(ctx->xml2enc)) {
197            ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01432)
198                          "Got charset from XML rules.") ;
199            ctx->encoding = xmlGetCharEncodingName(ctx->xml2enc);
200        }
201    }
202
203    /* If none of the above, look for a META-thingey */
204    /* also we're probably about to invalidate it, so we remove it. */
205    if (ap_regexec(seek_meta_ctype, ctx->buf, 1, match, 0) == 0 ) {
206        /* get markers on the start and end of the match */
207        rv = apr_brigade_partition(ctx->bbsave, match[0].rm_eo, &cute);
208        ap_assert(rv == APR_SUCCESS);
209        rv = apr_brigade_partition(ctx->bbsave, match[0].rm_so, &cutb);
210        ap_assert(rv == APR_SUCCESS);
211        /* now set length of useful buf for start-of-data hooks */
212        ctx->bytes = match[0].rm_so;
213        if (ctx->encoding == NULL) {
214            p = apr_pstrndup(r->pool, ctx->buf + match[0].rm_so,
215                             match[0].rm_eo - match[0].rm_so) ;
216            if (ap_regexec(seek_charset, p, 2, match, 0) == 0) {
217                if (ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
218                                               match[1].rm_eo - match[1].rm_so),
219                    ctx->encoding) {
220                    ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
221                    if (HAVE_ENCODING(ctx->xml2enc))
222                        ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01433)
223                                      "Got charset %s from HTML META", ctx->encoding) ;
224                }
225            }
226        }
227
228        /* cut out the <meta> we're invalidating */
229        while (cutb != cute) {
230            b = APR_BUCKET_NEXT(cutb);
231            APR_BUCKET_REMOVE(cutb);
232            apr_bucket_destroy(cutb);
233            cutb = b;
234        }
235        /* and leave a string */
236        ctx->buf[ctx->bytes] = 0;
237    }
238
239    /* either it's set to something we found or it's still the default */
240    /* Aaargh!  libxml2 has undocumented <META-crap> support.  So this fails
241     * if metafix is not active.  Have to make it conditional.
242     *
243     * No, that means no-metafix breaks things.  Deal immediately with
244     * this particular instance of metafix.
245     */
246    if (!HAVE_ENCODING(ctx->xml2enc)) {
247        cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
248        if (!ctx->encoding) {
249            ctx->encoding = cfg->default_charset?cfg->default_charset:"ISO-8859-1";
250        }
251        /* Unsupported charset. Can we get (iconv) support through apr_xlate? */
252        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01434)
253                      "Charset %s not supported by libxml2; trying apr_xlate",
254                      ctx->encoding);
255        if (apr_xlate_open(&ctx->convset, "UTF-8", ctx->encoding, r->pool)
256            == APR_SUCCESS) {
257            ctx->xml2enc = XML_CHAR_ENCODING_UTF8 ;
258        } else {
259            ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01435)
260                          "Charset %s not supported.  Consider aliasing it?",
261                          ctx->encoding) ;
262        }
263    }
264
265    if (!HAVE_ENCODING(ctx->xml2enc)) {
266        /* Use configuration default as a last resort */
267        ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01436)
268                  "No usable charset information; using configuration default");
269        ctx->xml2enc = (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
270                        ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ;
271    }
272    if (ctype && ctx->encoding) {
273        if (ap_regexec(seek_charset, ctype, 2, match, 0)) {
274            r->content_type = apr_pstrcat(r->pool, ctype, ";charset=utf-8",
275                                          NULL);
276        } else {
277            char* str = apr_palloc(r->pool, strlen(r->content_type) + 13
278                                   - (match[0].rm_eo - match[0].rm_so) + 1);
279            memcpy(str, r->content_type, match[1].rm_so);
280            memcpy(str + match[1].rm_so, "utf-8", 5);
281            strcpy(str + match[1].rm_so + 5, r->content_type+match[1].rm_eo);
282            r->content_type = str;
283        }
284    }
285}
286
287static apr_status_t xml2enc_filter_init(ap_filter_t* f)
288{
289    xml2ctx* ctx;
290    if (!f->ctx) {
291        xml2cfg* cfg = ap_get_module_config(f->r->per_dir_config,
292                                            &xml2enc_module);
293        f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(xml2ctx));
294        ctx->xml2enc = XML_CHAR_ENCODING_NONE;
295        if (cfg->skipto != NULL) {
296            ctx->flags |= ENC_SKIPTO;
297        }
298    }
299    return APR_SUCCESS;
300}
301static apr_status_t xml2enc_ffunc(ap_filter_t* f, apr_bucket_brigade* bb)
302{
303    xml2ctx* ctx = f->ctx;
304    apr_status_t rv;
305    apr_bucket* b;
306    apr_bucket* bstart;
307    apr_size_t insz = 0;
308    char *ctype;
309    char *p;
310
311    if (!ctx || !f->r->content_type) {
312        /* log error about configuring this */
313        ap_remove_output_filter(f);
314        return ap_pass_brigade(f->next, bb) ;
315    }
316
317    ctype = apr_pstrdup(f->r->pool, f->r->content_type);
318    for (p = ctype; *p; ++p)
319        if (isupper(*p))
320            *p = tolower(*p);
321
322    /* only act if starts-with "text/" or contains "xml" */
323    if (strncmp(ctype, "text/", 5) && !strstr(ctype, "xml"))  {
324        ap_remove_output_filter(f);
325        return ap_pass_brigade(f->next, bb) ;
326    }
327
328    if (ctx->bbsave == NULL) {
329        ctx->bbsave = apr_brigade_create(f->r->pool,
330                                         f->r->connection->bucket_alloc);
331    }
332    /* append to any data left over from last time */
333    APR_BRIGADE_CONCAT(ctx->bbsave, bb);
334
335    if (!(ctx->flags & ENC_INITIALISED)) {
336        /* some kind of initialisation required */
337        /* Turn all this off when post-processing */
338
339        /* if we don't have enough data to sniff but more's to come, wait */
340        apr_brigade_length(ctx->bbsave, 0, &ctx->bblen);
341        if ((ctx->bblen < BUF_MIN) && (ctx->bblen != -1)) {
342            APR_BRIGADE_DO(b, ctx->bbsave) {
343                if (APR_BUCKET_IS_EOS(b)) {
344                    ctx->flags |= ENC_SEEN_EOS;
345                    break;
346                }
347            }
348            if (!(ctx->flags & ENC_SEEN_EOS)) {
349                /* not enough data to sniff.  Wait for more */
350                APR_BRIGADE_DO(b, ctx->bbsave) {
351                    rv = apr_bucket_setaside(b, f->r->pool);
352                    ap_assert(rv == APR_SUCCESS);
353                }
354                return APR_SUCCESS;
355            }
356        }
357        if (ctx->bblen == -1) {
358            ctx->bblen = BUFLEN-1;
359        }
360
361        /* flatten it into a NULL-terminated string */
362        ctx->buf = apr_palloc(f->r->pool, (apr_size_t)(ctx->bblen+1));
363        ctx->bytes = (apr_size_t)ctx->bblen;
364        rv = apr_brigade_flatten(ctx->bbsave, ctx->buf, &ctx->bytes);
365        ap_assert(rv == APR_SUCCESS);
366        ctx->buf[ctx->bytes] = 0;
367        sniff_encoding(f->r, ctx);
368
369        /* FIXME: hook here for rewriting start-of-data? */
370        /* nah, we only have one action here - call it inline */
371        fix_skipto(f->r, ctx);
372
373        /* we might change the Content-Length, so let's force its re-calculation */
374        apr_table_unset(f->r->headers_out, "Content-Length");
375
376        /* consume the data we just sniffed */
377        /* we need to omit any <meta> we just invalidated */
378        ctx->flags |= ENC_INITIALISED;
379        ap_set_module_config(f->r->request_config, &xml2enc_module, ctx);
380    }
381    if (ctx->bbnext == NULL) {
382        ctx->bbnext = apr_brigade_create(f->r->pool,
383                                         f->r->connection->bucket_alloc);
384    }
385
386    if (!ctx->convset) {
387        rv = ap_pass_brigade(f->next, ctx->bbsave);
388        apr_brigade_cleanup(ctx->bbsave);
389        ap_remove_output_filter(f);
390        return rv;
391    }
392    /* move the data back to bb */
393    APR_BRIGADE_CONCAT(bb, ctx->bbsave);
394
395    while (b = APR_BRIGADE_FIRST(bb), b != APR_BRIGADE_SENTINEL(bb)) {
396        ctx->bytes = 0;
397        if (APR_BUCKET_IS_METADATA(b)) {
398            APR_BUCKET_REMOVE(b);
399            if (APR_BUCKET_IS_EOS(b)) {
400                /* send remaining data */
401                APR_BRIGADE_INSERT_TAIL(ctx->bbnext, b);
402                return ap_fflush(f->next, ctx->bbnext);
403            } else if (APR_BUCKET_IS_FLUSH(b)) {
404                ap_fflush(f->next, ctx->bbnext);
405            }
406            apr_bucket_destroy(b);
407        }
408        else {        /* data bucket */
409            char* buf;
410            apr_size_t bytes = 0;
411            char fixbuf[BUFLEN];
412            apr_bucket* bdestroy = NULL;
413            if (insz > 0) { /* we have dangling data.  Flatten it. */
414                buf = fixbuf;
415                bytes = BUFLEN;
416                rv = apr_brigade_flatten(bb, buf, &bytes);
417                ap_assert(rv == APR_SUCCESS);
418                if (bytes == insz) {
419                    /* this is only what we've already tried to convert.
420                     * The brigade is exhausted.
421                     * Save remaining data for next time round
422                     */
423
424                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01437)
425                                  "xml2enc: Setting aside %" APR_SIZE_T_FMT
426                                  " unconverted bytes", bytes);
427                    rv = ap_fflush(f->next, ctx->bbnext);
428                    APR_BRIGADE_CONCAT(ctx->bbsave, bb);
429                    APR_BRIGADE_DO(b, ctx->bbsave) {
430                        ap_assert(apr_bucket_setaside(b, f->r->pool)
431                                  == APR_SUCCESS);
432                    }
433                    return rv;
434                }
435                /* remove the data we've just read */
436                rv = apr_brigade_partition(bb, bytes, &bstart);
437                while (b = APR_BRIGADE_FIRST(bb), b != bstart) {
438                    APR_BUCKET_REMOVE(b);
439                    apr_bucket_destroy(b);
440                }
441                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01438)
442                              "xml2enc: consuming %" APR_SIZE_T_FMT
443                              " bytes flattened", bytes);
444            }
445            else {
446                rv = apr_bucket_read(b, (const char**)&buf, &bytes,
447                                     APR_BLOCK_READ);
448                APR_BUCKET_REMOVE(b);
449                bdestroy = b;  /* can't destroy until finished with the data */
450                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01439)
451                              "xml2enc: consuming %" APR_SIZE_T_FMT
452                              " bytes from bucket", bytes);
453            }
454            /* OK, we've got some input we can use in [buf,bytes] */
455            if (rv == APR_SUCCESS) {
456                apr_size_t consumed;
457                xml2enc_run_preprocess(f, &buf, &bytes);
458                consumed = insz = bytes;
459                while (insz > 0) {
460                    apr_status_t rv2;
461                    if (ctx->bytes == ctx->bblen) {
462                        /* nothing was converted last time!
463                         * break out of this loop!
464                         */
465                        b = apr_bucket_transient_create(buf+(bytes - insz), insz,
466                                                        bb->bucket_alloc);
467                        APR_BRIGADE_INSERT_HEAD(bb, b);
468                        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01440)
469                                      "xml2enc: reinserting %" APR_SIZE_T_FMT
470                                      " unconsumed bytes from bucket", insz);
471                        break;
472                    }
473                    ctx->bytes = (apr_size_t)ctx->bblen;
474                    rv = apr_xlate_conv_buffer(ctx->convset, buf+(bytes - insz),
475                                               &insz, ctx->buf, &ctx->bytes);
476                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01441)
477                                  "xml2enc: converted %" APR_SIZE_T_FMT
478                                  "/%" APR_OFF_T_FMT " bytes", consumed - insz,
479                                  ctx->bblen - ctx->bytes);
480                    consumed = insz;
481                    rv2 = ap_fwrite(f->next, ctx->bbnext, ctx->buf,
482                                    (apr_size_t)ctx->bblen - ctx->bytes);
483                    if (rv2 != APR_SUCCESS) {
484                        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv2, f->r, APLOGNO(01442)
485                                      "ap_fwrite failed");
486                        return rv2;
487                    }
488                    switch (rv) {
489                    case APR_SUCCESS:
490                        continue;
491                    case APR_EINCOMPLETE:
492                        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01443)
493                                      "INCOMPLETE");
494                        continue;     /* If outbuf too small, go round again.
495                                       * If it was inbuf, we'll break out when
496                                       * we test ctx->bytes == ctx->bblen
497                                       */
498                    case APR_EINVAL: /* try skipping one bad byte */
499                        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01444)
500                                   "Skipping invalid byte(s) in input stream!");
501                        --insz;
502                        continue;
503                    default:
504                        /* Erk!  What's this?
505                         * Bail out, flush, and hope to eat the buf raw
506                         */
507                        ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01445)
508                                      "Failed to convert input; trying it raw") ;
509                        ctx->convset = NULL;
510                        rv = ap_fflush(f->next, ctx->bbnext);
511                        if (rv != APR_SUCCESS)
512                            ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01446)
513                                          "ap_fflush failed");
514                        else
515                            rv = ap_pass_brigade(f->next, ctx->bbnext);
516                    }
517                }
518            } else {
519                ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01447)
520                              "xml2enc: error reading data") ;
521            }
522            if (bdestroy)
523                apr_bucket_destroy(bdestroy);
524            if (rv != APR_SUCCESS)
525                return rv;
526        }
527    }
528    return APR_SUCCESS;
529}
530static apr_status_t xml2enc_charset(request_rec* r, xmlCharEncoding* encp,
531                                    const char** encoding)
532{
533    xml2ctx* ctx = ap_get_module_config(r->request_config, &xml2enc_module);
534    if (!ctx || !(ctx->flags & ENC_INITIALISED)) {
535        return APR_EAGAIN;
536    }
537    *encp = ctx->xml2enc;
538    *encoding = ctx->encoding;
539    return HAVE_ENCODING(ctx->xml2enc) ? APR_SUCCESS : APR_EGENERAL;
540}
541
542#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
543static void xml2enc_hooks(apr_pool_t* pool)
544{
545    ap_register_output_filter_protocol("xml2enc", xml2enc_ffunc,
546                                       xml2enc_filter_init,
547                                       AP_FTYPE_RESOURCE, PROTO_FLAGS);
548    APR_REGISTER_OPTIONAL_FN(xml2enc_filter);
549    APR_REGISTER_OPTIONAL_FN(xml2enc_charset);
550    seek_meta_ctype = ap_pregcomp(pool,
551                       "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
552                                  AP_REG_EXTENDED|AP_REG_ICASE) ;
553    seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)",
554                               AP_REG_EXTENDED|AP_REG_ICASE) ;
555}
556static const char* set_alias(cmd_parms* cmd, void* CFG,
557                             const char* charset, const char* alias)
558{
559    const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY);
560    if (errmsg != NULL)
561        return errmsg ;
562    else if (xmlAddEncodingAlias(charset, alias) == 0)
563        return NULL;
564    else
565        return "Error setting charset alias";
566}
567
568static const char* set_default(cmd_parms* cmd, void* CFG, const char* charset)
569{
570    xml2cfg* cfg = CFG;
571    cfg->default_charset = charset;
572    cfg->default_encoding = xmlParseCharEncoding(charset);
573    switch(cfg->default_encoding) {
574    case XML_CHAR_ENCODING_NONE:
575        return "Default charset not found";
576    case XML_CHAR_ENCODING_ERROR:
577        return "Invalid or unsupported default charset";
578    default:
579        return NULL;
580    }
581}
582static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg)
583{
584    tattr* attr;
585    xml2cfg* cfg = CFG;
586    if (cfg->skipto == NULL)
587        cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
588    attr = apr_array_push(cfg->skipto) ;
589    attr->val = arg;
590    return NULL;
591}
592
593static const command_rec xml2enc_cmds[] = {
594    AP_INIT_TAKE1("xml2EncDefault", set_default, NULL, OR_ALL,
595                  "Usage: xml2EncDefault charset"),
596    AP_INIT_ITERATE2("xml2EncAlias", set_alias, NULL, RSRC_CONF,
597                     "EncodingAlias charset alias [more aliases]"),
598    AP_INIT_ITERATE("xml2StartParse", set_skipto, NULL, OR_ALL,
599                    "Ignore anything in front of the first of these elements"),
600    { NULL }
601};
602static void* xml2enc_config(apr_pool_t* pool, char* x)
603{
604    xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg));
605    ret->default_encoding = XML_CHAR_ENCODING_NONE ;
606    return ret;
607}
608
609static void* xml2enc_merge(apr_pool_t* pool, void* BASE, void* ADD)
610{
611    xml2cfg* base = BASE;
612    xml2cfg* add = ADD;
613    xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg));
614    ret->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE)
615                          ? base->default_encoding : add->default_encoding ;
616    ret->default_charset = add->default_charset
617                         ? add->default_charset : base->default_charset;
618    ret->skipto = add->skipto ? add->skipto : base->skipto;
619    return ret;
620}
621
622AP_DECLARE_MODULE(xml2enc) = {
623    STANDARD20_MODULE_STUFF,
624    xml2enc_config,
625    xml2enc_merge,
626    NULL,
627    NULL,
628    xml2enc_cmds,
629    xml2enc_hooks
630};
631
632APR_IMPLEMENT_OPTIONAL_HOOK_RUN_ALL(xml2enc, XML2ENC, int, preprocess,
633                      (ap_filter_t *f, char** bufp, apr_size_t* bytesp),
634                      (f, bufp, bytesp), OK, DECLINED)
635