1/*      Copyright (c) 2003-11, WebThing Ltd
2 *      Copyright (c) 2011-, The Apache Software Foundation
3 *
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements.  See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License.  You may obtain a copy of the License at
10 *
11 *     http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
/*      GO_FASTER
        You can #define GO_FASTER to disable trace logging.

        VERBOSE(x) executes the single statement x, and VERBOSEB(x) executes
        the statement list x inside braces, only when a variable named
        `verbose` in the enclosing scope is non-zero.  With GO_FASTER defined
        both macros expand to nothing.

        Both expand to a bare `if`, so do not follow them with `else`.
*/

#ifdef GO_FASTER
#define VERBOSE(x)
#define VERBOSEB(x)
#else
#define VERBOSE(x) if (verbose) x
#define VERBOSEB(x) if (verbose) {x}
#endif
31
32/* libxml2 */
33#include <libxml/HTMLparser.h>
34
35#include "http_protocol.h"
36#include "http_config.h"
37#include "http_log.h"
38#include "apr_strings.h"
39#include "apr_hash.h"
40#include "apr_strmatch.h"
41#include "apr_lib.h"
42
43#include "apr_optional.h"
44#include "mod_xml2enc.h"
45#include "http_request.h"
46#include "ap_expr.h"
47
/* globals set once at startup */
/* NOTE(review): old_expr is not referenced in this part of the file. */
static ap_rxplus_t *old_expr;
/* Compiled regex that metafix() runs repeatedly over the document text to
 * locate META elements; capture group 1 ends just before the header name.
 */
static ap_regex_t *seek_meta;
/* Pattern metafix() searches for inside a matched META element to find its
 * content attribute (presumably the 7-character string "content" - confirm
 * where it is compiled).
 */
static const apr_strmatch_pattern* seek_content;
/* Function pointers with a charset-detection and a filter-insertion
 * signature; NULL until assigned.  Presumably resolved from mod_xml2enc's
 * optional functions outside this part of the file - TODO confirm.
 */
static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL;
static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL;

/* This module's record; check_filter_init() passes it to
 * ap_get_module_config() to fetch the per-directory proxy_html_conf.
 */
module AP_MODULE_DECLARE_DATA proxy_html_module;
56
/* Bit flags stored in urlmap.flags. */
/* Rule is applied to URL-bearing attributes (ATTR_URI in pstartElement). */
#define M_HTML                  0x01
/* Rule is applied to scripting-event attributes (ATTR_EVENT). */
#define M_EVENTS                0x02
/* Rule is applied to buffered text rewritten by dump_content(). */
#define M_CDATA                 0x04
/* urlmap.from holds a compiled regex (from.r) rather than a literal (from.c). */
#define M_REGEX                 0x08
/* Literal pattern must match at the start of the buffer. */
#define M_ATSTART               0x10
/* Literal pattern must match at the end of the buffer. */
#define M_ATEND                 0x20
/* Event attributes: stop trying further rules once this one has matched. */
#define M_LAST                  0x40
/* URL attributes: keep trying further rules after this one has matched. */
#define M_NOTLAST               0x80
/* fixup_rules() expands ${var} references in urlmap.to / urlmap.from. */
#define M_INTERPOLATE_TO        0x100
#define M_INTERPOLATE_FROM      0x200
67
/* One attribute name; element type of the arrays kept in
 * proxy_html_conf.links (per element) and proxy_html_conf.events.
 */
typedef struct {
    const char *val;    /* attribute name, compared with strcmp() */
} tattr;
/* Extent of a META element located by metafix(); both values are taken
 * from the regex match offsets (rm_so / rm_eo).
 */
typedef struct {
    unsigned int start;
    unsigned int end;
} meta;
/* One rewrite rule; rules form a singly linked list. */
typedef struct urlmap {
    struct urlmap *next;        /* next rule, or NULL at the end of the list */
    unsigned int flags;         /* M_* bits */
    unsigned int regflags;      /* flags handed to ap_pregcomp() when the
                                 * pattern is recompiled in fixup_rules() */
    union {
        const char *c;          /* literal pattern (M_REGEX clear) */
        ap_regex_t *r;          /* compiled pattern (M_REGEX set) */
    } from;
    const char *to;             /* replacement text / ap_pregsub() template */
    ap_expr_info_t *cond;       /* optional condition evaluated per request
                                 * by fixup_rules(); NULL means always apply */
} urlmap;
/* Per-directory configuration, fetched in check_filter_init(). */
typedef struct {
    urlmap *map;                /* head of the configured rule list */
    const char *doctype;        /* compared by pointer against the fpi_*
                                 * strings to select element enforcement */
    const char *etag;           /* text that closes an element libxml2
                                 * describes as empty (e.g. ">" or " />") */
    unsigned int flags;         /* NORM_* bits passed to normalise() */
    size_t bufsz;               /* step by which preserve() grows the buffer */
    apr_hash_t *links;          /* element name -> array of tattr naming the
                                 * attributes that hold URLs */
    apr_array_header_t *events; /* array of tattr naming event attributes */
    const char *charset_out;    /* not used in this part of the file */
    int extfix;                 /* non-zero: buffer text/comments for
                                 * rewriting and rewrite event attributes */
    int metafix;                /* not referenced in this part of the file;
                                 * presumably enables metafix() - confirm */
    int strip_comments;         /* non-zero: pcomment() drops comments */
    int interp;                 /* non-zero: build a per-request rule list
                                 * with fixup_rules() */
    int enabled;                /* not referenced in this part of the file */
} proxy_html_conf;
/* Per-request filter state; also the user-data pointer handed to the SAX
 * callbacks (they cast their void* argument to saxctxt*).
 */
typedef struct {
    ap_filter_t *f;             /* the filter this context belongs to */
    proxy_html_conf *cfg;       /* configuration in force for the request */
    htmlParserCtxtPtr parser;   /* libxml2 push parser (see consume_buffer) */
    apr_bucket_brigade *bb;     /* brigade all output is written into */
    char *buf;                  /* malloc'ed scratch buffer, grown by
                                 * preserve(); freed by a pool cleanup */
    size_t offset;              /* number of bytes of buf currently in use */
    size_t avail;               /* allocated size of buf */
    const char *encoding;       /* not used in this part of the file */
    urlmap *map;                /* rule list used for this request: cfg->map
                                 * or the copy built by fixup_rules() */
} saxctxt;
112
113
/* Bits for proxy_html_conf.flags, interpreted by normalise(). */
#define NORM_LC 0x1             /* lower-case the value */
#define NORM_MSSLASH 0x2        /* turn backslashes into forward slashes */
#define NORM_RESET 0x4          /* not used in this part of the file */
/* SAX callback table; NOTE(review): populated outside this part of the file. */
static htmlSAXHandler sax;

/* How pstartElement() treats an attribute: leave it alone, rewrite it as a
 * URL, or rewrite it as script in an event handler.
 */
typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t;

/* Doctype declarations.  The configured doctype is compared against these
 * by pointer identity, not by content.
 */
static const char *const fpi_html =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n";
static const char *const fpi_html_legacy =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
static const char *const fpi_xhtml =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
static const char *const fpi_xhtml_legacy =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
/* Closing text for empty elements in HTML and XHTML syntax respectively. */
static const char *const html_etag = ">";
static const char *const xhtml_etag = " />";
/*#define DEFAULT_DOCTYPE fpi_html */
static const char *const DEFAULT_DOCTYPE = "";
#define DEFAULT_ETAG html_etag
134
135static void normalise(unsigned int flags, char *str)
136{
137    char *p;
138    if (flags & NORM_LC)
139        for (p = str; *p; ++p)
140            if (isupper(*p))
141                *p = tolower(*p);
142
143    if (flags & NORM_MSSLASH)
144        for (p = ap_strchr(str, '\\'); p; p = ap_strchr(p+1, '\\'))
145            *p = '/';
146
147}
/* Feed `bytes` bytes at `inbuf` to the libxml2 push parser in ctx->parser;
 * `flag` is passed through as htmlParseChunk()'s final argument.
 */
#define consume_buffer(ctx,inbuf,bytes,flag) \
        htmlParseChunk(ctx->parser, inbuf, bytes, flag)

/* Write `bytes` bytes at `inbuf` to the next filter through ctx->bb.
 * NOTE(review): the `flush` argument is accepted but never used, and the
 * expansion already ends in ';', so every use yields an extra empty
 * statement.
 */
#define AP_fwrite(ctx,inbuf,bytes,flush) \
        ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);

/* This is always utf-8 on entry.  We can convert charset within FLUSH */
/* FLUSH writes chars[begin..i) and moves `begin` past chars[i].  It expands
 * to more than one statement and relies on locals named ctx, chars, begin
 * and i, so it must only be used inside a braced block or a case label.
 */
#define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0); begin = i+1
156static void pcharacters(void *ctxt, const xmlChar *uchars, int length)
157{
158    const char *chars = (const char*) uchars;
159    saxctxt *ctx = (saxctxt*) ctxt;
160    int i;
161    int begin;
162    for (begin=i=0; i<length; i++) {
163        switch (chars[i]) {
164        case '&' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&amp;"); break;
165        case '<' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&lt;"); break;
166        case '>' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&gt;"); break;
167        case '"' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&quot;"); break;
168        default : break;
169        }
170    }
171    FLUSH;
172}
173
174static void preserve(saxctxt *ctx, const size_t len)
175{
176    char *newbuf;
177    if (len <= (ctx->avail - ctx->offset))
178        return;
179    else while (len > (ctx->avail - ctx->offset))
180        ctx->avail += ctx->cfg->bufsz;
181
182    newbuf = realloc(ctx->buf, ctx->avail);
183    if (newbuf != ctx->buf) {
184        if (ctx->buf)
185            apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf,
186                                  (int(*)(void*))free);
187        apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
188                                  (int(*)(void*))free, apr_pool_cleanup_null);
189        ctx->buf = newbuf;
190    }
191}
192
193static void pappend(saxctxt *ctx, const char *buf, const size_t len)
194{
195    preserve(ctx, len);
196    memcpy(ctx->buf+ctx->offset, buf, len);
197    ctx->offset += len;
198}
199
200static void dump_content(saxctxt *ctx)
201{
202    urlmap *m;
203    char *found;
204    size_t s_from, s_to;
205    size_t match;
206    char c = 0;
207    int nmatch;
208    ap_regmatch_t pmatch[10];
209    char *subs;
210    size_t len, offs;
211    urlmap *themap = ctx->map;
212#ifndef GO_FASTER
213    int verbose = APLOGrtrace1(ctx->f->r);
214#endif
215
216    pappend(ctx, &c, 1);        /* append null byte */
217        /* parse the text for URLs */
218    for (m = themap; m; m = m->next) {
219        if (!(m->flags & M_CDATA))
220            continue;
221        if (m->flags & M_REGEX) {
222            nmatch = 10;
223            offs = 0;
224            while (!ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0)) {
225                match = pmatch[0].rm_so;
226                s_from = pmatch[0].rm_eo - match;
227                subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
228                                  nmatch, pmatch);
229                s_to = strlen(subs);
230                len = strlen(ctx->buf);
231                offs += match;
232                VERBOSEB(
233                    const char *f = apr_pstrndup(ctx->f->r->pool,
234                    ctx->buf + offs, s_from);
235                    ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
236                                  "C/RX: match at %s, substituting %s", f, subs);
237                )
238                if (s_to > s_from) {
239                    preserve(ctx, s_to - s_from);
240                    memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
241                            len + 1 - s_from - offs);
242                    memcpy(ctx->buf+offs, subs, s_to);
243                }
244                else {
245                    memcpy(ctx->buf + offs, subs, s_to);
246                    memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
247                            len + 1 - s_from - offs);
248                }
249                offs += s_to;
250            }
251        }
252        else {
253            s_from = strlen(m->from.c);
254            s_to = strlen(m->to);
255            for (found = strstr(ctx->buf, m->from.c); found;
256                 found = strstr(ctx->buf+match+s_to, m->from.c)) {
257                match = found - ctx->buf;
258                if ((m->flags & M_ATSTART) && (match != 0))
259                    break;
260                len = strlen(ctx->buf);
261                if ((m->flags & M_ATEND) && (match < (len - s_from)))
262                    continue;
263                VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
264                                      "C: matched %s, substituting %s",
265                                      m->from.c, m->to));
266                if (s_to > s_from) {
267                    preserve(ctx, s_to - s_from);
268                    memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
269                            len + 1 - s_from - match);
270                    memcpy(ctx->buf+match, m->to, s_to);
271                }
272                else {
273                    memcpy(ctx->buf+match, m->to, s_to);
274                    memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
275                            len + 1 - s_from - match);
276                }
277            }
278        }
279    }
280    AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
281}
282static void pcdata(void *ctxt, const xmlChar *uchars, int length)
283{
284    const char *chars = (const char*) uchars;
285    saxctxt *ctx = (saxctxt*) ctxt;
286    if (ctx->cfg->extfix) {
287        pappend(ctx, chars, length);
288    }
289    else {
290        /* not sure if this should force-flush
291         * (i.e. can one cdata section come in multiple calls?)
292         */
293        AP_fwrite(ctx, chars, length, 0);
294    }
295}
296static void pcomment(void *ctxt, const xmlChar *uchars)
297{
298    const char *chars = (const char*) uchars;
299    saxctxt *ctx = (saxctxt*) ctxt;
300    if (ctx->cfg->strip_comments)
301        return;
302
303    if (ctx->cfg->extfix) {
304        pappend(ctx, "<!--", 4);
305        pappend(ctx, chars, strlen(chars));
306        pappend(ctx, "-->", 3);
307    }
308    else {
309        ap_fputs(ctx->f->next, ctx->bb, "<!--");
310        AP_fwrite(ctx, chars, strlen(chars), 1);
311        ap_fputs(ctx->f->next, ctx->bb, "-->");
312    }
313}
314static void pendElement(void *ctxt, const xmlChar *uname)
315{
316    saxctxt *ctx = (saxctxt*) ctxt;
317    const char *name = (const char*) uname;
318    const htmlElemDesc* desc = htmlTagLookup(uname);
319
320    if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
321        /* enforce html */
322        if (!desc || desc->depr)
323            return;
324
325    }
326    else if ((ctx->cfg->doctype == fpi_html)
327             || (ctx->cfg->doctype == fpi_xhtml)) {
328        /* enforce html legacy */
329        if (!desc)
330            return;
331    }
332    /* TODO - implement HTML "allowed here" using the stack */
333    /* nah.  Keeping the stack is too much overhead */
334
335    if (ctx->offset > 0) {
336        dump_content(ctx);
337        ctx->offset = 0;        /* having dumped it, we can re-use the memory */
338    }
339    if (!desc || !desc->empty) {
340        ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name);
341    }
342}
343
344static void pstartElement(void *ctxt, const xmlChar *uname,
345                          const xmlChar** uattrs)
346{
347    int required_attrs;
348    int num_match;
349    size_t offs, len;
350    char *subs;
351    rewrite_t is_uri;
352    const char** a;
353    urlmap *m;
354    size_t s_to, s_from, match;
355    char *found;
356    saxctxt *ctx = (saxctxt*) ctxt;
357    size_t nmatch;
358    ap_regmatch_t pmatch[10];
359#ifndef GO_FASTER
360    int verbose = APLOGrtrace1(ctx->f->r);
361#endif
362    apr_array_header_t *linkattrs;
363    int i;
364    const char *name = (const char*) uname;
365    const char** attrs = (const char**) uattrs;
366    const htmlElemDesc* desc = htmlTagLookup(uname);
367    urlmap *themap = ctx->map;
368#ifdef HAVE_STACK
369    const void** descp;
370#endif
371    int enforce = 0;
372    if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
373        /* enforce html */
374        enforce = 2;
375        if (!desc || desc->depr)
376            return;
377
378    }
379    else if ((ctx->cfg->doctype == fpi_html)
380             || (ctx->cfg->doctype == fpi_xhtml)) {
381        enforce = 1;
382        /* enforce html legacy */
383        if (!desc) {
384            return;
385        }
386    }
387    if (!desc && enforce) {
388        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01416)
389                      "Bogus HTML element %s dropped", name);
390        return;
391    }
392    if (desc && desc->depr && (enforce == 2)) {
393        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01417)
394                      "Deprecated HTML element %s dropped", name);
395        return;
396    }
397#ifdef HAVE_STACK
398    descp = apr_array_push(ctx->stack);
399    *descp = desc;
400    /* TODO - implement HTML "allowed here" */
401#endif
402
403    ap_fputc(ctx->f->next, ctx->bb, '<');
404    ap_fputs(ctx->f->next, ctx->bb, name);
405
406    required_attrs = 0;
407    if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
408        for (a = desc->attrs_req; *a; a++)
409            ++required_attrs;
410
411    if (attrs) {
412        linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING);
413        for (a = attrs; *a; a += 2) {
414            if (desc && enforce > 0) {
415                switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
416                case HTML_INVALID:
417                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01418)
418                                  "Bogus HTML attribute %s of %s dropped",
419                                  *a, name);
420                    continue;
421                case HTML_DEPRECATED:
422                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01419)
423                                  "Deprecated HTML attribute %s of %s dropped",
424                                  *a, name);
425                    continue;
426                case HTML_REQUIRED:
427                    required_attrs--;   /* cross off the number still needed */
428                /* fallthrough - required implies valid */
429                default:
430                    break;
431                }
432            }
433            ctx->offset = 0;
434            if (a[1]) {
435                pappend(ctx, a[1], strlen(a[1])+1);
436                is_uri = ATTR_IGNORE;
437                if (linkattrs) {
438                    tattr *attrs = (tattr*) linkattrs->elts;
439                    for (i=0; i < linkattrs->nelts; ++i) {
440                        if (!strcmp(*a, attrs[i].val)) {
441                            is_uri = ATTR_URI;
442                            break;
443                        }
444                    }
445                }
446                if ((is_uri == ATTR_IGNORE) && ctx->cfg->extfix
447                    && (ctx->cfg->events != NULL)) {
448                    for (i=0; i < ctx->cfg->events->nelts; ++i) {
449                        tattr *attrs = (tattr*) ctx->cfg->events->elts;
450                        if (!strcmp(*a, attrs[i].val)) {
451                            is_uri = ATTR_EVENT;
452                            break;
453                        }
454                    }
455                }
456                switch (is_uri) {
457                case ATTR_URI:
458                    num_match = 0;
459                    for (m = themap; m; m = m->next) {
460                        if (!(m->flags & M_HTML))
461                            continue;
462                        if (m->flags & M_REGEX) {
463                            nmatch = 10;
464                            if (!ap_regexec(m->from.r, ctx->buf, nmatch,
465                                            pmatch, 0)) {
466                                ++num_match;
467                                offs = match = pmatch[0].rm_so;
468                                s_from = pmatch[0].rm_eo - match;
469                                subs = ap_pregsub(ctx->f->r->pool, m->to,
470                                                  ctx->buf, nmatch, pmatch);
471                                VERBOSE({
472                                    const char *f;
473                                    f = apr_pstrndup(ctx->f->r->pool,
474                                                     ctx->buf + offs, s_from);
475                                    ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
476                                                  ctx->f->r,
477                                         "H/RX: match at %s, substituting %s",
478                                                  f, subs);
479                                })
480                                s_to = strlen(subs);
481                                len = strlen(ctx->buf);
482                                if (s_to > s_from) {
483                                    preserve(ctx, s_to - s_from);
484                                    memmove(ctx->buf+offs+s_to,
485                                            ctx->buf+offs+s_from,
486                                            len + 1 - s_from - offs);
487                                    memcpy(ctx->buf+offs, subs, s_to);
488                                }
489                                else {
490                                    memcpy(ctx->buf + offs, subs, s_to);
491                                    memmove(ctx->buf+offs+s_to,
492                                            ctx->buf+offs+s_from,
493                                            len + 1 - s_from - offs);
494                                }
495                            }
496                        } else {
497                            s_from = strlen(m->from.c);
498                            if (!strncasecmp(ctx->buf, m->from.c, s_from)) {
499                                ++num_match;
500                                s_to = strlen(m->to);
501                                len = strlen(ctx->buf);
502                                VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
503                                                      0, ctx->f->r,
504                                              "H: matched %s, substituting %s",
505                                                      m->from.c, m->to));
506                                if (s_to > s_from) {
507                                    preserve(ctx, s_to - s_from);
508                                    memmove(ctx->buf+s_to, ctx->buf+s_from,
509                                            len + 1 - s_from);
510                                    memcpy(ctx->buf, m->to, s_to);
511                                }
512                                else {     /* it fits in the existing space */
513                                    memcpy(ctx->buf, m->to, s_to);
514                                    memmove(ctx->buf+s_to, ctx->buf+s_from,
515                                            len + 1 - s_from);
516                                }
517                                break;
518                            }
519                        }
520                        /* URIs only want one match unless overridden in the config */
521                        if ((num_match > 0) && !(m->flags & M_NOTLAST))
522                            break;
523                    }
524                    break;
525                case ATTR_EVENT:
526                    for (m = themap; m; m = m->next) {
527                        num_match = 0;        /* reset here since we're working per-rule */
528                        if (!(m->flags & M_EVENTS))
529                            continue;
530                        if (m->flags & M_REGEX) {
531                            nmatch = 10;
532                            offs = 0;
533                            while (!ap_regexec(m->from.r, ctx->buf+offs,
534                                               nmatch, pmatch, 0)) {
535                                match = pmatch[0].rm_so;
536                                s_from = pmatch[0].rm_eo - match;
537                                subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
538                                                    nmatch, pmatch);
539                                VERBOSE({
540                                    const char *f;
541                                    f = apr_pstrndup(ctx->f->r->pool,
542                                                     ctx->buf + offs, s_from);
543                                    ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
544                                                  ctx->f->r,
545                                           "E/RX: match at %s, substituting %s",
546                                                  f, subs);
547                                })
548                                s_to = strlen(subs);
549                                offs += match;
550                                len = strlen(ctx->buf);
551                                if (s_to > s_from) {
552                                    preserve(ctx, s_to - s_from);
553                                    memmove(ctx->buf+offs+s_to,
554                                            ctx->buf+offs+s_from,
555                                            len + 1 - s_from - offs);
556                                    memcpy(ctx->buf+offs, subs, s_to);
557                                }
558                                else {
559                                    memcpy(ctx->buf + offs, subs, s_to);
560                                    memmove(ctx->buf+offs+s_to,
561                                            ctx->buf+offs+s_from,
562                                            len + 1 - s_from - offs);
563                                }
564                                offs += s_to;
565                                ++num_match;
566                            }
567                        }
568                        else {
569                            found = strstr(ctx->buf, m->from.c);
570                            if ((m->flags & M_ATSTART) && (found != ctx->buf))
571                                continue;
572                            while (found) {
573                                s_from = strlen(m->from.c);
574                                s_to = strlen(m->to);
575                                match = found - ctx->buf;
576                                if ((s_from < strlen(found))
577                                    && (m->flags & M_ATEND)) {
578                                    found = strstr(ctx->buf+match+s_from,
579                                                   m->from.c);
580                                    continue;
581                                }
582                                else {
583                                    found = strstr(ctx->buf+match+s_to,
584                                                   m->from.c);
585                                }
586                                VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
587                                                      0, ctx->f->r,
588                                              "E: matched %s, substituting %s",
589                                                      m->from.c, m->to));
590                                len = strlen(ctx->buf);
591                                if (s_to > s_from) {
592                                    preserve(ctx, s_to - s_from);
593                                    memmove(ctx->buf+match+s_to,
594                                            ctx->buf+match+s_from,
595                                            len + 1 - s_from - match);
596                                    memcpy(ctx->buf+match, m->to, s_to);
597                                }
598                                else {
599                                    memcpy(ctx->buf+match, m->to, s_to);
600                                    memmove(ctx->buf+match+s_to,
601                                            ctx->buf+match+s_from,
602                                            len + 1 - s_from - match);
603                                }
604                                ++num_match;
605                            }
606                        }
607                        if (num_match && (m->flags & M_LAST))
608                            break;
609                    }
610                    break;
611                case ATTR_IGNORE:
612                    break;
613                }
614            }
615            if (!a[1])
616                ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL);
617            else {
618
619                if (ctx->cfg->flags != 0)
620                    normalise(ctx->cfg->flags, ctx->buf);
621
622                /* write the attribute, using pcharacters to html-escape
623                   anything that needs it in the value.
624                */
625                ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL);
626                pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf));
627                ap_fputc(ctx->f->next, ctx->bb, '"');
628            }
629        }
630    }
631    ctx->offset = 0;
632    if (desc && desc->empty)
633        ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag);
634    else
635        ap_fputc(ctx->f->next, ctx->bb, '>');
636
637    if ((enforce > 0) && (required_attrs > 0)) {
638        /* if there are more required attributes than we found then complain */
639        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01420)
640                      "HTML element %s is missing %d required attributes",
641                      name, required_attrs);
642    }
643}
644
645static meta *metafix(request_rec *r, const char *buf)
646{
647    meta *ret = NULL;
648    size_t offs = 0;
649    const char *p;
650    const char *q;
651    char *header;
652    char *content;
653    ap_regmatch_t pmatch[2];
654    char delim;
655
656    while (!ap_regexec(seek_meta, buf+offs, 2, pmatch, 0)) {
657        header = NULL;
658        content = NULL;
659        p = buf+offs+pmatch[1].rm_eo;
660        while (!apr_isalpha(*++p));
661        for (q = p; apr_isalnum(*q) || (*q == '-'); ++q);
662        header = apr_pstrndup(r->pool, p, q-p);
663        if (strncasecmp(header, "Content-", 8)) {
664            /* find content=... string */
665            p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so,
666                              pmatch[0].rm_eo - pmatch[0].rm_so);
667            /* if it doesn't contain "content", ignore, don't crash! */
668            if (p != NULL) {
669                while (*p) {
670                    p += 7;
671                    while (apr_isspace(*p))
672                        ++p;
673                    if (*p != '=')
674                        continue;
675                    while (*p && apr_isspace(*++p));
676                    if ((*p == '\'') || (*p == '"')) {
677                        delim = *p++;
678                        for (q = p; *q != delim; ++q);
679                    } else {
680                        for (q = p; *q && !apr_isspace(*q) && (*q != '>'); ++q);
681                    }
682                    content = apr_pstrndup(r->pool, p, q-p);
683                    break;
684                }
685            }
686        }
687        else if (!strncasecmp(header, "Content-Type", 12)) {
688            ret = apr_palloc(r->pool, sizeof(meta));
689            ret->start = pmatch[0].rm_so;
690            ret->end = pmatch[0].rm_eo;
691        }
692        if (header && content) {
693#ifndef GO_FASTER
694            ap_log_rerror(APLOG_MARK, APLOG_TRACE2, 0, r,
695                          "Adding header [%s: %s] from HTML META",
696                          header, content);
697#endif
698            apr_table_setn(r->headers_out, header, content);
699        }
700        offs += pmatch[0].rm_eo;
701    }
702    return ret;
703}
704
705static const char *interpolate_vars(request_rec *r, const char *str)
706{
707    const char *start;
708    const char *end;
709    const char *delim;
710    const char *before;
711    const char *after;
712    const char *replacement;
713    const char *var;
714    for (;;) {
715        start = str;
716        if (start = ap_strstr_c(start, "${"), start == NULL)
717            break;
718
719        if (end = ap_strchr_c(start+2, '}'), end == NULL)
720            break;
721
722        delim = ap_strchr_c(start, '|');
723        before = apr_pstrndup(r->pool, str, start-str);
724        after = end+1;
725        if (delim) {
726            var = apr_pstrndup(r->pool, start+2, delim-start-2);
727        }
728        else {
729            var = apr_pstrndup(r->pool, start+2, end-start-2);
730        }
731        replacement = apr_table_get(r->subprocess_env, var);
732        if (!replacement) {
733            if (delim)
734                replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
735            else
736                replacement = "";
737        }
738        str = apr_pstrcat(r->pool, before, replacement, after, NULL);
739        ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r,
740                      "Interpolating %s  =>  %s", var, replacement);
741    }
742    return str;
743}
744static void fixup_rules(saxctxt *ctx)
745{
746    urlmap *newp;
747    urlmap *p;
748    urlmap *prev = NULL;
749    request_rec *r = ctx->f->r;
750
751    for (p = ctx->cfg->map; p; p = p->next) {
752        if (p->cond != NULL) {
753            const char *err;
754            int ok = ap_expr_exec(r, p->cond, &err);
755            if (err) {
756                ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01421)
757                              "Error evaluating expr: %s", err);
758            }
759            if (ok == 0) {
760                continue;  /* condition is unsatisfied */
761            }
762        }
763
764        newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
765
766        if (newp->flags & M_INTERPOLATE_FROM) {
767            newp->from.c = interpolate_vars(r, newp->from.c);
768            if (!newp->from.c || !*newp->from.c)
769                continue;        /* don't use empty from-pattern */
770            if (newp->flags & M_REGEX) {
771                newp->from.r = ap_pregcomp(r->pool, newp->from.c,
772                                           newp->regflags);
773            }
774        }
775        if (newp->flags & M_INTERPOLATE_TO) {
776            newp->to = interpolate_vars(r, newp->to);
777        }
778        /* evaluate p->cond; continue if unsatisfied */
779        /* create new urlmap with memcpy and append to map */
780        /* interpolate from if flagged to do so */
781        /* interpolate to if flagged to do so */
782
783        if (prev != NULL)
784            prev->next = newp;
785        else
786            ctx->map = newp;
787        prev = newp;
788    }
789
790    if (prev)
791        prev->next = NULL;
792}
793
794static saxctxt *check_filter_init (ap_filter_t *f)
795{
796    saxctxt *fctx;
797    if (!f->ctx) {
798        proxy_html_conf *cfg;
799        const char *force;
800        const char *errmsg = NULL;
801        cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
802        force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
803
804        if (!force) {
805            if (!f->r->proxyreq) {
806                errmsg = "Non-proxy request; not inserting proxy-html filter";
807            }
808            else if (!f->r->content_type) {
809                errmsg = "No content-type; bailing out of proxy-html filter";
810            }
811            else if (strncasecmp(f->r->content_type, "text/html", 9) &&
812                     strncasecmp(f->r->content_type,
813                                 "application/xhtml+xml", 21)) {
814                errmsg = "Non-HTML content; not inserting proxy-html filter";
815            }
816        }
817        if (!cfg->links) {
818            errmsg = "No links configured: nothing for proxy-html filter to do";
819        }
820
821        if (errmsg) {
822#ifndef GO_FASTER
823            ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r, "%s", errmsg);
824#endif
825            ap_remove_output_filter(f);
826            return NULL;
827        }
828
829        fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt));
830        fctx->f = f;
831        fctx->bb = apr_brigade_create(f->r->pool,
832                                      f->r->connection->bucket_alloc);
833        fctx->cfg = cfg;
834        apr_table_unset(f->r->headers_out, "Content-Length");
835
836        if (cfg->interp)
837            fixup_rules(fctx);
838        else
839            fctx->map = cfg->map;
840        /* defer dealing with charset_out until after sniffing charset_in
841         * so we can support setting one to t'other.
842         */
843    }
844    return f->ctx;
845}
846
847static apr_status_t proxy_html_filter(ap_filter_t *f, apr_bucket_brigade *bb)
848{
849    apr_bucket* b;
850    meta *m = NULL;
851    xmlCharEncoding enc;
852    const char *buf = 0;
853    apr_size_t bytes = 0;
854#ifndef USE_OLD_LIBXML2
855    int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
856                  XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
857#endif
858
859    saxctxt *ctxt = check_filter_init(f);
860    if (!ctxt)
861        return ap_pass_brigade(f->next, bb);
862    for (b = APR_BRIGADE_FIRST(bb);
863         b != APR_BRIGADE_SENTINEL(bb);
864         b = APR_BUCKET_NEXT(b)) {
865        if (APR_BUCKET_IS_METADATA(b)) {
866            if (APR_BUCKET_IS_EOS(b)) {
867                if (ctxt->parser != NULL) {
868                    consume_buffer(ctxt, buf, 0, 1);
869                }
870                APR_BRIGADE_INSERT_TAIL(ctxt->bb,
871                apr_bucket_eos_create(ctxt->bb->bucket_alloc));
872                ap_pass_brigade(ctxt->f->next, ctxt->bb);
873            }
874            else if (APR_BUCKET_IS_FLUSH(b)) {
875                /* pass on flush, except at start where it would cause
876                 * headers to be sent before doc sniffing
877                 */
878                if (ctxt->parser != NULL) {
879                    ap_fflush(ctxt->f->next, ctxt->bb);
880                }
881            }
882        }
883        else if (apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
884                 == APR_SUCCESS) {
885            if (ctxt->parser == NULL) {
886                const char *cenc;
887                if (!xml2enc_charset ||
888                    (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) {
889                    if (!xml2enc_charset)
890                        ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01422)
891                     "No i18n support found.  Install mod_xml2enc if required");
892                    enc = XML_CHAR_ENCODING_NONE;
893                    ap_set_content_type(f->r, "text/html;charset=utf-8");
894                }
895                else {
896                    /* if we wanted a non-default charset_out, insert the
897                     * xml2enc filter now that we've sniffed it
898                     */
899                    if (ctxt->cfg->charset_out && xml2enc_filter) {
900                        if (*ctxt->cfg->charset_out != '*')
901                            cenc = ctxt->cfg->charset_out;
902                        xml2enc_filter(f->r, cenc, ENCIO_OUTPUT);
903                        ap_set_content_type(f->r,
904                                            apr_pstrcat(f->r->pool,
905                                                        "text/html;charset=",
906                                                        cenc, NULL));
907                    }
908                    else /* Normal case, everything worked, utf-8 output */
909                        ap_set_content_type(f->r, "text/html;charset=utf-8");
910                }
911
912                ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype);
913                ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf,
914                                                        4, 0, enc);
915                buf += 4;
916                bytes -= 4;
917                if (ctxt->parser == NULL) {
918                    apr_status_t rv = ap_pass_brigade(f->next, bb);
919                    ap_remove_output_filter(f);
920                    return rv;
921                }
922                apr_pool_cleanup_register(f->r->pool, ctxt->parser,
923                                          (int(*)(void*))htmlFreeParserCtxt,
924                                          apr_pool_cleanup_null);
925#ifndef USE_OLD_LIBXML2
926                if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts)
927                    ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01423)
928                                  "Unsupported parser opts %x", xmlopts);
929#endif
930                if (ctxt->cfg->metafix)
931                    m = metafix(f->r, buf);
932                if (m) {
933                    consume_buffer(ctxt, buf, m->start, 0);
934                    consume_buffer(ctxt, buf+m->end, bytes-m->end, 0);
935                }
936                else {
937                    consume_buffer(ctxt, buf, bytes, 0);
938                }
939            }
940            else {
941                consume_buffer(ctxt, buf, bytes, 0);
942            }
943        }
944        else {
945            ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01424)
946                          "Error in bucket read");
947        }
948    }
949    /*ap_fflush(ctxt->f->next, ctxt->bb);        // uncomment for debug */
950    apr_brigade_cleanup(bb);
951    return APR_SUCCESS;
952}
953
954static void *proxy_html_config(apr_pool_t *pool, char *x)
955{
956    proxy_html_conf *ret = apr_pcalloc(pool, sizeof(proxy_html_conf));
957    ret->doctype = DEFAULT_DOCTYPE;
958    ret->etag = DEFAULT_ETAG;
959    ret->bufsz = 8192;
960    /* ret->interp = 1; */
961    /* don't initialise links and events until they get set/used */
962    return ret;
963}
964
965static void *proxy_html_merge(apr_pool_t *pool, void *BASE, void *ADD)
966{
967    proxy_html_conf *base = (proxy_html_conf *) BASE;
968    proxy_html_conf *add = (proxy_html_conf *) ADD;
969    proxy_html_conf *conf = apr_palloc(pool, sizeof(proxy_html_conf));
970
971    /* don't merge declarations - just use the most specific */
972    conf->links = (add->links == NULL) ? base->links : add->links;
973    conf->events = (add->events == NULL) ? base->events : add->events;
974
975    conf->charset_out = (add->charset_out == NULL)
976                        ? base->charset_out : add->charset_out;
977
978    if (add->map && base->map) {
979        urlmap *a;
980        conf->map = NULL;
981        for (a = base->map; a; a = a->next) {
982            urlmap *save = conf->map;
983            conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
984            conf->map->next = save;
985        }
986        for (a = add->map; a; a = a->next) {
987            urlmap *save = conf->map;
988            conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
989            conf->map->next = save;
990        }
991    }
992    else
993        conf->map = add->map ? add->map : base->map;
994
995    conf->doctype = (add->doctype == DEFAULT_DOCTYPE)
996                    ? base->doctype : add->doctype;
997    conf->etag = (add->etag == DEFAULT_ETAG) ? base->etag : add->etag;
998    conf->bufsz = add->bufsz;
999    if (add->flags & NORM_RESET) {
1000        conf->flags = add->flags ^ NORM_RESET;
1001        conf->metafix = add->metafix;
1002        conf->extfix = add->extfix;
1003        conf->interp = add->interp;
1004        conf->strip_comments = add->strip_comments;
1005        conf->enabled = add->enabled;
1006    }
1007    else {
1008        conf->flags = base->flags | add->flags;
1009        conf->metafix = base->metafix | add->metafix;
1010        conf->extfix = base->extfix | add->extfix;
1011        conf->interp = base->interp | add->interp;
1012        conf->strip_comments = base->strip_comments | add->strip_comments;
1013        conf->enabled = add->enabled | base->enabled;
1014    }
1015    return conf;
1016}
1017#define REGFLAG(n,s,c) ((s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0)
1018#define XREGFLAG(n,s,c) ((!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0)
1019static const char *comp_urlmap(cmd_parms *cmd, urlmap *newmap,
1020                               const char *from, const char *to,
1021                               const char *flags, const char *cond)
1022{
1023    const char *err = NULL;
1024    newmap->flags
1025        = XREGFLAG(M_HTML,flags,'h')
1026        | XREGFLAG(M_EVENTS,flags,'e')
1027        | XREGFLAG(M_CDATA,flags,'c')
1028        | REGFLAG(M_ATSTART,flags,'^')
1029        | REGFLAG(M_ATEND,flags,'$')
1030        | REGFLAG(M_REGEX,flags,'R')
1031        | REGFLAG(M_LAST,flags,'L')
1032        | REGFLAG(M_NOTLAST,flags,'l')
1033        | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1034        | REGFLAG(M_INTERPOLATE_FROM,flags,'v');
1035
1036    if ((newmap->flags & M_INTERPOLATE_FROM) || !(newmap->flags & M_REGEX)) {
1037        newmap->from.c = from;
1038        newmap->to = to;
1039    }
1040    else {
1041        newmap->regflags
1042            = REGFLAG(AP_REG_EXTENDED,flags,'x')
1043            | REGFLAG(AP_REG_ICASE,flags,'i')
1044            | REGFLAG(AP_REG_NOSUB,flags,'n')
1045            | REGFLAG(AP_REG_NEWLINE,flags,'s');
1046        newmap->from.r = ap_pregcomp(cmd->pool, from, newmap->regflags);
1047        newmap->to = to;
1048    }
1049    if (cond != NULL) {
1050        /* back-compatibility: support old-style ENV expressions
1051         * by converting to ap_expr syntax.
1052         *
1053         * 1. var --> env(var)
1054         * 2. var=val --> env(var)=val
1055         * 3. !var --> !env(var)
1056         * 4. !var=val --> env(var)!=val
1057         */
1058        char *newcond = NULL;
1059        if (ap_rxplus_exec(cmd->temp_pool, old_expr, cond, &newcond)) {
1060           /* we got a substitution.  Check for the case (3) above
1061            * that the regexp gets wrong: a negation without a comparison.
1062            */
1063            if ((cond[0] == '!') && !ap_strchr_c(cond, '=')) {
1064                memmove(newcond+1, newcond, strlen(newcond)-1);
1065                newcond[0] = '!';
1066            }
1067            cond = newcond;
1068        }
1069        newmap->cond = ap_expr_parse_cmd(cmd, cond, 0, &err, NULL);
1070    }
1071    else {
1072        newmap->cond = NULL;
1073    }
1074    return err;
1075}
1076
1077static const char *set_urlmap(cmd_parms *cmd, void *CFG, const char *args)
1078{
1079    proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1080    urlmap *map;
1081    apr_pool_t *pool = cmd->pool;
1082    urlmap *newmap;
1083    const char *usage =
1084              "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1085    const char *from;
1086    const char *to;
1087    const char *flags;
1088    const char *cond = NULL;
1089
1090    if (from = ap_getword_conf(cmd->pool, &args), !from)
1091        return usage;
1092    if (to = ap_getword_conf(cmd->pool, &args), !to)
1093        return usage;
1094    flags = ap_getword_conf(cmd->pool, &args);
1095    if (flags && *flags)
1096        cond = ap_getword_conf(cmd->pool, &args);
1097    if (cond && !*cond)
1098        cond = NULL;
1099
1100    /* the args look OK, so let's use them */
1101    newmap = apr_palloc(pool, sizeof(urlmap));
1102    newmap->next = NULL;
1103    if (cfg->map) {
1104        for (map = cfg->map; map->next; map = map->next);
1105        map->next = newmap;
1106    }
1107    else
1108        cfg->map = newmap;
1109
1110    return comp_urlmap(cmd, newmap, from, to, flags, cond);
1111}
1112
1113static const char *set_doctype(cmd_parms *cmd, void *CFG,
1114                               const char *t, const char *l)
1115{
1116    proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1117    if (!strcasecmp(t, "xhtml")) {
1118        cfg->etag = xhtml_etag;
1119        if (l && !strcasecmp(l, "legacy"))
1120            cfg->doctype = fpi_xhtml_legacy;
1121        else
1122            cfg->doctype = fpi_xhtml;
1123    }
1124    else if (!strcasecmp(t, "html")) {
1125        cfg->etag = html_etag;
1126        if (l && !strcasecmp(l, "legacy"))
1127            cfg->doctype = fpi_html_legacy;
1128        else
1129            cfg->doctype = fpi_html;
1130    }
1131    else {
1132        cfg->doctype = apr_pstrdup(cmd->pool, t);
1133        if (l && ((l[0] == 'x') || (l[0] == 'X')))
1134            cfg->etag = xhtml_etag;
1135        else
1136            cfg->etag = html_etag;
1137    }
1138    return NULL;
1139}
1140
1141static const char *set_flags(cmd_parms *cmd, void *CFG, const char *arg)
1142{
1143    proxy_html_conf *cfg = CFG;
1144    if (arg && *arg) {
1145        if (!strcasecmp(arg, "lowercase"))
1146            cfg->flags |= NORM_LC;
1147        else if (!strcasecmp(arg, "dospath"))
1148            cfg->flags |= NORM_MSSLASH;
1149        else if (!strcasecmp(arg, "reset"))
1150            cfg->flags |= NORM_RESET;
1151    }
1152    return NULL;
1153}
1154
1155static const char *set_events(cmd_parms *cmd, void *CFG, const char *arg)
1156{
1157    tattr *attr;
1158    proxy_html_conf *cfg = CFG;
1159    if (cfg->events == NULL)
1160        cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1161    attr = apr_array_push(cfg->events);
1162    attr->val = arg;
1163    return NULL;
1164}
1165
1166static const char *set_links(cmd_parms *cmd, void *CFG,
1167                             const char *elt, const char *att)
1168{
1169    apr_array_header_t *attrs;
1170    tattr *attr;
1171    proxy_html_conf *cfg = CFG;
1172
1173    if (cfg->links == NULL)
1174        cfg->links = apr_hash_make(cmd->pool);
1175
1176    attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING);
1177    if (!attrs) {
1178        attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*));
1179        apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs);
1180    }
1181    attr = apr_array_push(attrs);
1182    attr->val = att;
1183    return NULL;
1184}
1185static const command_rec proxy_html_cmds[] = {
1186    AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1187                    RSRC_CONF|ACCESS_CONF,
1188                    "Strings to be treated as scripting events"),
1189    AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1190                     RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
1191    AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1192                     RSRC_CONF|ACCESS_CONF, "Map URL From To"),
1193    AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1194                   RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]"),
1195    AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1196                    RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath"),
1197    AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1198                 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1199                 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements"),
1200    AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1201                 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1202                 RSRC_CONF|ACCESS_CONF,
1203                 "Support interpolation and conditions in URLMaps"),
1204    AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1205                 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1206                 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS"),
1207    AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1208                 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1209                 RSRC_CONF|ACCESS_CONF, "Strip out comments"),
1210    AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1211                  (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1212                  RSRC_CONF|ACCESS_CONF, "Buffer size"),
1213    AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1214                  (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1215                  RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset"),
1216    AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot,
1217                 (void*)APR_OFFSETOF(proxy_html_conf, enabled),
1218                 RSRC_CONF|ACCESS_CONF,
1219                 "Enable proxy-html and xml2enc filters"),
1220    { NULL }
1221};
1222static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2)
1223{
1224    seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1225                            AP_REG_EXTENDED|AP_REG_ICASE);
1226    seek_content = apr_strmatch_precompile(p, "content", 0);
1227    memset(&sax, 0, sizeof(htmlSAXHandler));
1228    sax.startElement = pstartElement;
1229    sax.endElement = pendElement;
1230    sax.characters = pcharacters;
1231    sax.comment = pcomment;
1232    sax.cdataBlock = pcdata;
1233    xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
1234    xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
1235    if (!xml2enc_charset) {
1236        ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2, APLOGNO(01425)
1237                      "I18n support in mod_proxy_html requires mod_xml2enc. "
1238                      "Without it, non-ASCII characters in proxied pages are "
1239                      "likely to display incorrectly.");
1240    }
1241
1242    /* old_expr only needs to last the life of the config phase */
1243    old_expr = ap_rxplus_compile(p1, "s/^(!)?(\\w+)((=)(.+))?$/reqenv('$2')$1$4'$5'/");
1244    return OK;
1245}
1246static void proxy_html_insert(request_rec *r)
1247{
1248    proxy_html_conf *cfg;
1249    cfg = ap_get_module_config(r->per_dir_config, &proxy_html_module);
1250    if (cfg->enabled) {
1251        if (xml2enc_filter)
1252            xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS);
1253        ap_add_output_filter("proxy-html", NULL, r, r->connection);
1254    }
1255}
1256static void proxy_html_hooks(apr_pool_t *p)
1257{
1258    static const char *aszSucc[] = { "mod_filter.c", NULL };
1259    ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1260                                       NULL, AP_FTYPE_RESOURCE,
1261                          AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH);
1262    /* move this to pre_config so old_expr is available to interpret
1263     * old-style conditions on URL maps.
1264     */
1265    ap_hook_pre_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE);
1266    ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE);
1267}
1268
/* Module definition: ties the configuration callbacks, the directive table
 * and the hook registration function together.
 */
AP_DECLARE_MODULE(proxy_html) = {
    STANDARD20_MODULE_STUFF,
    proxy_html_config,      /* create per-directory config */
    proxy_html_merge,       /* merge per-directory configs */
    NULL,                   /* no per-server config creator */
    NULL,                   /* no per-server config merger */
    proxy_html_cmds,        /* directive table */
    proxy_html_hooks        /* register filter and hooks */
};
1278