1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * mod_substitute.c: Perform content rewriting on the fly
19 */
20
21#include "httpd.h"
22#include "http_config.h"
23#include "http_core.h"
24#include "http_log.h"
25#include "apr_general.h"
26#include "apr_strings.h"
27#include "apr_strmatch.h"
28#include "apr_lib.h"
29#include "util_filter.h"
30#include "util_varbuf.h"
31#include "apr_buckets.h"
32#include "http_request.h"
33#define APR_WANT_STRFUNC
34#include "apr_want.h"
35
36static const char substitute_filter_name[] = "SUBSTITUTE";
37
38module AP_MODULE_DECLARE_DATA substitute_module;
39
/*
 * One parsed Substitute rule. Exactly one of 'pattern' and 'regexp' is
 * expected to be non-NULL: set_pattern() fills 'pattern' for rules that
 * carry the 'n' flag and 'regexp' otherwise.
 */
typedef struct subst_pattern_t {
    /* precompiled fixed-string matcher, or NULL for a regex rule */
    const apr_strmatch_pattern *pattern;
    /* compiled regular expression, or NULL for a fixed-string rule */
    const ap_regex_t *regexp;
    /* replacement text as written in the directive */
    const char *replacement;
    /* strlen(replacement), computed once in set_pattern() */
    apr_size_t replen;
    /* length of the fixed search string; left at 0 for regex rules */
    apr_size_t patlen;
    /*
     * non-zero: do_pattmatch() copies the rewritten line into one new
     * bucket; zero: it splits buckets in place instead
     */
    int flatten;
} subst_pattern_t;
48
/* Per-directory configuration: the list of rules to apply. */
typedef struct {
    /* array of subst_pattern_t, walked in order by do_pattmatch() */
    apr_array_header_t *patterns;
} subst_dir_conf;
52
/*
 * Per-filter state, allocated on the first invocation of
 * substitute_filter() and kept in f->ctx.
 */
typedef struct {
    /* data of an incomplete line (no newline seen yet) */
    apr_bucket_brigade *linebb;
    /* spare brigade given to ap_save_brigade(), then swapped with linebb */
    apr_bucket_brigade *linesbb;
    /* everything that is ready to be passed to the next filter */
    apr_bucket_brigade *passbb;
    /* working brigade handed to do_pattmatch() for one line */
    apr_bucket_brigade *pattbb;
    /* scratch subpool of r->pool, cleared by substitute_filter() */
    apr_pool_t *tpool;
} substitute_module_ctx;
60
61static void *create_substitute_dcfg(apr_pool_t *p, char *d)
62{
63    subst_dir_conf *dcfg =
64    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
65
66    dcfg->patterns = apr_array_make(p, 10, sizeof(subst_pattern_t));
67    return dcfg;
68}
69
70static void *merge_substitute_dcfg(apr_pool_t *p, void *basev, void *overv)
71{
72    subst_dir_conf *a =
73    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
74    subst_dir_conf *base = (subst_dir_conf *) basev;
75    subst_dir_conf *over = (subst_dir_conf *) overv;
76
77    a->patterns = apr_array_append(p, over->patterns,
78                                                  base->patterns);
79    return a;
80}
81
/*
 * Once more than this many buckets have been queued for output while
 * processing a single input bucket, substitute_filter() flushes them
 * down the filter chain.
 */
#define AP_MAX_BUCKETS 1000
/*
 * We want to limit the memory usage in a way that is predictable. Therefore
 * we limit the resulting length of the line to this value.
 */
#define AP_SUBST_MAX_LINE_LENGTH (128*MAX_STRING_LEN)
88
/*
 * Cut a match out of bucket b.
 *
 * b is split at 'offset'; the bucket that follows is split again at
 * 'patlen' and deleted. On exit b names the bucket after the removed
 * piece, and tmp_b names the deleted bucket, so the caller must not use
 * tmp_b again before reassigning it. b and tmp_b must be lvalues; b is
 * evaluated more than once.
 */
#define SEDRMPATBCKT(b, offset, tmp_b, patlen) do {  \
    apr_bucket_split(b, offset);                     \
    tmp_b = APR_BUCKET_NEXT(b);                      \
    apr_bucket_split(tmp_b, patlen);                 \
    b = APR_BUCKET_NEXT(tmp_b);                      \
    apr_bucket_delete(tmp_b);                        \
} while (0)
96
97static apr_status_t do_pattmatch(ap_filter_t *f, apr_bucket *inb,
98                                 apr_bucket_brigade *mybb,
99                                 apr_pool_t *pool)
100{
101    int i;
102    int force_quick = 0;
103    ap_regmatch_t regm[AP_MAX_REG_MATCH];
104    apr_size_t bytes;
105    apr_size_t len;
106    const char *buff;
107    struct ap_varbuf vb;
108    apr_bucket *b;
109    apr_bucket *tmp_b;
110
111    subst_dir_conf *cfg =
112    (subst_dir_conf *) ap_get_module_config(f->r->per_dir_config,
113                                             &substitute_module);
114    subst_pattern_t *script;
115
116    APR_BRIGADE_INSERT_TAIL(mybb, inb);
117    ap_varbuf_init(pool, &vb, 0);
118
119    script = (subst_pattern_t *) cfg->patterns->elts;
120    /*
121     * Simple optimization. If we only have one pattern, then
122     * we can safely avoid the overhead of flattening
123     */
124    if (cfg->patterns->nelts == 1) {
125       force_quick = 1;
126    }
127    for (i = 0; i < cfg->patterns->nelts; i++) {
128        for (b = APR_BRIGADE_FIRST(mybb);
129             b != APR_BRIGADE_SENTINEL(mybb);
130             b = APR_BUCKET_NEXT(b)) {
131            if (APR_BUCKET_IS_METADATA(b)) {
132                /*
133                 * we should NEVER see this, because we should never
134                 * be passed any, but "handle" it just in case.
135                 */
136                continue;
137            }
138            if (apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ)
139                    == APR_SUCCESS) {
140                int have_match = 0;
141                vb.strlen = 0;
142                if (script->pattern) {
143                    const char *repl;
144                    /*
145                     * space_left counts how many bytes we have left until the
146                     * line length reaches AP_SUBST_MAX_LINE_LENGTH.
147                     */
148                    apr_size_t space_left = AP_SUBST_MAX_LINE_LENGTH;
149                    apr_size_t repl_len = strlen(script->replacement);
150                    while ((repl = apr_strmatch(script->pattern, buff, bytes)))
151                    {
152                        have_match = 1;
153                        /* get offset into buff for pattern */
154                        len = (apr_size_t) (repl - buff);
155                        if (script->flatten && !force_quick) {
156                            /*
157                             * We are flattening the buckets here, meaning
158                             * that we don't do the fast bucket splits.
159                             * Instead we copy over what the buckets would
160                             * contain and use them. This is slow, since we
161                             * are constanting allocing space and copying
162                             * strings.
163                             */
164                            if (vb.strlen + len + repl_len > AP_SUBST_MAX_LINE_LENGTH)
165                                return APR_ENOMEM;
166                            ap_varbuf_strmemcat(&vb, buff, len);
167                            ap_varbuf_strmemcat(&vb, script->replacement, repl_len);
168                        }
169                        else {
170                            /*
171                             * The string before the match but after the
172                             * previous match (if any) has length 'len'.
173                             * Check if we still have space for this string and
174                             * the replacement string.
175                             */
176                            if (space_left < len + repl_len)
177                                return APR_ENOMEM;
178                            space_left -= len + repl_len;
179                            /*
180                             * We now split off the string before the match
181                             * as its own bucket, then isolate the matched
182                             * string and delete it.
183                             */
184                            SEDRMPATBCKT(b, len, tmp_b, script->patlen);
185                            /*
186                             * Finally, we create a bucket that contains the
187                             * replacement...
188                             */
189                            tmp_b = apr_bucket_transient_create(script->replacement,
190                                      script->replen,
191                                      f->r->connection->bucket_alloc);
192                            /* ... and insert it */
193                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
194                        }
195                        /* now we need to adjust buff for all these changes */
196                        len += script->patlen;
197                        bytes -= len;
198                        buff += len;
199                    }
200                    if (have_match) {
201                        if (script->flatten && !force_quick) {
202                            /* XXX: we should check for AP_MAX_BUCKETS here and
203                             * XXX: call ap_pass_brigade accordingly
204                             */
205                            char *copy = ap_varbuf_pdup(pool, &vb, NULL, 0,
206                                                        buff, bytes, &len);
207                            tmp_b = apr_bucket_pool_create(copy, len, pool,
208                                                           f->r->connection->bucket_alloc);
209                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
210                            apr_bucket_delete(b);
211                            b = tmp_b;
212                        }
213                        else {
214                            /*
215                             * We want the behaviour to be predictable.
216                             * Therefore we try to always error out if the
217                             * line length is larger than the limit,
218                             * regardless of the content of the line. So,
219                             * let's check if the remaining non-matching
220                             * string does not exceed the limit.
221                             */
222                            if (space_left < b->length)
223                                return APR_ENOMEM;
224                        }
225                    }
226                }
227                else if (script->regexp) {
228                    int left = bytes;
229                    const char *pos = buff;
230                    char *repl;
231                    apr_size_t space_left = AP_SUBST_MAX_LINE_LENGTH;
232                    while (!ap_regexec_len(script->regexp, pos, left,
233                                       AP_MAX_REG_MATCH, regm, 0)) {
234                        apr_status_t rv;
235                        have_match = 1;
236                        if (script->flatten && !force_quick) {
237                            /* copy bytes before the match */
238                            if (regm[0].rm_so > 0)
239                                ap_varbuf_strmemcat(&vb, pos, regm[0].rm_so);
240                            /* add replacement string */
241                            rv = ap_varbuf_regsub(&vb, script->replacement, pos,
242                                                  AP_MAX_REG_MATCH, regm,
243                                                  AP_SUBST_MAX_LINE_LENGTH - vb.strlen);
244                            if (rv != APR_SUCCESS)
245                                return rv;
246                        }
247                        else {
248                            apr_size_t repl_len;
249                            /* acount for string before the match */
250                            if (space_left <= regm[0].rm_so)
251                                return APR_ENOMEM;
252                            space_left -= regm[0].rm_so;
253                            rv = ap_pregsub_ex(pool, &repl,
254                                               script->replacement, pos,
255                                               AP_MAX_REG_MATCH, regm,
256                                               space_left);
257                            if (rv != APR_SUCCESS)
258                                return rv;
259                            repl_len = strlen(repl);
260                            space_left -= repl_len;
261                            len = (apr_size_t) (regm[0].rm_eo - regm[0].rm_so);
262                            SEDRMPATBCKT(b, regm[0].rm_so, tmp_b, len);
263                            tmp_b = apr_bucket_transient_create(repl, repl_len,
264                                                f->r->connection->bucket_alloc);
265                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
266                        }
267                        /*
268                         * reset to past what we just did. pos now maps to b
269                         * again
270                         */
271                        pos += regm[0].rm_eo;
272                        left -= regm[0].rm_eo;
273                    }
274                    if (have_match && script->flatten && !force_quick) {
275                        char *copy;
276                        /* Copy result plus the part after the last match into
277                         * a bucket.
278                         */
279                        copy = ap_varbuf_pdup(pool, &vb, NULL, 0, pos, left,
280                                              &len);
281                        tmp_b = apr_bucket_pool_create(copy, len, pool,
282                                           f->r->connection->bucket_alloc);
283                        APR_BUCKET_INSERT_BEFORE(b, tmp_b);
284                        apr_bucket_delete(b);
285                        b = tmp_b;
286                    }
287                }
288                else {
289                    ap_assert(0);
290                    continue;
291                }
292            }
293        }
294        script++;
295    }
296    ap_varbuf_free(&vb);
297    return APR_SUCCESS;
298}
299
/*
 * Output filter entry point: cut the response body into lines, run
 * do_pattmatch() on every complete line and pass the result on.
 *
 * f   filter instance; f->ctx holds a substitute_module_ctx that is
 *     created on the first call
 * bb  incoming brigade; its buckets are consumed one at a time
 *
 * Returns APR_SUCCESS, or the first error reported by
 * apr_brigade_pflatten(), do_pattmatch() or ap_pass_brigade(). APR_ENOMEM
 * is used for a line longer than AP_SUBST_MAX_LINE_LENGTH and is logged
 * before returning.
 */
static apr_status_t substitute_filter(ap_filter_t *f, apr_bucket_brigade *bb)
{
    apr_size_t bytes;
    apr_size_t len;
    apr_size_t fbytes;
    const char *buff;
    const char *nl = NULL;
    char *bflat;
    apr_bucket *b;
    apr_bucket *tmp_b;
    apr_bucket_brigade *tmp_bb = NULL;
    apr_status_t rv;

    substitute_module_ctx *ctx = f->ctx;

    /*
     * First time around? Create the saved bb that we used for each pass
     * through. Note that we can also get here when we explicitly clear ctx,
     * for error handling
     */
    if (!ctx) {
        f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(*ctx));
        /*
         * Create all the temporary brigades we need and reuse them to avoid
         * creating them over and over again from r->pool which would cost a
         * lot of memory in some cases.
         */
        ctx->linebb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
        ctx->linesbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
        ctx->pattbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
        /*
         * Everything to be passed to the next filter goes in
         * here, our pass brigade.
         */
        ctx->passbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
        /* Create our temporary pool only once */
        apr_pool_create(&(ctx->tpool), f->r->pool);
        /* the body length may change, so the announced one is dropped */
        apr_table_unset(f->r->headers_out, "Content-Length");
    }

    /*
     * Shortcircuit processing
     */
    if (APR_BRIGADE_EMPTY(bb))
        return APR_SUCCESS;

    /*
     * Here's the concept:
     *  Read in the data and look for newlines. Once we
     *  find a full "line", add it to our working brigade.
     *  If we've finished reading the brigade and we have
     *  any left over data (not a "full" line), store that
     *  for the next pass.
     *
     * Note: anything stored in ctx->linebb for sure does not have
     * a newline char, so we don't concat that bb with the
     * new bb, since we would be spending time searching for the newline
     * in data we know it doesn't exist. So instead, we simply scan
     * our current bb and, if we see a newline, prepend ctx->linebb
     * to the front of it. This makes the code much less straight-
     * forward (otherwise we could APR_BRIGADE_CONCAT(ctx->linebb, bb)
     * and just scan for newlines and not bother with needing to know
     * when ctx->linebb needs to be reset) but also faster. We'll take
     * the speed.
     *
     * Note: apr_brigade_split_line would be nice here, but we
     * really can't use it since we need more control and we want
     * to re-use already read bucket data.
     *
     * See mod_include if still confused :)
     */

    while ((b = APR_BRIGADE_FIRST(bb)) && (b != APR_BRIGADE_SENTINEL(bb))) {
        if (APR_BUCKET_IS_EOS(b)) {
            /*
             * if we see the EOS, then we need to pass along everything we
             * have. But if the ctx->linebb isn't empty, then we need to add
             * that to the end of what we'll be passing.
             */
            if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
                rv = apr_brigade_pflatten(ctx->linebb, &bflat,
                                          &fbytes, ctx->tpool);
                if (rv != APR_SUCCESS)
                    goto err;
                if (fbytes > AP_SUBST_MAX_LINE_LENGTH) {
                    rv = APR_ENOMEM;
                    goto err;
                }
                tmp_b = apr_bucket_transient_create(bflat, fbytes,
                                                f->r->connection->bucket_alloc);
                rv = do_pattmatch(f, tmp_b, ctx->pattbb, ctx->tpool);
                if (rv != APR_SUCCESS)
                    goto err;
                APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
            }
            apr_brigade_cleanup(ctx->linebb);
            APR_BUCKET_REMOVE(b);
            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
        }
        /*
         * No need to handle FLUSH buckets separately as we call
         * ap_pass_brigade anyway at the end of the loop.
         */
        else if (APR_BUCKET_IS_METADATA(b)) {
            APR_BUCKET_REMOVE(b);
            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
        }
        else {
            /*
             * We have actual "data" so read in as much as we can and start
             * scanning and splitting from our read buffer
             */
            rv = apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ);
            /*
             * NOTE(review): a failed read is treated like an empty bucket,
             * i.e. the bucket is dropped and rv is not reported -- confirm
             * that this is intended.
             */
            if (rv != APR_SUCCESS || bytes == 0) {
                apr_bucket_delete(b);
            }
            else {
                /* buckets queued in ctx->passbb since the last flush */
                int num = 0;
                while (bytes > 0) {
                    nl = memchr(buff, APR_ASCII_LF, bytes);
                    if (nl) {
                        len = (apr_size_t) (nl - buff) + 1;
                        /* split *after* the newline */
                        apr_bucket_split(b, len);
                        /*
                         * We've likely read more data, so bypass rereading
                         * bucket data and continue scanning through this
                         * buffer
                         */
                        bytes -= len;
                        buff += len;
                        /*
                         * we need b to be updated for future potential
                         * splitting
                         */
                        tmp_b = APR_BUCKET_NEXT(b);
                        APR_BUCKET_REMOVE(b);
                        /*
                         * Hey, we found a newline! Don't forget the old
                         * stuff that needs to be added to the front. So we
                         * add the split bucket to the end, flatten the whole
                         * bb, morph the whole shebang into a bucket which is
                         * then added to the tail of the newline bb.
                         */
                        if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
                            APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
                            rv = apr_brigade_pflatten(ctx->linebb, &bflat,
                                                      &fbytes, ctx->tpool);
                            if (rv != APR_SUCCESS)
                                goto err;
                            if (fbytes > AP_SUBST_MAX_LINE_LENGTH) {
                                /* Avoid pflattening further lines, we will
                                 * abort later on anyway.
                                 */
                                rv = APR_ENOMEM;
                                goto err;
                            }
                            b = apr_bucket_transient_create(bflat, fbytes,
                                            f->r->connection->bucket_alloc);
                            apr_brigade_cleanup(ctx->linebb);
                        }
                        rv = do_pattmatch(f, b, ctx->pattbb, ctx->tpool);
                        if (rv != APR_SUCCESS)
                            goto err;
                        /*
                         * Add the buckets produced for this line to num.
                         * We walk ctx->pattbb, but because num is not reset
                         * per line and ctx->pattbb is appended to
                         * ctx->passbb right below, num tracks how many
                         * buckets have been queued in ctx->passbb for the
                         * current input bucket since the last flush.
                         */
                        for (b = APR_BRIGADE_FIRST(ctx->pattbb);
                             b != APR_BRIGADE_SENTINEL(ctx->pattbb);
                             b = APR_BUCKET_NEXT(b)) {
                            num++;
                        }
                        APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
                        /*
                         * If the number of buckets in ctx->passbb reaches an
                         * "insane" level, we consume much memory for all the
                         * buckets as such. So lets flush them down the chain
                         * in this case and thus clear ctx->passbb. This frees
                         * the buckets memory for further processing.
                         * Usually this condition should not become true, but
                         * it is a safety measure for edge cases.
                         */
                        if (num > AP_MAX_BUCKETS) {
                            b = apr_bucket_flush_create(
                                                f->r->connection->bucket_alloc);
                            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
                            rv = ap_pass_brigade(f->next, ctx->passbb);
                            apr_brigade_cleanup(ctx->passbb);
                            num = 0;
                            apr_pool_clear(ctx->tpool);
                            if (rv != APR_SUCCESS)
                                goto err;
                        }
                        /* continue with the remainder left by the split */
                        b = tmp_b;
                    }
                    else {
                        /*
                         * no newline in whatever is left of this buffer so
                         * tuck data away and get next bucket
                         */
                        APR_BUCKET_REMOVE(b);
                        APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
                        bytes = 0;
                    }
                }
            }
        }
        /* hand over whatever this input bucket produced */
        if (!APR_BRIGADE_EMPTY(ctx->passbb)) {
            rv = ap_pass_brigade(f->next, ctx->passbb);
            apr_brigade_cleanup(ctx->passbb);
            if (rv != APR_SUCCESS)
                goto err;
        }
        apr_pool_clear(ctx->tpool);
    }

    /* Anything left we want to save/setaside for the next go-around */
    if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
        /*
         * Provide ap_save_brigade with an existing empty brigade
         * (ctx->linesbb) to avoid creating a new one.
         * NOTE(review): the status returned by ap_save_brigade is not
         * checked here.
         */
        ap_save_brigade(f, &(ctx->linesbb), &(ctx->linebb), f->r->pool);
        /* swap the two so that ctx->linebb names the saved data again */
        tmp_bb = ctx->linebb;
        ctx->linebb = ctx->linesbb;
        ctx->linesbb = tmp_bb;
    }

    return APR_SUCCESS;
err:
    /* APR_ENOMEM is what this module uses for "line too long" */
    if (rv == APR_ENOMEM)
        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01328) "Line too long, URI %s",
                      f->r->uri);
    apr_pool_clear(ctx->tpool);
    return rv;
}
539
540static const char *set_pattern(cmd_parms *cmd, void *cfg, const char *line)
541{
542    char *from = NULL;
543    char *to = NULL;
544    char *flags = NULL;
545    char *ourline;
546    char delim;
547    subst_pattern_t *nscript;
548    int is_pattern = 0;
549    int ignore_case = 0;
550    int flatten = 1;
551    ap_regex_t *r = NULL;
552
553    if (apr_tolower(*line) != 's') {
554        return "Bad Substitute format, must be an s/// pattern";
555    }
556    ourline = apr_pstrdup(cmd->pool, line);
557    delim = *++ourline;
558    if (delim)
559        from = ++ourline;
560    if (from) {
561        if (*ourline != delim) {
562            while (*++ourline && *ourline != delim);
563        }
564        if (*ourline) {
565            *ourline = '\0';
566            to = ++ourline;
567        }
568    }
569    if (to) {
570        if (*ourline != delim) {
571            while (*++ourline && *ourline != delim);
572        }
573        if (*ourline) {
574            *ourline = '\0';
575            flags = ++ourline;
576        }
577    }
578
579    if (!delim || !from || !*from || !to) {
580        return "Bad Substitute format, must be a complete s/// pattern";
581    }
582
583    if (flags) {
584        while (*flags) {
585            delim = apr_tolower(*flags);    /* re-use */
586            if (delim == 'i')
587                ignore_case = 1;
588            else if (delim == 'n')
589                is_pattern = 1;
590            else if (delim == 'f')
591                flatten = 1;
592            else if (delim == 'q')
593                flatten = 0;
594            else
595                return "Bad Substitute flag, only s///[infq] are supported";
596            flags++;
597        }
598    }
599
600    /* first see if we can compile the regex */
601    if (!is_pattern) {
602        r = ap_pregcomp(cmd->pool, from, AP_REG_EXTENDED |
603                        (ignore_case ? AP_REG_ICASE : 0));
604        if (!r)
605            return "Substitute could not compile regex";
606    }
607    nscript = apr_array_push(((subst_dir_conf *) cfg)->patterns);
608    /* init the new entries */
609    nscript->pattern = NULL;
610    nscript->regexp = NULL;
611    nscript->replacement = NULL;
612    nscript->patlen = 0;
613
614    if (is_pattern) {
615        nscript->patlen = strlen(from);
616        nscript->pattern = apr_strmatch_precompile(cmd->pool, from,
617                                                   !ignore_case);
618    }
619    else {
620        nscript->regexp = r;
621    }
622
623    nscript->replacement = to;
624    nscript->replen = strlen(to);
625    nscript->flatten = flatten;
626
627    return NULL;
628}
629
/*
 * NOTE(review): PROTO_FLAGS is defined here but is not referenced by the
 * registration call below.
 */
#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
/*
 * Register substitute_filter() as the output filter named "SUBSTITUTE"
 * with type AP_FTYPE_RESOURCE and no init function. 'pool' is not used.
 */
static void register_hooks(apr_pool_t *pool)
{
    ap_register_output_filter(substitute_filter_name, substitute_filter,
                              NULL, AP_FTYPE_RESOURCE);
}
636
637static const command_rec substitute_cmds[] = {
638    AP_INIT_TAKE1("Substitute", set_pattern, NULL, OR_ALL,
639                  "Pattern to filter the response content (s/foo/bar/[inf])"),
640    {NULL}
641};
642
/*
 * Module record: per-directory configuration only, no per-server
 * configuration.
 */
AP_DECLARE_MODULE(substitute) = {
    STANDARD20_MODULE_STUFF,
    create_substitute_dcfg,     /* dir config creator */
    merge_substitute_dcfg,      /* dir merger --- concatenates both pattern lists */
    NULL,                       /* server config */
    NULL,                       /* merge server config */
    substitute_cmds,            /* command table */
    register_hooks              /* register hooks */
};
652