1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * mod_substitute.c: Perform content rewriting on the fly
19 */
20
21#include "httpd.h"
22#include "http_config.h"
23#include "http_core.h"
24#include "apr_general.h"
25#include "apr_strings.h"
26#include "apr_strmatch.h"
27#include "apr_lib.h"
28#include "util_filter.h"
29#include "apr_buckets.h"
30#include "http_request.h"
31#define APR_WANT_STRFUNC
32#include "apr_want.h"
33
34static const char substitute_filter_name[] = "SUBSTITUTE";
35
36module AP_MODULE_DECLARE_DATA substitute_module;
37
38typedef struct subst_pattern_t {
39    const apr_strmatch_pattern *pattern;
40    const ap_regex_t *regexp;
41    const char *replacement;
42    apr_size_t replen;
43    apr_size_t patlen;
44    int flatten;
45} subst_pattern_t;
46
47typedef struct {
48    apr_array_header_t *patterns;
49} subst_dir_conf;
50
51typedef struct {
52    apr_bucket_brigade *linebb;
53    apr_bucket_brigade *linesbb;
54    apr_bucket_brigade *passbb;
55    apr_bucket_brigade *pattbb;
56    apr_pool_t *tpool;
57} substitute_module_ctx;
58
59static void *create_substitute_dcfg(apr_pool_t *p, char *d)
60{
61    subst_dir_conf *dcfg =
62    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
63
64    dcfg->patterns = apr_array_make(p, 10, sizeof(subst_pattern_t));
65    return dcfg;
66}
67
68static void *merge_substitute_dcfg(apr_pool_t *p, void *basev, void *overv)
69{
70    subst_dir_conf *a =
71    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
72    subst_dir_conf *base = (subst_dir_conf *) basev;
73    subst_dir_conf *over = (subst_dir_conf *) overv;
74
75    a->patterns = apr_array_append(p, over->patterns,
76                                                  base->patterns);
77    return a;
78}
79
/*
 * Once more than this many buckets have been queued for output while
 * processing a single input bucket, substitute_filter() flushes them down
 * the filter chain to release their memory.
 */
#define AP_MAX_BUCKETS (1000)
81
/*
 * SEDSCAT: append the first `blen` bytes of `buff`, followed by the
 * NUL-terminated string `repl`, to the string `s1`.
 *
 *   s1   - char * lvalue holding the string built so far; may be NULL, in
 *          which case a new string is started. Updated in place.
 *   s2   - char * lvalue used as scratch storage.
 *   pool - pool that all copies are allocated from.
 */
#define SEDSCAT(s1, s2, pool, buff, blen, repl) do {           \
    s2 = apr_pstrmemdup(pool, buff, blen);                     \
    s1 = (s1) ? apr_pstrcat(pool, s1, s2, repl, NULL)          \
              : apr_pstrcat(pool, s2, repl, NULL);             \
} while (0)
92
/*
 * SEDRMPATBCKT: cut `patlen` bytes starting at `offset` out of bucket `b`.
 *
 * The bucket is split twice so that the matched span ends up in a bucket of
 * its own, which is then deleted. On exit `b` is the bucket that holds the
 * data following the removed span; the data preceding the span stays in the
 * bucket before it. `tmp_b` is an apr_bucket * lvalue used as scratch.
 */
#define SEDRMPATBCKT(b, offset, tmp_b, patlen) do {                      \
    apr_bucket_split((b), (offset));        /* [head][match + tail] */   \
    (tmp_b) = APR_BUCKET_NEXT(b);                                        \
    apr_bucket_split((tmp_b), (patlen));    /* [head][match][tail]  */   \
    (b) = APR_BUCKET_NEXT(tmp_b);           /* continue with [tail] */   \
    apr_bucket_delete(tmp_b);               /* drop [match]         */   \
} while (0)
100
101static void do_pattmatch(ap_filter_t *f, apr_bucket *inb,
102                         apr_bucket_brigade *mybb,
103                         apr_pool_t *tmp_pool)
104{
105    int i;
106    int force_quick = 0;
107    ap_regmatch_t regm[AP_MAX_REG_MATCH];
108    apr_size_t bytes;
109    apr_size_t len;
110    apr_size_t fbytes;
111    const char *buff;
112    const char *repl;
113    char *scratch;
114    char *p;
115    char *s1;
116    char *s2;
117    apr_bucket *b;
118    apr_bucket *tmp_b;
119    apr_pool_t *tpool;
120
121    subst_dir_conf *cfg =
122    (subst_dir_conf *) ap_get_module_config(f->r->per_dir_config,
123                                             &substitute_module);
124    subst_pattern_t *script;
125
126    APR_BRIGADE_INSERT_TAIL(mybb, inb);
127
128    script = (subst_pattern_t *) cfg->patterns->elts;
129    apr_pool_create(&tpool, tmp_pool);
130    scratch = NULL;
131    fbytes = 0;
132    /*
133     * Simple optimization. If we only have one pattern, then
134     * we can safely avoid the overhead of flattening
135     */
136    if (cfg->patterns->nelts == 1) {
137       force_quick = 1;
138    }
139    for (i = 0; i < cfg->patterns->nelts; i++) {
140        for (b = APR_BRIGADE_FIRST(mybb);
141             b != APR_BRIGADE_SENTINEL(mybb);
142             b = APR_BUCKET_NEXT(b)) {
143            if (APR_BUCKET_IS_METADATA(b)) {
144                /*
145                 * we should NEVER see this, because we should never
146                 * be passed any, but "handle" it just in case.
147                 */
148                continue;
149            }
150            if (apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ)
151                    == APR_SUCCESS) {
152                s1 = NULL;
153                if (script->pattern) {
154                    while ((repl = apr_strmatch(script->pattern, buff, bytes)))
155                    {
156                        /* get offset into buff for pattern */
157                        len = (apr_size_t) (repl - buff);
158                        if (script->flatten && !force_quick) {
159                            /*
160                             * We are flattening the buckets here, meaning
161                             * that we don't do the fast bucket splits.
162                             * Instead we copy over what the buckets would
163                             * contain and use them. This is slow, since we
164                             * are constanting allocing space and copying
165                             * strings.
166                             */
167                            SEDSCAT(s1, s2, tmp_pool, buff, len,
168                                    script->replacement);
169                        }
170                        else {
171                            /*
172                             * We now split off the stuff before the regex
173                             * as its own bucket, then isolate the pattern
174                             * and delete it.
175                             */
176                            SEDRMPATBCKT(b, len, tmp_b, script->patlen);
177                            /*
178                             * Finally, we create a bucket that contains the
179                             * replacement...
180                             */
181                            tmp_b = apr_bucket_transient_create(script->replacement,
182                                      script->replen,
183                                      f->r->connection->bucket_alloc);
184                            /* ... and insert it */
185                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
186                        }
187                        /* now we need to adjust buff for all these changes */
188                        len += script->patlen;
189                        bytes -= len;
190                        buff += len;
191                    }
192                    if (script->flatten && s1 && !force_quick) {
193                        /*
194                         * we've finished looking at the bucket, so remove the
195                         * old one and add in our new one
196                         */
197                        s2 = apr_pstrmemdup(tmp_pool, buff, bytes);
198                        s1 = apr_pstrcat(tmp_pool, s1, s2, NULL);
199                        tmp_b = apr_bucket_transient_create(s1, strlen(s1),
200                                            f->r->connection->bucket_alloc);
201                        APR_BUCKET_INSERT_BEFORE(b, tmp_b);
202                        apr_bucket_delete(b);
203                        b = tmp_b;
204                    }
205
206                }
207                else if (script->regexp) {
208                    /*
209                     * we need a null terminated string here :(. To hopefully
210                     * save time and memory, we don't alloc for each run
211                     * through, but only if we need to have a larger chunk
212                     * to save the string to. So we keep track of how much
213                     * we've allocated and only re-alloc when we need it.
214                     * NOTE: this screams for a macro.
215                     */
216                    if (!scratch || (bytes + 1 > fbytes)) {
217                        fbytes = bytes + 1;
218                        scratch = apr_palloc(tpool, fbytes);
219                    }
220                    /* reset pointer to the scratch space */
221                    p = scratch;
222                    memcpy(p, buff, bytes);
223                    p[bytes] = '\0';
224                    while (!ap_regexec(script->regexp, p,
225                                       AP_MAX_REG_MATCH, regm, 0)) {
226                        /* first, grab the replacement string */
227                        repl = ap_pregsub(tmp_pool, script->replacement, p,
228                                          AP_MAX_REG_MATCH, regm);
229                        if (script->flatten && !force_quick) {
230                            SEDSCAT(s1, s2, tmp_pool, p, regm[0].rm_so, repl);
231                        }
232                        else {
233                            len = (apr_size_t) (regm[0].rm_eo - regm[0].rm_so);
234                            SEDRMPATBCKT(b, regm[0].rm_so, tmp_b, len);
235                            tmp_b = apr_bucket_transient_create(repl,
236                                                                strlen(repl),
237                                             f->r->connection->bucket_alloc);
238                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
239                        }
240                        /*
241                         * reset to past what we just did. buff now maps to b
242                         * again
243                         */
244                        p += regm[0].rm_eo;
245                    }
246                    if (script->flatten && s1 && !force_quick) {
247                        s1 = apr_pstrcat(tmp_pool, s1, p, NULL);
248                        tmp_b = apr_bucket_transient_create(s1, strlen(s1),
249                                            f->r->connection->bucket_alloc);
250                        APR_BUCKET_INSERT_BEFORE(b, tmp_b);
251                        apr_bucket_delete(b);
252                        b = tmp_b;
253                    }
254
255                }
256                else {
257                    /* huh? */
258                    continue;
259                }
260            }
261        }
262        script++;
263    }
264
265    apr_pool_destroy(tpool);
266
267    return;
268}
269
270static apr_status_t substitute_filter(ap_filter_t *f, apr_bucket_brigade *bb)
271{
272    apr_size_t bytes;
273    apr_size_t len;
274    apr_size_t fbytes;
275    const char *buff;
276    const char *nl = NULL;
277    char *bflat;
278    apr_bucket *b;
279    apr_bucket *tmp_b;
280    apr_bucket_brigade *tmp_bb = NULL;
281    apr_status_t rv;
282
283    substitute_module_ctx *ctx = f->ctx;
284
285    /*
286     * First time around? Create the saved bb that we used for each pass
287     * through. Note that we can also get here when we explicitly clear ctx,
288     * for error handling
289     */
290    if (!ctx) {
291        f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(*ctx));
292        /*
293         * Create all the temporary brigades we need and reuse them to avoid
294         * creating them over and over again from r->pool which would cost a
295         * lot of memory in some cases.
296         */
297        ctx->linebb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
298        ctx->linesbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
299        ctx->pattbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
300        /*
301         * Everything to be passed to the next filter goes in
302         * here, our pass brigade.
303         */
304        ctx->passbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
305        /* Create our temporary pool only once */
306        apr_pool_create(&(ctx->tpool), f->r->pool);
307        apr_table_unset(f->r->headers_out, "Content-Length");
308    }
309
310    /*
311     * Shortcircuit processing
312     */
313    if (APR_BRIGADE_EMPTY(bb))
314        return APR_SUCCESS;
315
316    /*
317     * Here's the concept:
318     *  Read in the data and look for newlines. Once we
319     *  find a full "line", add it to our working brigade.
320     *  If we've finished reading the brigade and we have
321     *  any left over data (not a "full" line), store that
322     *  for the next pass.
323     *
324     * Note: anything stored in ctx->linebb for sure does not have
325     * a newline char, so we don't concat that bb with the
326     * new bb, since we would spending time searching for the newline
327     * in data we know it doesn't exist. So instead, we simply scan
328     * our current bb and, if we see a newline, prepend ctx->linebb
329     * to the front of it. This makes the code much less straight-
330     * forward (otherwise we could APR_BRIGADE_CONCAT(ctx->linebb, bb)
331     * and just scan for newlines and not bother with needing to know
332     * when ctx->linebb needs to be reset) but also faster. We'll take
333     * the speed.
334     *
335     * Note: apr_brigade_split_line would be nice here, but we
336     * really can't use it since we need more control and we want
337     * to re-use already read bucket data.
338     *
339     * See mod_include if still confused :)
340     */
341
342    while ((b = APR_BRIGADE_FIRST(bb)) && (b != APR_BRIGADE_SENTINEL(bb))) {
343        if (APR_BUCKET_IS_EOS(b)) {
344            /*
345             * if we see the EOS, then we need to pass along everything we
346             * have. But if the ctx->linebb isn't empty, then we need to add
347             * that to the end of what we'll be passing.
348             */
349            if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
350                rv = apr_brigade_pflatten(ctx->linebb, &bflat,
351                                          &fbytes, ctx->tpool);
352                tmp_b = apr_bucket_transient_create(bflat, fbytes,
353                                                f->r->connection->bucket_alloc);
354                do_pattmatch(f, tmp_b, ctx->pattbb, ctx->tpool);
355                APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
356            }
357            apr_brigade_cleanup(ctx->linebb);
358            APR_BUCKET_REMOVE(b);
359            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
360        }
361        /*
362         * No need to handle FLUSH buckets separately as we call
363         * ap_pass_brigade anyway at the end of the loop.
364         */
365        else if (APR_BUCKET_IS_METADATA(b)) {
366            APR_BUCKET_REMOVE(b);
367            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
368        }
369        else {
370            /*
371             * We have actual "data" so read in as much as we can and start
372             * scanning and splitting from our read buffer
373             */
374            rv = apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ);
375            if (rv != APR_SUCCESS || bytes == 0) {
376                apr_bucket_delete(b);
377            }
378            else {
379                int num = 0;
380                while (bytes > 0) {
381                    nl = memchr(buff, APR_ASCII_LF, bytes);
382                    if (nl) {
383                        len = (apr_size_t) (nl - buff) + 1;
384                        /* split *after* the newline */
385                        apr_bucket_split(b, len);
386                        /*
387                         * We've likely read more data, so bypass rereading
388                         * bucket data and continue scanning through this
389                         * buffer
390                         */
391                        bytes -= len;
392                        buff += len;
393                        /*
394                         * we need b to be updated for future potential
395                         * splitting
396                         */
397                        tmp_b = APR_BUCKET_NEXT(b);
398                        APR_BUCKET_REMOVE(b);
399                        /*
400                         * Hey, we found a newline! Don't forget the old
401                         * stuff that needs to be added to the front. So we
402                         * add the split bucket to the end, flatten the whole
403                         * bb, morph the whole shebang into a bucket which is
404                         * then added to the tail of the newline bb.
405                         */
406                        if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
407                            APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
408                            rv = apr_brigade_pflatten(ctx->linebb, &bflat,
409                                                      &fbytes, ctx->tpool);
410                            b = apr_bucket_transient_create(bflat, fbytes,
411                                            f->r->connection->bucket_alloc);
412                            apr_brigade_cleanup(ctx->linebb);
413                        }
414                        do_pattmatch(f, b, ctx->pattbb, ctx->tpool);
415                        /*
416                         * Count how many buckets we have in ctx->passbb
417                         * so far. Yes, this is correct we count ctx->passbb
418                         * and not ctx->pattbb as we do not reset num on every
419                         * iteration.
420                         */
421                        for (b = APR_BRIGADE_FIRST(ctx->pattbb);
422                             b != APR_BRIGADE_SENTINEL(ctx->pattbb);
423                             b = APR_BUCKET_NEXT(b)) {
424                            num++;
425                        }
426                        APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
427                        /*
428                         * If the number of buckets in ctx->passbb reaches an
429                         * "insane" level, we consume much memory for all the
430                         * buckets as such. So lets flush them down the chain
431                         * in this case and thus clear ctx->passbb. This frees
432                         * the buckets memory for further processing.
433                         * Usually this condition should not become true, but
434                         * it is a safety measure for edge cases.
435                         */
436                        if (num > AP_MAX_BUCKETS) {
437                            b = apr_bucket_flush_create(
438                                                f->r->connection->bucket_alloc);
439                            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
440                            rv = ap_pass_brigade(f->next, ctx->passbb);
441                            apr_brigade_cleanup(ctx->passbb);
442                            num = 0;
443                            apr_pool_clear(ctx->tpool);
444                            if (rv != APR_SUCCESS)
445                                return rv;
446                        }
447                        b = tmp_b;
448                    }
449                    else {
450                        /*
451                         * no newline in whatever is left of this buffer so
452                         * tuck data away and get next bucket
453                         */
454                        APR_BUCKET_REMOVE(b);
455                        APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
456                        bytes = 0;
457                    }
458                }
459            }
460        }
461        if (!APR_BRIGADE_EMPTY(ctx->passbb)) {
462            rv = ap_pass_brigade(f->next, ctx->passbb);
463            apr_brigade_cleanup(ctx->passbb);
464            if (rv != APR_SUCCESS) {
465                apr_pool_clear(ctx->tpool);
466                return rv;
467            }
468        }
469        apr_pool_clear(ctx->tpool);
470    }
471
472    /* Anything left we want to save/setaside for the next go-around */
473    if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
474        /*
475         * Provide ap_save_brigade with an existing empty brigade
476         * (ctx->linesbb) to avoid creating a new one.
477         */
478        ap_save_brigade(f, &(ctx->linesbb), &(ctx->linebb), f->r->pool);
479        tmp_bb = ctx->linebb;
480        ctx->linebb = ctx->linesbb;
481        ctx->linesbb = tmp_bb;
482    }
483
484    return APR_SUCCESS;
485}
486
487static const char *set_pattern(cmd_parms *cmd, void *cfg, const char *line)
488{
489    char *from = NULL;
490    char *to = NULL;
491    char *flags = NULL;
492    char *ourline;
493    char delim;
494    subst_pattern_t *nscript;
495    int is_pattern = 0;
496    int ignore_case = 0;
497    int flatten = 1;
498    ap_regex_t *r = NULL;
499
500    if (apr_tolower(*line) != 's') {
501        return "Bad Substitute format, must be an s/// pattern";
502    }
503    ourline = apr_pstrdup(cmd->pool, line);
504    delim = *++ourline;
505    if (delim)
506        from = ++ourline;
507    if (from) {
508        if (*ourline != delim) {
509            while (*++ourline && *ourline != delim);
510        }
511        if (*ourline) {
512            *ourline = '\0';
513            to = ++ourline;
514        }
515    }
516    if (to) {
517        if (*ourline != delim) {
518            while (*++ourline && *ourline != delim);
519        }
520        if (*ourline) {
521            *ourline = '\0';
522            flags = ++ourline;
523        }
524    }
525
526    if (!delim || !from || !*from || !to) {
527        return "Bad Substitute format, must be a complete s/// pattern";
528    }
529
530    if (flags) {
531        while (*flags) {
532            delim = apr_tolower(*flags);    /* re-use */
533            if (delim == 'i')
534                ignore_case = 1;
535            else if (delim == 'n')
536                is_pattern = 1;
537            else if (delim == 'f')
538                flatten = 1;
539            else if (delim == 'q')
540                flatten = 0;
541            else
542                return "Bad Substitute flag, only s///[infq] are supported";
543            flags++;
544        }
545    }
546
547    /* first see if we can compile the regex */
548    if (!is_pattern) {
549        r = ap_pregcomp(cmd->pool, from, AP_REG_EXTENDED |
550                        (ignore_case ? AP_REG_ICASE : 0));
551        if (!r)
552            return "Substitute could not compile regex";
553    }
554    nscript = apr_array_push(((subst_dir_conf *) cfg)->patterns);
555    /* init the new entries */
556    nscript->pattern = NULL;
557    nscript->regexp = NULL;
558    nscript->replacement = NULL;
559    nscript->patlen = 0;
560
561    if (is_pattern) {
562        nscript->patlen = strlen(from);
563        nscript->pattern = apr_strmatch_precompile(cmd->pool, from,
564                                                   !ignore_case);
565    }
566    else {
567        nscript->regexp = r;
568    }
569
570    nscript->replacement = to;
571    nscript->replen = strlen(to);
572    nscript->flatten = flatten;
573
574    return NULL;
575}
576
577#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
578static void register_hooks(apr_pool_t *pool)
579{
580    ap_register_output_filter(substitute_filter_name, substitute_filter,
581                              NULL, AP_FTYPE_RESOURCE);
582}
583
584static const command_rec substitute_cmds[] = {
585    AP_INIT_TAKE1("Substitute", set_pattern, NULL, OR_ALL,
586                  "Pattern to filter the response content (s/foo/bar/[inf])"),
587    {NULL}
588};
589
590module AP_MODULE_DECLARE_DATA substitute_module = {
591    STANDARD20_MODULE_STUFF,
592    create_substitute_dcfg,     /* dir config creater */
593    merge_substitute_dcfg,      /* dir merger --- default is to override */
594    NULL,                       /* server config */
595    NULL,                       /* merge server config */
596    substitute_cmds,            /* command table */
597    register_hooks              /* register hooks */
598};
599