1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * logresolve 2.0
19 *
20 * Tom Rathborne - tomr uunet.ca - http://www.uunet.ca/~tomr/
21 * UUNET Canada, April 16, 1995
22 *
23 * Rewritten by David Robinson. (drtr ast.cam.ac.uk)
24 * Rewritten again, and ported to APR by Colm MacCarthaigh
25 *
26 * Usage: logresolve [-s filename] [-c] < access_log > new_log
27 *
28 * Arguments:
29 *    -s filename     name of a file to record statistics
30 *    -c              check the DNS for a matching A record for the host.
31 *
32 * Notes:             (For historical interest)
33 *
34 * To generate meaningful statistics from an HTTPD log file, it's good
35 * to have the domain name of each machine that accessed your site, but
36 * doing this on the fly can slow HTTPD down.
37 *
38 * Compiling NCSA HTTPD with the -DMINIMAL_DNS flag turns IP#->hostname
39 * resolution off. Before running your stats program, just run your log
40 * file through this program (logresolve) and all of your IP numbers will
41 * be resolved into hostnames (where possible).
42 *
43 * logresolve takes an HTTPD access log (in the COMMON log file format,
44 * or any other format that has the IP number/domain name as the first
45 * field for that matter), and outputs the same file with all of the
46 * domain names looked up. Where no domain name can be found, the IP
47 * number is left in.
48 *
49 * To minimize impact on your nameserver, logresolve has its very own
50 * internal hash-table cache. This means that each IP number will only
51 * be looked up the first time it is found in the log file.
52 *
53 * The -c option causes logresolve to apply the same check as httpd
54 * compiled with -DMAXIMUM_DNS; after finding the hostname from the IP
55 * address, it looks up the IP addresses for the hostname and checks
56 * that one of these matches the original address.
57 */
58
59#include "apr.h"
60#include "apr_lib.h"
61#include "apr_hash.h"
62#include "apr_getopt.h"
63#include "apr_strings.h"
64#include "apr_file_io.h"
65#include "apr_network_io.h"
66
67#if APR_HAVE_STDLIB_H
68#include <stdlib.h>
69#endif
70
/*
 * Sizes, in bytes, of the buffer attached to the input file, the buffer
 * attached to the output file, and the buffer that holds one log line.
 * Each expansion is parenthesized so it stays a single operand when the
 * macro is used inside a larger expression (e.g. x / READ_BUF_SIZE).
 */
#define READ_BUF_SIZE  (128*1024)
#define WRITE_BUF_SIZE (128*1024)
#define LINE_BUF_SIZE  (128*1024)
74
/* Destination for diagnostics; main() opens it on stderr and usage() and
 * the statistics-file error message write to it. */
static apr_file_t *errfile;
/* Program name shown in messages; main() replaces the default with the
 * result of apr_filepath_name_get(argv[0]) when argc is non-zero. */
static const char *shortname = "logresolve";
/* Maps the first field of a log line (the address text) to the string that
 * is printed in its place on later occurrences. */
static apr_hash_t *cache;

/* Statistics */
/* Lines whose first field was found in the cache. */
static int cachehits = 0;
/* Addresses that were parsed and were not already in the cache; bumped
 * once per such address, whether or not the lookup then succeeds. */
static int cachesize = 0;
/* Non-empty input lines read. */
static int entries = 0;
/* First fields that parsed as an address (not necessarily resolved to a
 * hostname); incremented at the same point as cachesize. */
static int resolves = 0;
/* Lines passed through because the first field does not start with a hex
 * digit or ':', or because it failed to parse as an address. */
static int withname = 0;
/* Addresses whose reverse lookup worked but whose forward check did not. */
static int doublefailed = 0;
/* Addresses for which the reverse lookup produced no hostname. */
static int noreverse = 0;
87
88/*
89 * prints various statistics to output
90 */
91#define NL APR_EOL_STR
92static void print_statistics (apr_file_t *output)
93{
94    apr_file_printf(output, "logresolve Statistics:" NL);
95    apr_file_printf(output, "Entries: %d" NL, entries);
96    apr_file_printf(output, "    With name   : %d" NL, withname);
97    apr_file_printf(output, "    Resolves    : %d" NL, resolves);
98
99    if (noreverse) {
100        apr_file_printf(output, "    - No reverse : %d" NL,
101                        noreverse);
102    }
103
104    if (doublefailed) {
105        apr_file_printf(output, "    - Double lookup failed : %d" NL,
106                        doublefailed);
107    }
108
109    apr_file_printf(output, "Cache hits      : %d" NL, cachehits);
110    apr_file_printf(output, "Cache size      : %d" NL, cachesize);
111}
112
113/*
114 * usage info
115 */
116static void usage(void)
117{
118    apr_file_printf(errfile,
119    "%s -- Resolve IP-addresses to hostnames in Apache log files."           NL
120    "Usage: %s [-s STATFILE] [-c]"                                           NL
121                                                                             NL
122    "Options:"                                                               NL
123    "  -s   Record statistics to STATFILE when finished."                    NL
124                                                                             NL
125    "  -c   Perform double lookups when resolving IP addresses."            NL,
126    shortname, shortname);
127    exit(1);
128}
129#undef NL
130
131int main(int argc, const char * const argv[])
132{
133    apr_file_t * outfile;
134    apr_file_t * infile;
135    apr_getopt_t * o;
136    apr_pool_t * pool;
137    apr_pool_t *pline;
138    apr_status_t status;
139    const char * arg;
140    char * stats = NULL;
141    char * inbuffer;
142    char * outbuffer;
143    char * line;
144    int doublelookups = 0;
145
146    if (apr_app_initialize(&argc, &argv, NULL) != APR_SUCCESS) {
147        return 1;
148    }
149    atexit(apr_terminate);
150
151    if (argc) {
152        shortname = apr_filepath_name_get(argv[0]);
153    }
154
155    if (apr_pool_create(&pool, NULL) != APR_SUCCESS) {
156        return 1;
157    }
158    apr_file_open_stderr(&errfile, pool);
159    apr_getopt_init(&o, pool, argc, argv);
160
161    while (1) {
162        char opt;
163        status = apr_getopt(o, "s:c", &opt, &arg);
164        if (status == APR_EOF) {
165            break;
166        }
167        else if (status != APR_SUCCESS) {
168            usage();
169        }
170        else {
171            switch (opt) {
172            case 'c':
173                if (doublelookups) {
174                    usage();
175                }
176                doublelookups = 1;
177                break;
178            case 's':
179                if (stats) {
180                    usage();
181                }
182                stats = apr_pstrdup(pool, arg);
183                break;
184            } /* switch */
185        } /* else */
186    } /* while */
187
188    apr_file_open_stdout(&outfile, pool);
189    apr_file_open_stdin(&infile, pool);
190
191    /* Allocate two new 10k file buffers */
192    if (   (outbuffer = apr_palloc(pool, WRITE_BUF_SIZE)) == NULL
193        || (inbuffer  = apr_palloc(pool, READ_BUF_SIZE))  == NULL
194        || (line      = apr_palloc(pool, LINE_BUF_SIZE))  == NULL) {
195        return 1;
196    }
197
198    /* Set the buffers */
199    apr_file_buffer_set(infile, inbuffer, READ_BUF_SIZE);
200    apr_file_buffer_set(outfile, outbuffer, WRITE_BUF_SIZE);
201
202    cache = apr_hash_make(pool);
203    if(apr_pool_create(&pline, pool) != APR_SUCCESS){
204        return 1;
205    }
206
207    while (apr_file_gets(line, LINE_BUF_SIZE, infile) == APR_SUCCESS) {
208        char *hostname;
209        char *space;
210        apr_sockaddr_t *ip;
211        apr_sockaddr_t *ipdouble;
212        char dummy[] = " " APR_EOL_STR;
213
214        if (line[0] == '\0') {
215            continue;
216        }
217
218        /* Count our log entries */
219        entries++;
220
221        /* Check if this could even be an IP address */
222        if (!apr_isxdigit(line[0]) && line[0] != ':') {
223                withname++;
224            apr_file_puts(line, outfile);
225            continue;
226        }
227
228        /* Terminate the line at the next space */
229        if ((space = strchr(line, ' ')) != NULL) {
230            *space = '\0';
231        }
232        else {
233            space = dummy;
234        }
235
236        /* See if we have it in our cache */
237        hostname = (char *) apr_hash_get(cache, line, APR_HASH_KEY_STRING);
238        if (hostname) {
239            apr_file_printf(outfile, "%s %s", hostname, space + 1);
240            cachehits++;
241            continue;
242        }
243
244        /* Parse the IP address */
245        status = apr_sockaddr_info_get(&ip, line, APR_UNSPEC, 0, 0, pline);
246        if (status != APR_SUCCESS) {
247            /* Not an IP address */
248            withname++;
249            *space = ' ';
250            apr_file_puts(line, outfile);
251            continue;
252        }
253
254        /* This does not make much sense, but historically "resolves" means
255         * "parsed as an IP address". It does not mean we actually resolved
256         * the IP address into a hostname.
257         */
258        resolves++;
259
260        /* From here on our we cache each result, even if it was not
261         * succesful
262         */
263        cachesize++;
264
265        /* Try and perform a reverse lookup */
266        status = apr_getnameinfo(&hostname, ip, 0) != APR_SUCCESS;
267        if (status || hostname == NULL) {
268            /* Could not perform a reverse lookup */
269            *space = ' ';
270            apr_file_puts(line, outfile);
271            noreverse++;
272
273            /* Add to cache */
274            *space = '\0';
275            apr_hash_set(cache, line, APR_HASH_KEY_STRING,
276                         apr_pstrdup(apr_hash_pool_get(cache), line));
277            continue;
278        }
279
280        /* Perform a double lookup */
281        if (doublelookups) {
282            /* Do a forward lookup on our hostname, and see if that matches our
283             * original IP address.
284             */
285            status = apr_sockaddr_info_get(&ipdouble, hostname, ip->family, 0,
286                                           0, pline);
287            if (status == APR_SUCCESS ||
288                memcmp(ipdouble->ipaddr_ptr, ip->ipaddr_ptr, ip->ipaddr_len)) {
289                /* Double-lookup failed  */
290                *space = ' ';
291                apr_file_puts(line, outfile);
292                doublefailed++;
293
294                /* Add to cache */
295                *space = '\0';
296                apr_hash_set(cache, line, APR_HASH_KEY_STRING,
297                             apr_pstrdup(apr_hash_pool_get(cache), line));
298                continue;
299            }
300        }
301
302        /* Outout the resolved name */
303        apr_file_printf(outfile, "%s %s", hostname, space + 1);
304
305        /* Store it in the cache */
306        apr_hash_set(cache, line, APR_HASH_KEY_STRING,
307                     apr_pstrdup(apr_hash_pool_get(cache), hostname));
308
309        apr_pool_clear(pline);
310    }
311
312    /* Flush any remaining output */
313    apr_file_flush(outfile);
314
315    if (stats) {
316        apr_file_t *statsfile;
317        if (apr_file_open(&statsfile, stats,
318                       APR_FOPEN_WRITE | APR_FOPEN_CREATE | APR_FOPEN_TRUNCATE,
319                          APR_OS_DEFAULT, pool) != APR_SUCCESS) {
320            apr_file_printf(errfile, "%s: Could not open %s for writing.",
321                            shortname, stats);
322            return 1;
323        }
324        print_statistics(statsfile);
325        apr_file_close(statsfile);
326    }
327
328    return 0;
329}
330