1/*
2  bench.c - simple regex benchmark program
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9#ifdef HAVE_CONFIG_H
10#include <config.h>
11#endif /* HAVE_CONFIG_H */
12
13#include <stdio.h>
14#include <stdlib.h>
15#ifdef HAVE_GETOPT_H
16#include <getopt.h>
17#endif /* HAVE_GETOPT_H */
18#include <time.h>
19#include <unistd.h>
20#include <math.h>
21#include <sys/types.h>
22
23#if 0
24#include <hackerlab/rx-posix/regex.h>
25#else
26#include <regex.h>
27#endif
28
29/* T distribution for alpha = 0.025 (for 95% confidence).  XXX - is
30   this correct? */
31double t_distribution[] = {
32  12.71,
33  4.303,
34  3.182,
35  2.776,
36  2.571,
37  2.447,
38  2.365,
39  2.306,
40  2.262,
41  2.228,
42  2.201,
43  2.179,
44  2.160,
45  2.145,
46  2.131,
47  2.120,
48  2.110,
49  2.101,
50  2.093,
51  2.086,
52  2.080,
53  2.074,
54  2.069,
55  2.064,
56  2.060,
57  2.056,
58  2.052,
59  2.048,
60  2.045,
61  2.042
62};
63
64void
65stats(double *sample_data, int samples, int len)
66{
67  double mean, tmp1, tmp2, variance, stddev, error, percent;
68  int i;
69
70  mean = 0;
71  for (i = 0; i < samples; i++)
72    mean += sample_data[i];
73  mean = mean/i;
74  printf("# mean: %.5f\n", mean);
75
76  tmp1 = 0;
77  for (i = 0; i < samples; i++) {
78    tmp2 = sample_data[i] - mean;
79    tmp1 += tmp2*tmp2;
80  }
81  if (samples > 1)
82    variance = tmp1 / (samples-1);
83  else
84    variance = 0;
85  stddev = sqrt(variance);
86  printf("# variance: %.16f\n", variance);
87  printf("# standard deviation: %.16f\n", stddev);
88
89  error = t_distribution[samples-1] * stddev / sqrt(samples);
90  if (mean != 0)
91    percent = 100*error/mean;
92  else
93    percent = 0;
94  printf("# error: �%.16f (�%.4f%%)\n", error, percent);
95
96  printf("%d\t%.5f\t%.5f\n", len, mean, error);
97
98  fflush(stdout);
99}
100
101void
102run_tests(int len, int samples, double *sample_data, int repeats,
103	  regex_t *reobj, char *str, char *tmpbuf)
104{
105  int i, j, errcode;
106  clock_t c1, c2;
107  regmatch_t pmatch[10];
108
109
110  printf("# len = %d\n", len);
111  fflush(stdout);
112  for (i = 0; i < samples; i++) {
113    c1 = clock();
114    for (j = 0; j < repeats; j++)
115      if ((errcode = tre_regexec(reobj, str, 10, pmatch, 0))) {
116	tre_regerror(errcode, reobj, tmpbuf, 255);
117	printf("error: %s\n", tmpbuf);
118      }
119    c2 = clock();
120
121    sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
122
123    printf("# sample: %.5f sec, clocks: %ld\n",
124	   (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
125	   (long)(c2-c1));
126    fflush(stdout);
127  }
128  fflush(stdout);
129
130  for (i = 0; i < 10; i += 2) {
131    printf("# pmatch[%d].rm_so = %d\n", i/2, (int)pmatch[i/2].rm_so);
132    printf("# pmatch[%d].rm_eo = %d\n", i/2, (int)pmatch[i/2].rm_eo);
133  }
134}
135
136
137int
138main(int argc, char **argv)
139{
140  regex_t reobj;
141  char *str;
142  char tmpbuf[256];
143  int i, j;
144  int max_len = 1024*1024*10;
145  int steps = 20;
146  int repeats = 10;
147  int samples = 20;
148  int len;
149  clock_t c1, c2;
150  int opt;
151  double sample_data[30];
152
153  int test_id = -1;
154
155  while ((opt = getopt(argc, argv, "r:l:s:j:t:")) != -1) {
156    switch (opt) {
157    case 't':
158      test_id = atoi(optarg);
159      break;
160    case 'l':
161      max_len = atoi(optarg);
162      break;
163    case 'j':
164      steps = atoi(optarg);
165      break;
166    case 's':
167      samples = atoi(optarg);
168      break;
169    case 'r':
170      repeats = atoi(optarg);
171      break;
172    default:
173      printf("P�lli.\n");
174      return 1;
175    }
176  }
177
178  /* XXX - Check that the correct results are returned.  For example, GNU
179           regex-0.12 returns incorrect results for very long strings in
180	   test number 1. */
181
182  switch (test_id) {
183  case 0:
184    printf("# pattern: \"a*\"\n");
185    printf("# string:  \"aaaaaa...\"\n");
186    len = 0;
187    tre_regcomp(&reobj, "a*", REG_EXTENDED);
188    while (len <= max_len) {
189
190      str = malloc(sizeof(char) * (len+1));
191      for (i = 0; i < len; i++)
192	str[i] = 'a';
193      str[len-1] = '\0';
194
195      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
196      stats(sample_data, samples, len);
197      len = len + (max_len/steps);
198      free(str);
199    }
200    break;
201
202
203  case 1:
204    printf("# pattern: \"(a)*\"\n");
205    printf("# string:  \"aaaaaa...\"\n");
206    len = 0;
207    tre_regcomp(&reobj, "(a)*", REG_EXTENDED);
208    while (len <= max_len) {
209
210      str = malloc(sizeof(char) * (len+1));
211      for (i = 0; i < len; i++)
212	str[i] = 'a';
213      str[len-1] = '\0';
214
215      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
216      stats(sample_data, samples, len);
217      len = len + (max_len/steps);
218      free(str);
219    }
220    break;
221
222
223  case 2:
224    printf("# pattern: \"(a*)\"\n");
225    printf("# string:  \"aaaaaa...\"\n");    len = 0;
226    tre_regcomp(&reobj, "(a*)", REG_EXTENDED);
227    while (len <= max_len) {
228
229      str = malloc(sizeof(char) * (len+1));
230      for (i = 0; i < len; i++)
231	str[i] = 'a';
232      str[len-1] = '\0';
233
234      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
235      stats(sample_data, samples, len);
236      len = len + (max_len/steps);
237      free(str);
238    }
239    break;
240
241  case 3:
242    printf("# pattern: \"(a*)*|b*\"\n");
243    printf("# string:  \"aaaaaa...b\"\n");
244    len = 0;
245    tre_regcomp(&reobj, "(a*)*|b*", REG_EXTENDED);
246    while (len <= max_len) {
247      str = malloc(sizeof(char) * (len+1));
248      for (i = 0; i < len-1; i++)
249	str[i] = 'a';
250      if (len > 0)
251	str[len-1] = 'b';
252      str[len] = '\0';
253
254      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
255      stats(sample_data, samples, len);
256      len = len + (max_len/steps);
257      free(str);
258    }
259    break;
260
261  case 4:
262    printf("# pattern: \"(a|a|a|...|a)\"\n");
263    printf("# string:  \"aaaaaa...\"\n");
264    len = 1024*1024;
265    str = malloc(sizeof(char) * (len+1));
266    for (i = 0; i < len-1; i++)
267      str[i] = 'a';
268    str[len] = '\0';
269    len = 0;
270    while (len <= max_len) {
271      tmpbuf[0] = '(';
272      for (i = 1; i < (len*2); i++) {
273	tmpbuf[i] = 'a';
274	if (i < len*2-2) {
275	  i++;
276	  tmpbuf[i] = '|';
277	}
278      }
279      printf("# i = %d\n", i);
280      tmpbuf[i] = ')';
281      tmpbuf[i+1] = '*';
282      tmpbuf[i+2] = '\0';
283      printf("# pat = %s\n", tmpbuf);
284      tre_regcomp(&reobj, tmpbuf, REG_EXTENDED);
285
286      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
287      stats(sample_data, samples, len);
288      len = len + (max_len/steps);
289      tre_regfree(&reobj);
290    }
291    free(str);
292    break;
293
294  case 5:
295    printf("# pattern: \"foobar\"\n");
296    printf("# string:  \"aaaaaa...foobar\"\n");
297    len = 0;
298    tre_regcomp(&reobj, "foobar", REG_EXTENDED);
299    while (len <= max_len) {
300      str = malloc(sizeof(char) * (len+7));
301      for (i = 0; i < len; i++) {
302	if (i*i % 3)
303	  str[i] = 'a';
304	else
305	  str[i] = 'a';
306      }
307      str[len+0] = 'f';
308      str[len+1] = 'o';
309      str[len+2] = 'o';
310      str[len+3] = 'b';
311      str[len+4] = 'a';
312      str[len+5] = 'r';
313      str[len+6] = '\0';
314
315      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
316      stats(sample_data, samples, len);
317      len = len + (max_len/steps);
318      free(str);
319    }
320    break;
321
322
323  case 6:
324    printf("# pattern: \"a*foobar\"\n");
325    printf("# string:  \"aaaaaa...foobar\"\n");
326    len = 0;
327    tre_regcomp(&reobj, "a*foobar", REG_EXTENDED);
328    while (len <= max_len) {
329      str = malloc(sizeof(char) * (len+7));
330      for (i = 0; i < len; i++) {
331	str[i] = 'a';
332      }
333      str[len+0] = 'f';
334      str[len+1] = 'o';
335      str[len+2] = 'o';
336      str[len+3] = 'b';
337      str[len+4] = 'a';
338      str[len+5] = 'r';
339      str[len+6] = '\0';
340
341      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
342      stats(sample_data, samples, len);
343      len = len + (max_len/steps);
344      free(str);
345    }
346    break;
347
348
349  case 7:
350    printf("# pattern: \"(a)*foobar\"\n");
351    printf("# string:  \"aaaaabbaaab...foobar\"\n");
352    len = 0;
353    tre_regcomp(&reobj, "(a)*foobar", REG_EXTENDED);
354    while (len <= max_len) {
355      str = malloc(sizeof(char) * (len+7));
356      for (i = 0; i < len; i++) {
357	/* Without this GNU regex won't find a match! */
358	if (i*(i-1) % 3)
359	  str[i] = 'b';
360	else
361	  str[i] = 'a';
362      }
363      str[len+0] = 'f';
364      str[len+1] = 'o';
365      str[len+2] = 'o';
366      str[len+3] = 'b';
367      str[len+4] = 'a';
368      str[len+5] = 'r';
369      str[len+6] = '\0';
370
371      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
372      stats(sample_data, samples, len);
373      len = len + (max_len/steps);
374      free(str);
375    }
376    break;
377
378
379  case 8:
380    printf("# pattern: \"(a|b)*foobar\"\n");
381    printf("# string:  \"aaaaabbaaab...foobar\"\n");
382    len = 0;
383    tre_regcomp(&reobj, "(a|b)*foobar", REG_EXTENDED);
384    while (len <= max_len) {
385      str = malloc(sizeof(char) * (len+7));
386      for (i = 0; i < len; i++) {
387	if (i*(i-1) % 3)
388	  str[i] = 'b';
389	else
390	  str[i] = 'a';
391	/* Without this GNU regex won't find a match! */
392	if (i % (1024*1024*10 - 100))
393	  str[i] = 'f';
394      }
395      str[len+0] = 'f';
396      str[len+1] = 'o';
397      str[len+2] = 'o';
398      str[len+3] = 'b';
399      str[len+4] = 'a';
400      str[len+5] = 'r';
401      str[len+6] = '\0';
402
403      run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
404      stats(sample_data, samples, len);
405      len = len + (max_len/steps);
406      free(str);
407    }
408    break;
409
410
411  case 9:
412    printf("# pattern: hand-coded a*\n");
413    printf("# string:  \"aaaaaa...\"\n");
414    len = 0;
415    while (len <= max_len) {
416      printf("# len = %d\n", len);
417
418      str = malloc(sizeof(char)*(len+1));
419      for (i = 0; i < len; i++)
420	str[i] = 'a';
421      str[len-1] = '\0';
422
423      for (i = 0; i < samples; i++) {
424	c1 = clock();
425	for (j = 0; j < repeats; j++) {
426	  char *s;
427	  int l;
428
429	  s = str;
430	  l = 0;
431
432
433	  while (s != '\0') {
434	    if (*s == 'a') {
435	      s++;
436	      l++;
437	    } else
438	      break;
439	  }
440	}
441      	c2 = clock();
442	sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
443
444	printf("# sample: %.5f sec, clocks: %ld\n",
445	       (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
446	       (long)(c2-c1));
447	fflush(stdout);
448      }
449      fflush(stdout);
450
451      stats(sample_data, samples, len);
452      len = len + (max_len/steps);
453      free(str);
454    }
455    break;
456
457
458  default:
459    printf("Pelle.\n");
460    return 1;
461  }
462
463  tre_regfree(&reobj);
464
465  return 0;
466}
467