strfile.c revision 181386
12490Sjkh/*- 22490Sjkh * Copyright (c) 1989, 1993 32490Sjkh * The Regents of the University of California. All rights reserved. 42490Sjkh * 52490Sjkh * This code is derived from software contributed to Berkeley by 62490Sjkh * Ken Arnold. 72490Sjkh * 82490Sjkh * Redistribution and use in source and binary forms, with or without 92490Sjkh * modification, are permitted provided that the following conditions 102490Sjkh * are met: 112490Sjkh * 1. Redistributions of source code must retain the above copyright 122490Sjkh * notice, this list of conditions and the following disclaimer. 132490Sjkh * 2. Redistributions in binary form must reproduce the above copyright 142490Sjkh * notice, this list of conditions and the following disclaimer in the 152490Sjkh * documentation and/or other materials provided with the distribution. 162490Sjkh * 3. All advertising materials mentioning features or use of this software 172490Sjkh * must display the following acknowledgement: 182490Sjkh * This product includes software developed by the University of 192490Sjkh * California, Berkeley and its contributors. 202490Sjkh * 4. Neither the name of the University nor the names of its contributors 212490Sjkh * may be used to endorse or promote products derived from this software 222490Sjkh * without specific prior written permission. 232490Sjkh * 242490Sjkh * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 252490Sjkh * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 262490Sjkh * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 272490Sjkh * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 282490Sjkh * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 292490Sjkh * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 302490Sjkh * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 312490Sjkh * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 322490Sjkh * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 332490Sjkh * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 342490Sjkh * SUCH DAMAGE. 352490Sjkh */ 362490Sjkh 37114725Sobrien#if 0 382490Sjkh#ifndef lint 3915944Sachestatic const char copyright[] = 402490Sjkh"@(#) Copyright (c) 1989, 1993\n\ 412490Sjkh The Regents of the University of California. All rights reserved.\n"; 422490Sjkh#endif /* not lint */ 432490Sjkh 442490Sjkh#ifndef lint 4515944Sachestatic const char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 5/31/93"; 46114725Sobrien#endif /* not lint */ 4751287Speter#endif 48114725Sobrien#include <sys/cdefs.h> 49114725Sobrien__FBSDID("$FreeBSD: head/games/fortune/strfile/strfile.c 181386 2008-08-07 20:05:51Z ache $"); 502490Sjkh 512490Sjkh# include <sys/param.h> 52142022Sru# include <sys/endian.h> 532490Sjkh# include <stdio.h> 5415944Sache# include <stdlib.h> 552490Sjkh# include <ctype.h> 5615944Sache# include <string.h> 5715944Sache# include <time.h> 5815944Sache# include <locale.h> 5915944Sache# include <unistd.h> 602490Sjkh# include "strfile.h" 612490Sjkh 622490Sjkh/* 6372089Sasmodai * This program takes a file composed of strings separated by 642490Sjkh * lines starting with two consecutive delimiting character (default 652490Sjkh * character is '%') and creates another file which consists of a table 662490Sjkh * describing the file (structure from "strfile.h"), a table of seek 672490Sjkh * pointers to the start of the strings, and the strings, each terminated 682490Sjkh * by a null byte. Usage: 692490Sjkh * 702490Sjkh * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 712490Sjkh * 7251864Sdcs * C - Allow comments marked by a double delimiter at line's beginning 732490Sjkh * c - Change delimiting character from '%' to 'C' 742490Sjkh * s - Silent. Give no summary of data processed at the end of 752490Sjkh * the run. 762490Sjkh * o - order the strings in alphabetic order 778856Srgrimes * i - if ordering, ignore case 782490Sjkh * r - randomize the order of the strings 792490Sjkh * x - set rotated bit 802490Sjkh * 812490Sjkh * Ken Arnold Sept. 7, 1978 -- 822490Sjkh * 832490Sjkh * Added ordering options. 842490Sjkh */ 852490Sjkh 862490Sjkh# define TRUE 1 872490Sjkh# define FALSE 0 882490Sjkh 892490Sjkh# define STORING_PTRS (Oflag || Rflag) 902490Sjkh# define CHUNKSIZE 512 912490Sjkh 9215944Sache# define ALLOC(ptr,sz) { \ 932490Sjkh if (ptr == NULL) \ 94142022Sru ptr = malloc(CHUNKSIZE * sizeof *ptr); \ 952490Sjkh else if (((sz) + 1) % CHUNKSIZE == 0) \ 96142022Sru ptr = realloc(ptr, ((sz) + CHUNKSIZE) * sizeof *ptr); \ 972490Sjkh if (ptr == NULL) { \ 982490Sjkh fprintf(stderr, "out of space\n"); \ 992490Sjkh exit(1); \ 1002490Sjkh } \ 10115944Sache } 1022490Sjkh 1032490Sjkh#ifdef NO_VOID 1042490Sjkh# define void char 1052490Sjkh#endif 1062490Sjkh 1072490Sjkhtypedef struct { 108123243Sdes int first; 109142022Sru off_t pos; 1102490Sjkh} STR; 1112490Sjkh 112123243Sdesstatic char *Infile = NULL, /* input file name */ 113123243Sdes Outfile[MAXPATHLEN] = "", /* output file name */ 114123243Sdes Delimch = '%'; /* delimiting character */ 1152490Sjkh 116123243Sdesstatic int Cflag = FALSE; /* embedded comments */ 117123243Sdesstatic int Sflag = FALSE; /* silent run flag */ 118123243Sdesstatic int Oflag = FALSE; /* ordering flag */ 119123243Sdesstatic int Iflag = FALSE; /* ignore case flag */ 120123243Sdesstatic int Rflag = FALSE; /* randomize order flag */ 121123243Sdesstatic int Xflag = FALSE; /* set rotated bit */ 122142022Srustatic uint32_t Num_pts = 0; /* number of pointers/strings */ 1232490Sjkh 124142022Srustatic off_t *Seekpts; 1252490Sjkh 126123243Sdesstatic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 1272490Sjkh 128123243Sdesstatic STRFILE Tbl; /* statistics table */ 1292490Sjkh 130123243Sdesstatic STR *Firstch; /* first chars of each string */ 1312490Sjkh 132142022Srustatic void add_offset(FILE *, off_t); 133123243Sdesstatic int cmp_str(const void *, const void *); 134118397Sachestatic int stable_collate_range_cmp(int, int); 135123243Sdesstatic void do_order(void); 136123243Sdesstatic void getargs(int, char **); 137123243Sdesstatic void randomize(void); 138123243Sdesstatic void usage(void); 1392490Sjkh 1402490Sjkh/* 1412490Sjkh * main: 1422490Sjkh * Drive the sucker. There are two main modes -- either we store 1432490Sjkh * the seek pointers, if the table is to be sorted or randomized, 1442490Sjkh * or we write the pointer directly to the file, if we are to stay 1452490Sjkh * in file order. If the former, we allocate and re-allocate in 1462490Sjkh * CHUNKSIZE blocks; if the latter, we just write each pointer, 1472490Sjkh * and then seek back to the beginning to write in the table. 1482490Sjkh */ 149132578Sleint main(int ac, char *av[]) 1502490Sjkh{ 15153210Sbillf char *sp, dc; 15253210Sbillf FILE *inf, *outf; 153142022Sru off_t last_off, pos, *p; 154142022Sru size_t length; 155142022Sru int first; 156142022Sru uint32_t cnt; 15753210Sbillf char *nsp; 15853210Sbillf STR *fp; 1592490Sjkh static char string[257]; 1602490Sjkh 16116140Sache (void) setlocale(LC_ALL, ""); 16215944Sache 1632490Sjkh getargs(ac, av); /* evalute arguments */ 1642490Sjkh dc = Delimch; 1652490Sjkh if ((inf = fopen(Infile, "r")) == NULL) { 1662490Sjkh perror(Infile); 1672490Sjkh exit(1); 1682490Sjkh } 1692490Sjkh 1702490Sjkh if ((outf = fopen(Outfile, "w")) == NULL) { 1712490Sjkh perror(Outfile); 1722490Sjkh exit(1); 1732490Sjkh } 1742490Sjkh if (!STORING_PTRS) 17515944Sache (void) fseek(outf, (long) sizeof Tbl, 0); 1762490Sjkh 1772490Sjkh /* 1782490Sjkh * Write the strings onto the file 1792490Sjkh */ 1802490Sjkh 1812490Sjkh Tbl.str_longlen = 0; 182142022Sru Tbl.str_shortlen = 0xffffffff; 1832490Sjkh Tbl.str_delim = dc; 1842490Sjkh Tbl.str_version = VERSION; 1852490Sjkh first = Oflag; 186142022Sru add_offset(outf, ftello(inf)); 1872490Sjkh last_off = 0; 1882490Sjkh do { 1892490Sjkh sp = fgets(string, 256, inf); 19015944Sache if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { 191142022Sru pos = ftello(inf); 192142022Sru length = (size_t)(pos - last_off) - 193142022Sru (sp != NULL ? strlen(sp) : 0); 1942490Sjkh last_off = pos; 195142022Sru if (length == 0) 1962490Sjkh continue; 1972490Sjkh add_offset(outf, pos); 198142022Sru if ((size_t)Tbl.str_longlen < length) 1992490Sjkh Tbl.str_longlen = length; 200142022Sru if ((size_t)Tbl.str_shortlen > length) 2012490Sjkh Tbl.str_shortlen = length; 2022490Sjkh first = Oflag; 2032490Sjkh } 2042490Sjkh else if (first) { 20515944Sache for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) 2062490Sjkh continue; 2072490Sjkh ALLOC(Firstch, Num_pts); 2082490Sjkh fp = &Firstch[Num_pts - 1]; 20915944Sache if (Iflag && isupper((unsigned char)*nsp)) 21015944Sache fp->first = tolower((unsigned char)*nsp); 2112490Sjkh else 2122490Sjkh fp->first = *nsp; 2132490Sjkh fp->pos = Seekpts[Num_pts - 1]; 2142490Sjkh first = FALSE; 2152490Sjkh } 2162490Sjkh } while (sp != NULL); 2172490Sjkh 2182490Sjkh /* 2192490Sjkh * write the tables in 2202490Sjkh */ 2212490Sjkh 2222490Sjkh (void) fclose(inf); 22333633Ssteve Tbl.str_numstr = Num_pts - 1; 2242490Sjkh 22551864Sdcs if (Cflag) 22651864Sdcs Tbl.str_flags |= STR_COMMENTS; 22751864Sdcs 2282490Sjkh if (Oflag) 2292490Sjkh do_order(); 2302490Sjkh else if (Rflag) 2312490Sjkh randomize(); 2322490Sjkh 2332490Sjkh if (Xflag) 2342490Sjkh Tbl.str_flags |= STR_ROTATED; 2352490Sjkh 2362490Sjkh if (!Sflag) { 2372490Sjkh printf("\"%s\" created\n", Outfile); 2382490Sjkh if (Num_pts == 2) 2392490Sjkh puts("There was 1 string"); 2402490Sjkh else 241142022Sru printf("There were %u strings\n", Num_pts - 1); 242142022Sru printf("Longest string: %u byte%s\n", Tbl.str_longlen, 2432490Sjkh Tbl.str_longlen == 1 ? "" : "s"); 244142022Sru printf("Shortest string: %u byte%s\n", Tbl.str_shortlen, 2452490Sjkh Tbl.str_shortlen == 1 ? "" : "s"); 2462490Sjkh } 2472490Sjkh 24815944Sache rewind(outf); 249142022Sru Tbl.str_version = htobe32(Tbl.str_version); 250142022Sru Tbl.str_numstr = htobe32(Tbl.str_numstr); 251142022Sru Tbl.str_longlen = htobe32(Tbl.str_longlen); 252142022Sru Tbl.str_shortlen = htobe32(Tbl.str_shortlen); 253142022Sru Tbl.str_flags = htobe32(Tbl.str_flags); 2542490Sjkh (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 2552490Sjkh if (STORING_PTRS) { 2562490Sjkh for (p = Seekpts, cnt = Num_pts; cnt--; ++p) 257142022Sru *p = htobe64(*p); 258142022Sru (void) fwrite(Seekpts, sizeof *Seekpts, (size_t) Num_pts, outf); 2592490Sjkh } 2602490Sjkh (void) fclose(outf); 2612490Sjkh exit(0); 2622490Sjkh} 2632490Sjkh 2642490Sjkh/* 2652490Sjkh * This routine evaluates arguments from the command line 2662490Sjkh */ 26715944Sachevoid getargs(argc, argv) 2682490Sjkhint argc; 2692490Sjkhchar **argv; 2702490Sjkh{ 2712490Sjkh int ch; 2722490Sjkh 273176407Sru while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1) 2742490Sjkh switch(ch) { 27551864Sdcs case 'C': /* embedded comments */ 27651864Sdcs Cflag++; 27751864Sdcs break; 2782490Sjkh case 'c': /* new delimiting char */ 2792490Sjkh Delimch = *optarg; 2802490Sjkh if (!isascii(Delimch)) { 2812490Sjkh printf("bad delimiting character: '\\%o\n'", 28215944Sache (unsigned char)Delimch); 2832490Sjkh } 2842490Sjkh break; 2852490Sjkh case 'i': /* ignore case in ordering */ 2862490Sjkh Iflag++; 2872490Sjkh break; 2882490Sjkh case 'o': /* order strings */ 2892490Sjkh Oflag++; 2902490Sjkh break; 2912490Sjkh case 'r': /* randomize pointers */ 2922490Sjkh Rflag++; 2932490Sjkh break; 2942490Sjkh case 's': /* silent */ 2952490Sjkh Sflag++; 2962490Sjkh break; 2972490Sjkh case 'x': /* set the rotated bit */ 2982490Sjkh Xflag++; 2992490Sjkh break; 3002490Sjkh case '?': 3012490Sjkh default: 3022490Sjkh usage(); 3032490Sjkh } 3042490Sjkh argv += optind; 3052490Sjkh 3062490Sjkh if (*argv) { 3072490Sjkh Infile = *argv; 3082490Sjkh if (*++argv) 3092490Sjkh (void) strcpy(Outfile, *argv); 3102490Sjkh } 3112490Sjkh if (!Infile) { 3122490Sjkh puts("No input file name"); 3132490Sjkh usage(); 3142490Sjkh } 3152490Sjkh if (*Outfile == '\0') { 3162490Sjkh (void) strcpy(Outfile, Infile); 3172490Sjkh (void) strcat(Outfile, ".dat"); 3182490Sjkh } 3192490Sjkh} 3202490Sjkh 32115944Sachevoid usage() 3222490Sjkh{ 3232490Sjkh (void) fprintf(stderr, 324141581Sru "strfile [-Ciorsx] [-c char] source_file [output_file]\n"); 3252490Sjkh exit(1); 3262490Sjkh} 3272490Sjkh 3282490Sjkh/* 3292490Sjkh * add_offset: 3302490Sjkh * Add an offset to the list, or write it out, as appropriate. 3312490Sjkh */ 33215944Sachevoid add_offset(fp, off) 3332490SjkhFILE *fp; 334142022Sruoff_t off; 3352490Sjkh{ 336142022Sru off_t beoff; 3372490Sjkh 3382490Sjkh if (!STORING_PTRS) { 339142022Sru beoff = htobe64(off); 340142022Sru fwrite(&beoff, 1, sizeof beoff, fp); 3412490Sjkh } else { 3422490Sjkh ALLOC(Seekpts, Num_pts + 1); 3432490Sjkh Seekpts[Num_pts] = off; 3442490Sjkh } 3452490Sjkh Num_pts++; 3462490Sjkh} 3472490Sjkh 3482490Sjkh/* 3492490Sjkh * do_order: 3502490Sjkh * Order the strings alphabetically (possibly ignoring case). 3512490Sjkh */ 35215944Sachevoid do_order() 3532490Sjkh{ 354142022Sru uint32_t i; 355142022Sru off_t *lp; 35653210Sbillf STR *fp; 3572490Sjkh 3582490Sjkh Sort_1 = fopen(Infile, "r"); 3592490Sjkh Sort_2 = fopen(Infile, "r"); 360142022Sru qsort(Firstch, (size_t) Tbl.str_numstr, sizeof *Firstch, cmp_str); 3612490Sjkh i = Tbl.str_numstr; 3622490Sjkh lp = Seekpts; 3632490Sjkh fp = Firstch; 3642490Sjkh while (i--) 3652490Sjkh *lp++ = fp++->pos; 3662490Sjkh (void) fclose(Sort_1); 3672490Sjkh (void) fclose(Sort_2); 3682490Sjkh Tbl.str_flags |= STR_ORDERED; 3692490Sjkh} 3702490Sjkh 371118397Sachestatic int stable_collate_range_cmp(c1, c2) 37219288Sache int c1, c2; 37319288Sache{ 37419288Sache static char s1[2], s2[2]; 37519288Sache int ret; 37619288Sache 37719288Sache s1[0] = c1; 37819288Sache s2[0] = c2; 37919288Sache if ((ret = strcoll(s1, s2)) != 0) 38019288Sache return (ret); 38119288Sache return (c1 - c2); 38219288Sache} 38319288Sache 3842490Sjkh/* 3852490Sjkh * cmp_str: 3862490Sjkh * Compare two strings in the file 3872490Sjkh */ 38862488Sbillfint cmp_str(s1, s2) 38962488Sbillfconst void *s1, *s2; 3902490Sjkh{ 39162488Sbillf const STR *p1, *p2; 39253210Sbillf int c1, c2; 39353210Sbillf int n1, n2; 39415944Sache int r; 3952490Sjkh 3962490Sjkh# define SET_N(nf,ch) (nf = (ch == '\n')) 39715944Sache# define IS_END(ch,nf) (ch == EOF || (ch == (unsigned char) Delimch && nf)) 3982490Sjkh 39962671Sbillf p1 = (const STR *) s1; 40062671Sbillf p2 = (const STR *) s2; 40162488Sbillf 40216262Sache c1 = (unsigned char) p1->first; 40316262Sache c2 = (unsigned char) p2->first; 404118397Sache if ((r = stable_collate_range_cmp(c1, c2)) != 0) 405118397Sache return (r); 4062490Sjkh 407142022Sru (void) fseeko(Sort_1, p1->pos, 0); 408142022Sru (void) fseeko(Sort_2, p2->pos, 0); 4092490Sjkh 4102490Sjkh n1 = FALSE; 4112490Sjkh n2 = FALSE; 41215944Sache while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF) 4132490Sjkh SET_N(n1, c1); 41415944Sache while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF) 4152490Sjkh SET_N(n2, c2); 4162490Sjkh 4172490Sjkh while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 4182490Sjkh if (Iflag) { 4192490Sjkh if (isupper(c1)) 4202490Sjkh c1 = tolower(c1); 4212490Sjkh if (isupper(c2)) 4222490Sjkh c2 = tolower(c2); 4232490Sjkh } 424118397Sache if ((r = stable_collate_range_cmp(c1, c2)) != 0) 425118397Sache return (r); 4262490Sjkh SET_N(n1, c1); 4272490Sjkh SET_N(n2, c2); 4282490Sjkh c1 = getc(Sort_1); 4292490Sjkh c2 = getc(Sort_2); 4302490Sjkh } 4312490Sjkh if (IS_END(c1, n1)) 4322490Sjkh c1 = 0; 4332490Sjkh if (IS_END(c2, n2)) 4342490Sjkh c2 = 0; 435118397Sache return (stable_collate_range_cmp(c1, c2)); 4362490Sjkh} 4372490Sjkh 4382490Sjkh/* 4392490Sjkh * randomize: 4402490Sjkh * Randomize the order of the string table. We must be careful 4412490Sjkh * not to randomize across delimiter boundaries. All 4422490Sjkh * randomization is done within each block. 4432490Sjkh */ 44415944Sachevoid randomize() 4452490Sjkh{ 446142022Sru uint32_t cnt, i; 447142022Sru off_t tmp; 448142022Sru off_t *sp; 4492490Sjkh 4502490Sjkh Tbl.str_flags |= STR_RANDOM; 4512490Sjkh cnt = Tbl.str_numstr; 4522490Sjkh 4532490Sjkh /* 4542490Sjkh * move things around randomly 4552490Sjkh */ 4562490Sjkh 4572490Sjkh for (sp = Seekpts; cnt > 0; cnt--, sp++) { 458181386Sache i = arc4random_uniform(cnt); 4592490Sjkh tmp = sp[0]; 4602490Sjkh sp[0] = sp[i]; 4612490Sjkh sp[i] = tmp; 4622490Sjkh } 4632490Sjkh} 464