strfile.c revision 19288
12490Sjkh/*- 22490Sjkh * Copyright (c) 1989, 1993 32490Sjkh * The Regents of the University of California. All rights reserved. 42490Sjkh * 52490Sjkh * This code is derived from software contributed to Berkeley by 62490Sjkh * Ken Arnold. 72490Sjkh * 82490Sjkh * Redistribution and use in source and binary forms, with or without 92490Sjkh * modification, are permitted provided that the following conditions 102490Sjkh * are met: 112490Sjkh * 1. Redistributions of source code must retain the above copyright 122490Sjkh * notice, this list of conditions and the following disclaimer. 132490Sjkh * 2. Redistributions in binary form must reproduce the above copyright 142490Sjkh * notice, this list of conditions and the following disclaimer in the 152490Sjkh * documentation and/or other materials provided with the distribution. 162490Sjkh * 3. All advertising materials mentioning features or use of this software 172490Sjkh * must display the following acknowledgement: 182490Sjkh * This product includes software developed by the University of 192490Sjkh * California, Berkeley and its contributors. 202490Sjkh * 4. Neither the name of the University nor the names of its contributors 212490Sjkh * may be used to endorse or promote products derived from this software 222490Sjkh * without specific prior written permission. 232490Sjkh * 242490Sjkh * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 252490Sjkh * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 262490Sjkh * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 272490Sjkh * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 282490Sjkh * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 292490Sjkh * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 302490Sjkh * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 312490Sjkh * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 322490Sjkh * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 332490Sjkh * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 342490Sjkh * SUCH DAMAGE. 352490Sjkh */ 362490Sjkh 372490Sjkh#ifndef lint 3815944Sachestatic const char copyright[] = 392490Sjkh"@(#) Copyright (c) 1989, 1993\n\ 402490Sjkh The Regents of the University of California. All rights reserved.\n"; 412490Sjkh#endif /* not lint */ 422490Sjkh 432490Sjkh#ifndef lint 4415944Sachestatic const char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 5/31/93"; 452490Sjkh#endif /* not lint */ 462490Sjkh 472490Sjkh# include <sys/param.h> 482490Sjkh# include <stdio.h> 4915944Sache# include <stdlib.h> 502490Sjkh# include <ctype.h> 5115944Sache# include <string.h> 5215944Sache# include <time.h> 5315944Sache# include <locale.h> 5415944Sache# include <unistd.h> 552490Sjkh# include "strfile.h" 562490Sjkh 572490Sjkh/* 582490Sjkh * This program takes a file composed of strings seperated by 592490Sjkh * lines starting with two consecutive delimiting character (default 602490Sjkh * character is '%') and creates another file which consists of a table 612490Sjkh * describing the file (structure from "strfile.h"), a table of seek 622490Sjkh * pointers to the start of the strings, and the strings, each terminated 632490Sjkh * by a null byte. Usage: 642490Sjkh * 652490Sjkh * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 662490Sjkh * 672490Sjkh * c - Change delimiting character from '%' to 'C' 682490Sjkh * s - Silent. Give no summary of data processed at the end of 692490Sjkh * the run. 702490Sjkh * o - order the strings in alphabetic order 718856Srgrimes * i - if ordering, ignore case 722490Sjkh * r - randomize the order of the strings 732490Sjkh * x - set rotated bit 742490Sjkh * 752490Sjkh * Ken Arnold Sept. 7, 1978 -- 762490Sjkh * 772490Sjkh * Added ordering options. 782490Sjkh */ 792490Sjkh 802490Sjkh# define TRUE 1 812490Sjkh# define FALSE 0 822490Sjkh 832490Sjkh# define STORING_PTRS (Oflag || Rflag) 842490Sjkh# define CHUNKSIZE 512 852490Sjkh 8615944Sache# define ALLOC(ptr,sz) { \ 872490Sjkh if (ptr == NULL) \ 882490Sjkh ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 892490Sjkh else if (((sz) + 1) % CHUNKSIZE == 0) \ 902490Sjkh ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 912490Sjkh if (ptr == NULL) { \ 922490Sjkh fprintf(stderr, "out of space\n"); \ 932490Sjkh exit(1); \ 942490Sjkh } \ 9515944Sache } 962490Sjkh 972490Sjkh#ifdef NO_VOID 982490Sjkh# define void char 992490Sjkh#endif 1002490Sjkh 1012490Sjkhtypedef struct { 1022490Sjkh char first; 10315944Sache long pos; 1042490Sjkh} STR; 1052490Sjkh 1062490Sjkhchar *Infile = NULL, /* input file name */ 1072490Sjkh Outfile[MAXPATHLEN] = "", /* output file name */ 1082490Sjkh Delimch = '%'; /* delimiting character */ 1092490Sjkh 1102490Sjkhint Sflag = FALSE; /* silent run flag */ 1112490Sjkhint Oflag = FALSE; /* ordering flag */ 1122490Sjkhint Iflag = FALSE; /* ignore case flag */ 1132490Sjkhint Rflag = FALSE; /* randomize order flag */ 1142490Sjkhint Xflag = FALSE; /* set rotated bit */ 1152490Sjkhlong Num_pts = 0; /* number of pointers/strings */ 1162490Sjkh 11715944Sachelong *Seekpts; 1182490Sjkh 1192490SjkhFILE *Sort_1, *Sort_2; /* pointers for sorting */ 1202490Sjkh 1212490SjkhSTRFILE Tbl; /* statistics table */ 1222490Sjkh 1232490SjkhSTR *Firstch; /* first chars of each string */ 1242490Sjkh 12515944Sachevoid getargs(), add_offset(), do_order(), randomize(), usage(); 12615944Sacheint cmp_str(); 1272490Sjkh 1282490Sjkh/* 1292490Sjkh * main: 1302490Sjkh * Drive the sucker. There are two main modes -- either we store 1312490Sjkh * the seek pointers, if the table is to be sorted or randomized, 1322490Sjkh * or we write the pointer directly to the file, if we are to stay 1332490Sjkh * in file order. If the former, we allocate and re-allocate in 1342490Sjkh * CHUNKSIZE blocks; if the latter, we just write each pointer, 1352490Sjkh * and then seek back to the beginning to write in the table. 1362490Sjkh */ 13715944Sachevoid main(ac, av) 1382490Sjkhint ac; 1392490Sjkhchar **av; 1402490Sjkh{ 1412490Sjkh register char *sp, dc; 1422490Sjkh register FILE *inf, *outf; 14315944Sache register long last_off, length, pos, *p; 1442490Sjkh register int first, cnt; 1452490Sjkh register char *nsp; 1462490Sjkh register STR *fp; 1472490Sjkh static char string[257]; 1482490Sjkh 14916140Sache (void) setlocale(LC_ALL, ""); 15015944Sache 1512490Sjkh getargs(ac, av); /* evalute arguments */ 1522490Sjkh dc = Delimch; 1532490Sjkh if ((inf = fopen(Infile, "r")) == NULL) { 1542490Sjkh perror(Infile); 1552490Sjkh exit(1); 1562490Sjkh } 1572490Sjkh 1582490Sjkh if ((outf = fopen(Outfile, "w")) == NULL) { 1592490Sjkh perror(Outfile); 1602490Sjkh exit(1); 1612490Sjkh } 1622490Sjkh if (!STORING_PTRS) 16315944Sache (void) fseek(outf, (long) sizeof Tbl, 0); 1642490Sjkh 1652490Sjkh /* 1662490Sjkh * Write the strings onto the file 1672490Sjkh */ 1682490Sjkh 1692490Sjkh Tbl.str_longlen = 0; 17015944Sache Tbl.str_shortlen = ~((unsigned long) 0); 1712490Sjkh Tbl.str_delim = dc; 1722490Sjkh Tbl.str_version = VERSION; 1732490Sjkh first = Oflag; 1742490Sjkh add_offset(outf, ftell(inf)); 1752490Sjkh last_off = 0; 1762490Sjkh do { 1772490Sjkh sp = fgets(string, 256, inf); 17815944Sache if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { 1792490Sjkh pos = ftell(inf); 1802490Sjkh length = pos - last_off - (sp ? strlen(sp) : 0); 1812490Sjkh last_off = pos; 1822490Sjkh if (!length) 1832490Sjkh continue; 1842490Sjkh add_offset(outf, pos); 1852490Sjkh if (Tbl.str_longlen < length) 1862490Sjkh Tbl.str_longlen = length; 1872490Sjkh if (Tbl.str_shortlen > length) 1882490Sjkh Tbl.str_shortlen = length; 1892490Sjkh first = Oflag; 1902490Sjkh } 1912490Sjkh else if (first) { 19215944Sache for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) 1932490Sjkh continue; 1942490Sjkh ALLOC(Firstch, Num_pts); 1952490Sjkh fp = &Firstch[Num_pts - 1]; 19615944Sache if (Iflag && isupper((unsigned char)*nsp)) 19715944Sache fp->first = tolower((unsigned char)*nsp); 1982490Sjkh else 1992490Sjkh fp->first = *nsp; 2002490Sjkh fp->pos = Seekpts[Num_pts - 1]; 2012490Sjkh first = FALSE; 2022490Sjkh } 2032490Sjkh } while (sp != NULL); 2042490Sjkh 2052490Sjkh /* 2062490Sjkh * write the tables in 2072490Sjkh */ 2082490Sjkh 2092490Sjkh (void) fclose(inf); 2102490Sjkh 2112490Sjkh if (Oflag) 2122490Sjkh do_order(); 2132490Sjkh else if (Rflag) 2142490Sjkh randomize(); 2152490Sjkh 2162490Sjkh if (Xflag) 2172490Sjkh Tbl.str_flags |= STR_ROTATED; 2182490Sjkh 2192490Sjkh if (!Sflag) { 2202490Sjkh printf("\"%s\" created\n", Outfile); 2212490Sjkh if (Num_pts == 2) 2222490Sjkh puts("There was 1 string"); 2232490Sjkh else 22415944Sache printf("There were %ld strings\n", Num_pts - 1); 2252490Sjkh printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 2262490Sjkh Tbl.str_longlen == 1 ? "" : "s"); 2272490Sjkh printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 2282490Sjkh Tbl.str_shortlen == 1 ? "" : "s"); 2292490Sjkh } 2302490Sjkh 23115944Sache rewind(outf); 2322490Sjkh Tbl.str_version = htonl(Tbl.str_version); 2332490Sjkh Tbl.str_numstr = htonl(Num_pts - 1); 2342490Sjkh Tbl.str_longlen = htonl(Tbl.str_longlen); 2352490Sjkh Tbl.str_shortlen = htonl(Tbl.str_shortlen); 2362490Sjkh Tbl.str_flags = htonl(Tbl.str_flags); 2372490Sjkh (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 2382490Sjkh if (STORING_PTRS) { 2392490Sjkh for (p = Seekpts, cnt = Num_pts; cnt--; ++p) 2402490Sjkh *p = htonl(*p); 2412490Sjkh (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 2422490Sjkh } 2432490Sjkh (void) fclose(outf); 2442490Sjkh exit(0); 2452490Sjkh} 2462490Sjkh 2472490Sjkh/* 2482490Sjkh * This routine evaluates arguments from the command line 2492490Sjkh */ 25015944Sachevoid getargs(argc, argv) 2512490Sjkhint argc; 2522490Sjkhchar **argv; 2532490Sjkh{ 2542490Sjkh extern char *optarg; 2552490Sjkh extern int optind; 2562490Sjkh int ch; 2572490Sjkh 2582490Sjkh while ((ch = getopt(argc, argv, "c:iorsx")) != EOF) 2592490Sjkh switch(ch) { 2602490Sjkh case 'c': /* new delimiting char */ 2612490Sjkh Delimch = *optarg; 2622490Sjkh if (!isascii(Delimch)) { 2632490Sjkh printf("bad delimiting character: '\\%o\n'", 26415944Sache (unsigned char)Delimch); 2652490Sjkh } 2662490Sjkh break; 2672490Sjkh case 'i': /* ignore case in ordering */ 2682490Sjkh Iflag++; 2692490Sjkh break; 2702490Sjkh case 'o': /* order strings */ 2712490Sjkh Oflag++; 2722490Sjkh break; 2732490Sjkh case 'r': /* randomize pointers */ 2742490Sjkh Rflag++; 2752490Sjkh break; 2762490Sjkh case 's': /* silent */ 2772490Sjkh Sflag++; 2782490Sjkh break; 2792490Sjkh case 'x': /* set the rotated bit */ 2802490Sjkh Xflag++; 2812490Sjkh break; 2822490Sjkh case '?': 2832490Sjkh default: 2842490Sjkh usage(); 2852490Sjkh } 2862490Sjkh argv += optind; 2872490Sjkh 2882490Sjkh if (*argv) { 2892490Sjkh Infile = *argv; 2902490Sjkh if (*++argv) 2912490Sjkh (void) strcpy(Outfile, *argv); 2922490Sjkh } 2932490Sjkh if (!Infile) { 2942490Sjkh puts("No input file name"); 2952490Sjkh usage(); 2962490Sjkh } 2972490Sjkh if (*Outfile == '\0') { 2982490Sjkh (void) strcpy(Outfile, Infile); 2992490Sjkh (void) strcat(Outfile, ".dat"); 3002490Sjkh } 3012490Sjkh} 3022490Sjkh 30315944Sachevoid usage() 3042490Sjkh{ 3052490Sjkh (void) fprintf(stderr, 3062490Sjkh "strfile [-iorsx] [-c char] sourcefile [datafile]\n"); 3072490Sjkh exit(1); 3082490Sjkh} 3092490Sjkh 3102490Sjkh/* 3112490Sjkh * add_offset: 3122490Sjkh * Add an offset to the list, or write it out, as appropriate. 3132490Sjkh */ 31415944Sachevoid add_offset(fp, off) 3152490SjkhFILE *fp; 31615944Sachelong off; 3172490Sjkh{ 31815944Sache long net; 3192490Sjkh 3202490Sjkh if (!STORING_PTRS) { 3212490Sjkh net = htonl(off); 3222490Sjkh fwrite(&net, 1, sizeof net, fp); 3232490Sjkh } else { 3242490Sjkh ALLOC(Seekpts, Num_pts + 1); 3252490Sjkh Seekpts[Num_pts] = off; 3262490Sjkh } 3272490Sjkh Num_pts++; 3282490Sjkh} 3292490Sjkh 3302490Sjkh/* 3312490Sjkh * do_order: 3322490Sjkh * Order the strings alphabetically (possibly ignoring case). 3332490Sjkh */ 33415944Sachevoid do_order() 3352490Sjkh{ 3362490Sjkh register int i; 33715944Sache register long *lp; 3382490Sjkh register STR *fp; 3392490Sjkh 3402490Sjkh Sort_1 = fopen(Infile, "r"); 3412490Sjkh Sort_2 = fopen(Infile, "r"); 3422490Sjkh qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 3432490Sjkh i = Tbl.str_numstr; 3442490Sjkh lp = Seekpts; 3452490Sjkh fp = Firstch; 3462490Sjkh while (i--) 3472490Sjkh *lp++ = fp++->pos; 3482490Sjkh (void) fclose(Sort_1); 3492490Sjkh (void) fclose(Sort_2); 3502490Sjkh Tbl.str_flags |= STR_ORDERED; 3512490Sjkh} 3522490Sjkh 35319288Sache/* static */ int collate_range_cmp (c1, c2) 35419288Sache int c1, c2; 35519288Sache{ 35619288Sache static char s1[2], s2[2]; 35719288Sache int ret; 35819288Sache 35919288Sache c1 &= UCHAR_MAX; 36019288Sache c2 &= UCHAR_MAX; 36119288Sache if (c1 == c2) 36219288Sache return (0); 36319288Sache s1[0] = c1; 36419288Sache s2[0] = c2; 36519288Sache if ((ret = strcoll(s1, s2)) != 0) 36619288Sache return (ret); 36719288Sache return (c1 - c2); 36819288Sache} 36919288Sache 3702490Sjkh/* 3712490Sjkh * cmp_str: 3722490Sjkh * Compare two strings in the file 3732490Sjkh */ 37415944Sacheint cmp_str(p1, p2) 3752490SjkhSTR *p1, *p2; 3762490Sjkh{ 3772490Sjkh register int c1, c2; 3782490Sjkh register int n1, n2; 37915944Sache int r; 3802490Sjkh 3812490Sjkh# define SET_N(nf,ch) (nf = (ch == '\n')) 38215944Sache# define IS_END(ch,nf) (ch == EOF || (ch == (unsigned char) Delimch && nf)) 3832490Sjkh 38416262Sache c1 = (unsigned char) p1->first; 38516262Sache c2 = (unsigned char) p2->first; 38617555Sache if ((r = collate_range_cmp(c1, c2)) != 0) 38715944Sache return r; 3882490Sjkh 3892490Sjkh (void) fseek(Sort_1, p1->pos, 0); 3902490Sjkh (void) fseek(Sort_2, p2->pos, 0); 3912490Sjkh 3922490Sjkh n1 = FALSE; 3932490Sjkh n2 = FALSE; 39415944Sache while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF) 3952490Sjkh SET_N(n1, c1); 39615944Sache while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF) 3972490Sjkh SET_N(n2, c2); 3982490Sjkh 3992490Sjkh while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 4002490Sjkh if (Iflag) { 4012490Sjkh if (isupper(c1)) 4022490Sjkh c1 = tolower(c1); 4032490Sjkh if (isupper(c2)) 4042490Sjkh c2 = tolower(c2); 4052490Sjkh } 40617555Sache if ((r = collate_range_cmp(c1, c2)) != 0) 40715944Sache return r; 4082490Sjkh SET_N(n1, c1); 4092490Sjkh SET_N(n2, c2); 4102490Sjkh c1 = getc(Sort_1); 4112490Sjkh c2 = getc(Sort_2); 4122490Sjkh } 4132490Sjkh if (IS_END(c1, n1)) 4142490Sjkh c1 = 0; 4152490Sjkh if (IS_END(c2, n2)) 4162490Sjkh c2 = 0; 41717555Sache return collate_range_cmp(c1, c2); 4182490Sjkh} 4192490Sjkh 4202490Sjkh/* 4212490Sjkh * randomize: 4222490Sjkh * Randomize the order of the string table. We must be careful 4232490Sjkh * not to randomize across delimiter boundaries. All 4242490Sjkh * randomization is done within each block. 4252490Sjkh */ 42615944Sachevoid randomize() 4272490Sjkh{ 4282490Sjkh register int cnt, i; 42915944Sache register long tmp; 43015944Sache register long *sp; 4312490Sjkh 43215944Sache srandom((int)(time((time_t *) NULL) ^ getpid())); 4332490Sjkh 4342490Sjkh Tbl.str_flags |= STR_RANDOM; 4352490Sjkh cnt = Tbl.str_numstr; 4362490Sjkh 4372490Sjkh /* 4382490Sjkh * move things around randomly 4392490Sjkh */ 4402490Sjkh 4412490Sjkh for (sp = Seekpts; cnt > 0; cnt--, sp++) { 4422490Sjkh i = random() % cnt; 4432490Sjkh tmp = sp[0]; 4442490Sjkh sp[0] = sp[i]; 4452490Sjkh sp[i] = tmp; 4462490Sjkh } 4472490Sjkh} 448