12490Sjkh/*-
22490Sjkh * Copyright (c) 1989, 1993
32490Sjkh *	The Regents of the University of California.  All rights reserved.
42490Sjkh *
52490Sjkh * This code is derived from software contributed to Berkeley by
62490Sjkh * Ken Arnold.
72490Sjkh *
82490Sjkh * Redistribution and use in source and binary forms, with or without
92490Sjkh * modification, are permitted provided that the following conditions
102490Sjkh * are met:
112490Sjkh * 1. Redistributions of source code must retain the above copyright
122490Sjkh *    notice, this list of conditions and the following disclaimer.
132490Sjkh * 2. Redistributions in binary form must reproduce the above copyright
142490Sjkh *    notice, this list of conditions and the following disclaimer in the
152490Sjkh *    documentation and/or other materials provided with the distribution.
16203926Suqs * 3. Neither the name of the University nor the names of its contributors
172490Sjkh *    may be used to endorse or promote products derived from this software
182490Sjkh *    without specific prior written permission.
192490Sjkh *
202490Sjkh * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
212490Sjkh * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
222490Sjkh * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
232490Sjkh * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
242490Sjkh * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
252490Sjkh * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
262490Sjkh * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
272490Sjkh * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
282490Sjkh * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
292490Sjkh * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
302490Sjkh * SUCH DAMAGE.
312490Sjkh */
322490Sjkh
33114725Sobrien#if 0
342490Sjkh#ifndef lint
3515944Sachestatic const char copyright[] =
362490Sjkh"@(#) Copyright (c) 1989, 1993\n\
372490Sjkh	The Regents of the University of California.  All rights reserved.\n";
382490Sjkh#endif /* not lint */
392490Sjkh
402490Sjkh#ifndef lint
4115944Sachestatic const char sccsid[] = "@(#)strfile.c   8.1 (Berkeley) 5/31/93";
42114725Sobrien#endif /* not lint */
4351287Speter#endif
44114725Sobrien#include <sys/cdefs.h>
45114725Sobrien__FBSDID("$FreeBSD: stable/11/usr.bin/fortune/strfile/strfile.c 317438 2017-04-26 14:43:21Z asomers $");
462490Sjkh
47203926Suqs#include <sys/param.h>
48203926Suqs#include <sys/endian.h>
49203926Suqs#include <ctype.h>
50203926Suqs#include <locale.h>
51203926Suqs#include <stdbool.h>
52203926Suqs#include <stdio.h>
53203926Suqs#include <stdlib.h>
54203926Suqs#include <string.h>
55203926Suqs#include <time.h>
56203926Suqs#include <unistd.h>
572490Sjkh
58203926Suqs#include "strfile.h"
59203926Suqs
/*
 *	This program takes a file composed of strings separated by
 * lines consisting of the delimiting character alone (default
 * character is '%') and creates another file which consists of a table
 * describing the file (structure from "strfile.h") followed by a table
 * of seek pointers to the start of the strings.  The strings themselves
 * are not copied; the pointers are offsets into the source file.  Usage:
 *
 *	% strfile [-Ciorsx] [-c char] sourcefile [ datafile ]
 *
 *	C - Allow comments marked by a double delimiter at line's beginning
 *	c - Change delimiting character from '%' to 'char'
 *	s - Silent.  Give no summary of data processed at the end of
 *	    the run.
 *	o - order the strings in alphabetic order
 *	i - if ordering, ignore case
 *	r - randomize the order of the strings
 *	x - set rotated bit
 *
 *		Ken Arnold	Sept. 7, 1978 --
 *
 *	Added ordering options.
 */
832490Sjkh
/* True when seek pointers must be kept in memory to be reordered later. */
#define	STORING_PTRS	(Oflag || Rflag)
/* Allocation granularity, in elements, for the growable tables. */
#define	CHUNKSIZE	512

/*
 * ALLOC:
 *	Make sure the growable array `ptr' has room for the element about
 *	to be stored, where `sz' is the element count supplied by the
 *	caller.  The first call allocates CHUNKSIZE elements; afterwards
 *	the array is grown to (sz + CHUNKSIZE) elements each time sz + 1
 *	reaches a multiple of CHUNKSIZE.  The program exits with status 1
 *	if memory cannot be obtained.
 *
 *	`ptr' must be a modifiable pointer lvalue; it is evaluated more
 *	than once, so it must not have side effects.
 */
#define	ALLOC(ptr, sz)	do { \
			if ((ptr) == NULL) \
				(ptr) = malloc(CHUNKSIZE * sizeof(*(ptr))); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				(ptr) = realloc((ptr), \
				    ((sz) + CHUNKSIZE) * sizeof(*(ptr))); \
			if ((ptr) == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} while (0)
972490Sjkh
/*
 * Sort key for one string, built by main() when ordering is requested
 * and consumed by cmp_str().
 */
typedef struct {
	int	first;		/* leading character recorded for the string */
	off_t	pos;		/* seek pointer of the string in the input */
} STR;
1022490Sjkh
103123243Sdesstatic char	*Infile		= NULL,		/* input file name */
104123243Sdes		Outfile[MAXPATHLEN] = "",	/* output file name */
105123243Sdes		Delimch		= '%';		/* delimiting character */
1062490Sjkh
107203926Suqsstatic int	Cflag		= false;	/* embedded comments */
108203926Suqsstatic int	Sflag		= false;	/* silent run flag */
109203926Suqsstatic int	Oflag		= false;	/* ordering flag */
110203926Suqsstatic int	Iflag		= false;	/* ignore case flag */
111203926Suqsstatic int	Rflag		= false;	/* randomize order flag */
112203926Suqsstatic int	Xflag		= false;	/* set rotated bit */
113142022Srustatic uint32_t	Num_pts		= 0;		/* number of pointers/strings */
1142490Sjkh
115142022Srustatic off_t	*Seekpts;
1162490Sjkh
117123243Sdesstatic FILE	*Sort_1, *Sort_2;		/* pointers for sorting */
1182490Sjkh
119123243Sdesstatic STRFILE	Tbl;				/* statistics table */
1202490Sjkh
121123243Sdesstatic STR	*Firstch;			/* first chars of each string */
1222490Sjkh
/* Forward declarations for the file-local helpers defined below. */
static void	add_offset(FILE *fp, off_t off);
static int	cmp_str(const void *s1, const void *s2);
static int	stable_collate_range_cmp(int c1, int c2);
static void	do_order(void);
static void	getargs(int argc, char **argv);
static void	randomize(void);
static void	usage(void);
1302490Sjkh
1312490Sjkh/*
1322490Sjkh * main:
1332490Sjkh *	Drive the sucker.  There are two main modes -- either we store
1342490Sjkh *	the seek pointers, if the table is to be sorted or randomized,
1352490Sjkh *	or we write the pointer directly to the file, if we are to stay
1362490Sjkh *	in file order.  If the former, we allocate and re-allocate in
1372490Sjkh *	CHUNKSIZE blocks; if the latter, we just write each pointer,
1382490Sjkh *	and then seek back to the beginning to write in the table.
1392490Sjkh */
140201175Sedint
141201175Sedmain(int ac, char *av[])
1422490Sjkh{
143203926Suqs	char *sp, *nsp, dc;
144203926Suqs	FILE *inf, *outf;
145203926Suqs	off_t last_off, pos, *p;
146203926Suqs	size_t length;
147203926Suqs	int first;
148203926Suqs	uint32_t cnt;
149203926Suqs	STR *fp;
150203926Suqs	static char string[257];
1512490Sjkh
152203926Suqs	setlocale(LC_ALL, "");
15315944Sache
1542490Sjkh	getargs(ac, av);		/* evalute arguments */
1552490Sjkh	dc = Delimch;
1562490Sjkh	if ((inf = fopen(Infile, "r")) == NULL) {
1572490Sjkh		perror(Infile);
1582490Sjkh		exit(1);
1592490Sjkh	}
1602490Sjkh
1612490Sjkh	if ((outf = fopen(Outfile, "w")) == NULL) {
1622490Sjkh		perror(Outfile);
1632490Sjkh		exit(1);
1642490Sjkh	}
1652490Sjkh	if (!STORING_PTRS)
166203926Suqs		fseek(outf, (long)sizeof(Tbl), SEEK_SET);
1672490Sjkh
1682490Sjkh	/*
1692490Sjkh	 * Write the strings onto the file
1702490Sjkh	 */
1712490Sjkh
1722490Sjkh	Tbl.str_longlen = 0;
173142022Sru	Tbl.str_shortlen = 0xffffffff;
1742490Sjkh	Tbl.str_delim = dc;
1752490Sjkh	Tbl.str_version = VERSION;
1762490Sjkh	first = Oflag;
177142022Sru	add_offset(outf, ftello(inf));
1782490Sjkh	last_off = 0;
1792490Sjkh	do {
1802490Sjkh		sp = fgets(string, 256, inf);
18115944Sache		if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) {
182142022Sru			pos = ftello(inf);
183142022Sru			length = (size_t)(pos - last_off) -
184142022Sru			    (sp != NULL ? strlen(sp) : 0);
1852490Sjkh			last_off = pos;
186142022Sru			if (length == 0)
1872490Sjkh				continue;
1882490Sjkh			add_offset(outf, pos);
189142022Sru			if ((size_t)Tbl.str_longlen < length)
1902490Sjkh				Tbl.str_longlen = length;
191142022Sru			if ((size_t)Tbl.str_shortlen > length)
1922490Sjkh				Tbl.str_shortlen = length;
1932490Sjkh			first = Oflag;
1942490Sjkh		}
1952490Sjkh		else if (first) {
19615944Sache			for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++)
1972490Sjkh				continue;
1982490Sjkh			ALLOC(Firstch, Num_pts);
1992490Sjkh			fp = &Firstch[Num_pts - 1];
20015944Sache			if (Iflag && isupper((unsigned char)*nsp))
20115944Sache				fp->first = tolower((unsigned char)*nsp);
2022490Sjkh			else
2032490Sjkh				fp->first = *nsp;
2042490Sjkh			fp->pos = Seekpts[Num_pts - 1];
205203926Suqs			first = false;
2062490Sjkh		}
2072490Sjkh	} while (sp != NULL);
2082490Sjkh
2092490Sjkh	/*
2102490Sjkh	 * write the tables in
2112490Sjkh	 */
2122490Sjkh
213203926Suqs	fclose(inf);
21433633Ssteve	Tbl.str_numstr = Num_pts - 1;
2152490Sjkh
21651864Sdcs	if (Cflag)
21751864Sdcs		Tbl.str_flags |= STR_COMMENTS;
21851864Sdcs
2192490Sjkh	if (Oflag)
2202490Sjkh		do_order();
2212490Sjkh	else if (Rflag)
2222490Sjkh		randomize();
2232490Sjkh
2242490Sjkh	if (Xflag)
2252490Sjkh		Tbl.str_flags |= STR_ROTATED;
2262490Sjkh
2272490Sjkh	if (!Sflag) {
2282490Sjkh		printf("\"%s\" created\n", Outfile);
2292490Sjkh		if (Num_pts == 2)
2302490Sjkh			puts("There was 1 string");
2312490Sjkh		else
232142022Sru			printf("There were %u strings\n", Num_pts - 1);
233142022Sru		printf("Longest string: %u byte%s\n", Tbl.str_longlen,
2342490Sjkh		       Tbl.str_longlen == 1 ? "" : "s");
235142022Sru		printf("Shortest string: %u byte%s\n", Tbl.str_shortlen,
2362490Sjkh		       Tbl.str_shortlen == 1 ? "" : "s");
2372490Sjkh	}
2382490Sjkh
23915944Sache	rewind(outf);
240142022Sru	Tbl.str_version = htobe32(Tbl.str_version);
241142022Sru	Tbl.str_numstr = htobe32(Tbl.str_numstr);
242142022Sru	Tbl.str_longlen = htobe32(Tbl.str_longlen);
243142022Sru	Tbl.str_shortlen = htobe32(Tbl.str_shortlen);
244142022Sru	Tbl.str_flags = htobe32(Tbl.str_flags);
245203926Suqs	fwrite((char *)&Tbl, sizeof(Tbl), 1, outf);
2462490Sjkh	if (STORING_PTRS) {
2472490Sjkh		for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
248142022Sru			*p = htobe64(*p);
249203926Suqs		fwrite(Seekpts, sizeof(*Seekpts), (size_t)Num_pts, outf);
2502490Sjkh	}
251203926Suqs	fclose(outf);
2522490Sjkh	exit(0);
2532490Sjkh}
2542490Sjkh
2552490Sjkh/*
2562490Sjkh *	This routine evaluates arguments from the command line
2572490Sjkh */
258201175Sedvoid
259201175Sedgetargs(int argc, char **argv)
2602490Sjkh{
261203926Suqs	int ch;
2622490Sjkh
263176407Sru	while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1)
2642490Sjkh		switch(ch) {
26551864Sdcs		case 'C':			/* embedded comments */
26651864Sdcs			Cflag++;
26751864Sdcs			break;
2682490Sjkh		case 'c':			/* new delimiting char */
2692490Sjkh			Delimch = *optarg;
2702490Sjkh			if (!isascii(Delimch)) {
2712490Sjkh				printf("bad delimiting character: '\\%o\n'",
27215944Sache				       (unsigned char)Delimch);
2732490Sjkh			}
2742490Sjkh			break;
2752490Sjkh		case 'i':			/* ignore case in ordering */
2762490Sjkh			Iflag++;
2772490Sjkh			break;
2782490Sjkh		case 'o':			/* order strings */
2792490Sjkh			Oflag++;
2802490Sjkh			break;
2812490Sjkh		case 'r':			/* randomize pointers */
2822490Sjkh			Rflag++;
2832490Sjkh			break;
2842490Sjkh		case 's':			/* silent */
2852490Sjkh			Sflag++;
2862490Sjkh			break;
2872490Sjkh		case 'x':			/* set the rotated bit */
2882490Sjkh			Xflag++;
2892490Sjkh			break;
2902490Sjkh		case '?':
2912490Sjkh		default:
2922490Sjkh			usage();
2932490Sjkh		}
2942490Sjkh	argv += optind;
2952490Sjkh
2962490Sjkh	if (*argv) {
2972490Sjkh		Infile = *argv;
2982490Sjkh		if (*++argv)
299203926Suqs			strcpy(Outfile, *argv);
3002490Sjkh	}
3012490Sjkh	if (!Infile) {
3022490Sjkh		puts("No input file name");
3032490Sjkh		usage();
3042490Sjkh	}
3052490Sjkh	if (*Outfile == '\0') {
306317438Sasomers		strlcpy(Outfile, Infile, sizeof(Outfile));
307317438Sasomers		strlcat(Outfile, ".dat", sizeof(Outfile));
3082490Sjkh	}
3092490Sjkh}
3102490Sjkh
/*
 * usage:
 *	Print the command synopsis on stderr and exit with status 1.
 */
static void
usage(void)
{
	fputs("strfile [-Ciorsx] [-c char] source_file [output_file]\n",
	    stderr);
	exit(1);
}
3182490Sjkh
3192490Sjkh/*
3202490Sjkh * add_offset:
3212490Sjkh *	Add an offset to the list, or write it out, as appropriate.
3222490Sjkh */
323201175Sedvoid
324201175Sedadd_offset(FILE *fp, off_t off)
3252490Sjkh{
326142022Sru	off_t beoff;
3272490Sjkh
3282490Sjkh	if (!STORING_PTRS) {
329142022Sru		beoff = htobe64(off);
330203926Suqs		fwrite(&beoff, 1, sizeof(beoff), fp);
3312490Sjkh	} else {
3322490Sjkh		ALLOC(Seekpts, Num_pts + 1);
3332490Sjkh		Seekpts[Num_pts] = off;
3342490Sjkh	}
3352490Sjkh	Num_pts++;
3362490Sjkh}
3372490Sjkh
3382490Sjkh/*
3392490Sjkh * do_order:
3402490Sjkh *	Order the strings alphabetically (possibly ignoring case).
3412490Sjkh */
342201175Sedvoid
343201175Seddo_order(void)
3442490Sjkh{
345142022Sru	uint32_t i;
346203926Suqs	off_t *lp;
347203926Suqs	STR *fp;
3482490Sjkh
3492490Sjkh	Sort_1 = fopen(Infile, "r");
3502490Sjkh	Sort_2 = fopen(Infile, "r");
351203926Suqs	qsort(Firstch, (size_t)Tbl.str_numstr, sizeof(*Firstch), cmp_str);
3522490Sjkh	i = Tbl.str_numstr;
3532490Sjkh	lp = Seekpts;
3542490Sjkh	fp = Firstch;
3552490Sjkh	while (i--)
3562490Sjkh		*lp++ = fp++->pos;
357203926Suqs	fclose(Sort_1);
358203926Suqs	fclose(Sort_2);
3592490Sjkh	Tbl.str_flags |= STR_ORDERED;
3602490Sjkh}
3612490Sjkh
/*
 * stable_collate_range_cmp:
 *	Compare two characters according to the current locale's
 *	collation order.  Characters that collate equal are ordered by
 *	their numeric value, so the result is zero only when c1 == c2.
 *	Returns a negative, zero or positive value.
 */
static int
stable_collate_range_cmp(int c1, int c2)
{
	char lhs[2] = { 0, '\0' };
	char rhs[2] = { 0, '\0' };
	int order;

	/* strcoll() works on strings, so wrap each character in one. */
	lhs[0] = c1;
	rhs[0] = c2;
	order = strcoll(lhs, rhs);
	if (order == 0)
		order = c1 - c2;
	return (order);
}
37419288Sache
3752490Sjkh/*
3762490Sjkh * cmp_str:
3772490Sjkh *	Compare two strings in the file
3782490Sjkh */
379201175Sedint
380201175Sedcmp_str(const void *s1, const void *s2)
3812490Sjkh{
382203926Suqs	const STR *p1, *p2;
383203926Suqs	int c1, c2, n1, n2, r;
3842490Sjkh
385203926Suqs#define	SET_N(nf,ch)	(nf = (ch == '\n'))
386203926Suqs#define	IS_END(ch,nf)	(ch == EOF || (ch == (unsigned char)Delimch && nf))
3872490Sjkh
388203926Suqs	p1 = (const STR *)s1;
389203926Suqs	p2 = (const STR *)s2;
390203926Suqs
391203926Suqs	c1 = (unsigned char)p1->first;
392203926Suqs	c2 = (unsigned char)p2->first;
393118397Sache	if ((r = stable_collate_range_cmp(c1, c2)) != 0)
394118397Sache		return (r);
3952490Sjkh
396203926Suqs	fseeko(Sort_1, p1->pos, SEEK_SET);
397203926Suqs	fseeko(Sort_2, p2->pos, SEEK_SET);
3982490Sjkh
399203926Suqs	n1 = false;
400203926Suqs	n2 = false;
40115944Sache	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF)
4022490Sjkh		SET_N(n1, c1);
40315944Sache	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF)
4042490Sjkh		SET_N(n2, c2);
4052490Sjkh
4062490Sjkh	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
4072490Sjkh		if (Iflag) {
4082490Sjkh			if (isupper(c1))
4092490Sjkh				c1 = tolower(c1);
4102490Sjkh			if (isupper(c2))
4112490Sjkh				c2 = tolower(c2);
4122490Sjkh		}
413118397Sache		if ((r = stable_collate_range_cmp(c1, c2)) != 0)
414118397Sache			return (r);
4152490Sjkh		SET_N(n1, c1);
4162490Sjkh		SET_N(n2, c2);
4172490Sjkh		c1 = getc(Sort_1);
4182490Sjkh		c2 = getc(Sort_2);
4192490Sjkh	}
4202490Sjkh	if (IS_END(c1, n1))
4212490Sjkh		c1 = 0;
4222490Sjkh	if (IS_END(c2, n2))
4232490Sjkh		c2 = 0;
424203926Suqs
425118397Sache	return (stable_collate_range_cmp(c1, c2));
4262490Sjkh}
4272490Sjkh
4282490Sjkh/*
4292490Sjkh * randomize:
4302490Sjkh *	Randomize the order of the string table.  We must be careful
4312490Sjkh *	not to randomize across delimiter boundaries.  All
4322490Sjkh *	randomization is done within each block.
4332490Sjkh */
434201175Sedvoid
435201175Sedrandomize(void)
4362490Sjkh{
437142022Sru	uint32_t cnt, i;
438203926Suqs	off_t tmp;
439203926Suqs	off_t *sp;
4402490Sjkh
4412490Sjkh	Tbl.str_flags |= STR_RANDOM;
4422490Sjkh	cnt = Tbl.str_numstr;
4432490Sjkh
4442490Sjkh	/*
4452490Sjkh	 * move things around randomly
4462490Sjkh	 */
4472490Sjkh
4482490Sjkh	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
449181386Sache		i = arc4random_uniform(cnt);
4502490Sjkh		tmp = sp[0];
4512490Sjkh		sp[0] = sp[i];
4522490Sjkh		sp[i] = tmp;
4532490Sjkh	}
4542490Sjkh}
455