1/*	$OpenBSD: spellprog.c,v 1.16 2022/12/26 19:16:03 jmc Exp $	*/
2
3/*
4 * Copyright (c) 1991, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	@(#)spell.h	8.1 (Berkeley) 6/6/93
32 */
33/*
34 * Copyright (C) Caldera International Inc.  2001-2002.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code and documentation must retain the above
41 *    copyright notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *	This product includes software developed or owned by Caldera
48 *	International, Inc.
49 * 4. Neither the name of Caldera International, Inc. nor the names of other
50 *    contributors may be used to endorse or promote products derived from
51 *    this software without specific prior written permission.
52 *
53 * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
54 * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
55 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
56 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
57 * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
58 * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
59 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
60 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
62 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
63 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
64 * POSSIBILITY OF SUCH DAMAGE.
65 */
66
67#include <sys/mman.h>
68#include <sys/stat.h>
69
70#include <ctype.h>
71#include <err.h>
72#include <errno.h>
73#include <fcntl.h>
74#include <limits.h>
75#include <stdint.h>
76#include <stdio.h>
77#include <stdlib.h>
78#include <string.h>
79#include <unistd.h>
80
81#define DLEV 2
82
83int	 an(char *, char *, char *, int);
84int	 bility(char *, char *, char *, int);
85int	 es(char *, char *, char *, int);
86int	 dict(char *, char *);
87int	 i_to_y(char *, char *, char *, int);
88int	 ily(char *, char *, char *, int);
89int	 ize(char *, char *, char *, int);
90int	 metry(char *, char *, char *, int);
91int	 monosyl(char *, char *);
92int	 ncy(char *, char *, char *, int);
93int	 nop(char *, char *, char *, int);
94int	 trypref(char *, char *, int);
95int	 tryword(char *, char *, int);
96int	 s(char *, char *, char *, int);
97int	 strip(char *, char *, char *, int);
98int	 suffix(char *, int);
99int	 tion(char *, char *, char *, int);
100int	 vowel(unsigned char);
101int	 y_to_e(char *, char *, char *, int);
102int	 CCe(char *, char *, char *, int);
103int	 VCe(char *, char *, char *, int);
104char	*lookuppref(char **, char *);
105char	*skipv(char *);
106char	*estrdup(const char *);
107void	 ise(void);
108void	 print_word(FILE *);
109void	 ztos(char *);
110static void __dead usage(void);
111
112/* from look.c */
113int	 look(unsigned char *, unsigned char *, unsigned char *);
114
115struct suftab {
116	char *suf;
117	int (*p1)(char *, char *, char *, int);
118	int n1;
119	char *d1;
120	char *a1;
121	int (*p2)(char *, char *, char *, int);
122	int n2;
123	char *d2;
124	char *a2;
125} suftab[] = {
126	{"ssen", ily, 4, "-y+iness", "+ness" },
127	{"ssel", ily, 4, "-y+i+less", "+less" },
128	{"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
129	{"s'", s, 2, "", "+'s"},
130	{"s", s, 1, "", "+s"},
131	{"ecn", ncy, 1, "", "-t+ce"},
132	{"ycn", ncy, 1, "", "-cy+t"},
133	{"ytilb", nop, 0, "", ""},
134	{"ytilib", bility, 5, "-le+ility", ""},
135	{"elbaif", i_to_y, 4, "-y+iable", ""},
136	{"elba", CCe, 4, "-e+able", "+able"},
137	{"yti", CCe, 3, "-e+ity", "+ity"},
138	{"ylb", y_to_e, 1, "-e+y", ""},
139	{"yl", ily, 2, "-y+ily", "+ly"},
140	{"laci", strip, 2, "", "+al"},
141	{"latnem", strip, 2, "", "+al"},
142	{"lanoi", strip, 2, "", "+al"},
143	{"tnem", strip, 4, "", "+ment"},
144	{"gni", CCe, 3, "-e+ing", "+ing"},
145	{"reta", nop, 0, "", ""},
146	{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
147	{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
148	{"citsi", strip, 2, "", "+ic"},
149	{"cihparg", i_to_y, 1, "-y+ic", ""},
150	{"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
151	{"cirtem", i_to_y, 1, "-y+ic", ""},
152	{"yrtem", metry, 0, "-ry+er", ""},
153	{"cigol", i_to_y, 1, "-y+ic", ""},
154	{"tsigol", i_to_y, 2, "-y+ist", ""},
155	{"tsi", VCe, 3, "-e+ist", "+ist"},
156	{"msi", VCe, 3, "-e+ism", "+ist"},
157	{"noitacif", i_to_y, 6, "-y+ication", ""},
158	{"noitazi", ize, 5, "-e+ation", ""},
159	{"rota", tion, 2, "-e+or", ""},
160	{"noit", tion, 3, "-e+ion", "+ion"},
161	{"naino", an, 3, "", "+ian"},
162	{"na", an, 1, "", "+n"},
163	{"evit", tion, 3, "-e+ive", "+ive"},
164	{"ezi", CCe, 3, "-e+ize", "+ize"},
165	{"pihs", strip, 4, "", "+ship"},
166	{"dooh", ily, 4, "-y+hood", "+hood"},
167	{"ekil", strip, 4, "", "+like"},
168	{ NULL }
169};
170
171char *preftab[] = {
172	"anti",
173	"bio",
174	"dis",
175	"electro",
176	"en",
177	"fore",
178	"hyper",
179	"intra",
180	"inter",
181	"iso",
182	"kilo",
183	"magneto",
184	"meta",
185	"micro",
186	"milli",
187	"mis",
188	"mono",
189	"multi",
190	"non",
191	"out",
192	"over",
193	"photo",
194	"poly",
195	"pre",
196	"pseudo",
197	"re",
198	"semi",
199	"stereo",
200	"sub",
201	"super",
202	"thermo",
203	"ultra",
204	"under",	/* must precede un */
205	"un",
206	NULL
207};
208
209struct wlist {
210	int fd;
211	unsigned char *front;
212	unsigned char *back;
213} *wlists;
214
215int vflag;
216int xflag;
217char word[LINE_MAX];
218char original[LINE_MAX];
219char *deriv[40];
220char affix[40];
221
222/*
223 * The spellprog utility accepts a newline-delimited list of words
224 * on stdin.  For arguments it expects the path to a word list and
225 * the path to a file in which to store found words.
226 *
227 * In normal usage, spell is called twice.  The first time it is
228 * called with a stop list to flag commonly misspelled words.  The
229 * remaining words are then passed to spell again, this time with
230 * the dictionary file as the first (non-flag) argument.
231 *
232 * Unlike historic versions of spellprog, this one does not use
233 * hashed files.  Instead it simply requires that files be sorted
234 * lexigraphically and uses the same algorithm as the look utility.
235 *
236 * Note that spellprog should be called via the spell shell script
237 * and is not meant to be invoked directly by the user.
238 */
239
240int
241main(int argc, char **argv)
242{
243	char *ep, *cp, *dp;
244	char *outfile;
245	int ch, fold, i;
246	struct stat sb;
247	FILE *file, *found;
248
249	if (pledge("stdio rpath wpath cpath", NULL) == -1)
250		err(1, "pledge");
251
252	outfile = NULL;
253	while ((ch = getopt(argc, argv, "bvxo:")) != -1) {
254		switch (ch) {
255		case 'b':
256			/* Use British dictionary and convert ize -> ise. */
257			ise();
258			break;
259		case 'o':
260			outfile = optarg;
261			break;
262		case 'v':
263			/* Also write derivations to "found" file. */
264			vflag = 1;
265			break;
266		case 'x':
267			/* Print plausible stems to stdout. */
268			xflag = 1;
269			break;
270		default:
271			usage();
272		}
273
274	}
275	argc -= optind;
276	argv += optind;
277	if (argc < 1)
278		usage();
279
280	/* Open and mmap the word/stop lists. */
281	if ((wlists = calloc(sizeof(struct wlist), (argc + 1))) == NULL)
282		err(1, "malloc");
283	for (i = 0; argc--; i++) {
284		wlists[i].fd = open(argv[i], O_RDONLY);
285		if (wlists[i].fd == -1 || fstat(wlists[i].fd, &sb) != 0)
286			err(1, "%s", argv[i]);
287		if (sb.st_size > SIZE_MAX)
288			errc(1, EFBIG, "%s", argv[i]);
289		wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ,
290		    MAP_PRIVATE, wlists[i].fd, (off_t)0);
291		if (wlists[i].front == MAP_FAILED)
292			err(1, "%s", argv[i]);
293		wlists[i].back = wlists[i].front + sb.st_size;
294	}
295	wlists[i].fd = -1;
296
297	/* Open file where found words are to be saved. */
298	if (outfile == NULL)
299		found = NULL;
300	else if ((found = fopen(outfile, "w")) == NULL)
301		err(1, "cannot open %s", outfile);
302
303	for (;; print_word(file)) {
304		affix[0] = '\0';
305		file = found;
306		for (ep = word; (*ep = ch = getchar()) != '\n'; ep++) {
307			if (ep - word == sizeof(word) - 1) {
308				*ep = '\0';
309				warnx("word too long (%s)", word);
310				while ((ch = getchar()) != '\n')
311					;	/* slurp until EOL */
312			}
313			if (ch == EOF) {
314				if (found != NULL)
315					fclose(found);
316				return (0);
317			}
318		}
319		for (cp = word, dp = original; cp < ep; )
320			*dp++ = *cp++;
321		*dp = '\0';
322		fold = 0;
323		for (cp = word; cp < ep; cp++)
324			if (islower((unsigned char)*cp))
325				goto lcase;
326		if (trypref(ep, ".", 0))
327			continue;
328		++fold;
329		for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++)
330			*dp = tolower((unsigned char)*cp);
331lcase:
332		if (trypref(ep, ".", 0) || suffix(ep, 0))
333			continue;
334		if (isupper((unsigned char)word[0])) {
335			for (cp = original, dp = word; (*dp = *cp++); dp++) {
336				if (fold)
337					*dp = tolower((unsigned char)*dp);
338			}
339			word[0] = tolower((unsigned char)word[0]);
340			goto lcase;
341		}
342		file = stdout;
343	}
344
345	return (0);
346}
347
348void
349print_word(FILE *f)
350{
351
352	if (f != NULL) {
353		if (vflag && affix[0] != '\0' && affix[0] != '.')
354			fprintf(f, "%s\t%s\n", affix, original);
355		else
356			fprintf(f, "%s\n", original);
357	}
358}
359
360/*
361 * For each matching suffix in suftab, call the function associated
362 * with that suffix (p1 and p2).
363 */
364int
365suffix(char *ep, int lev)
366{
367	struct suftab *t;
368	char *cp, *sp;
369
370	lev += DLEV;
371	deriv[lev] = deriv[lev-1] = 0;
372	for (t = suftab; (sp = t->suf); t++) {
373		cp = ep;
374		while (*sp) {
375			if (*--cp != *sp++)
376				goto next;
377		}
378		for (sp = cp; --sp >= word && !vowel(*sp);)
379			;	/* nothing */
380		if (sp < word)
381			return (0);
382		if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
383			return (1);
384		if (t->p2 != NULL) {
385			deriv[lev] = deriv[lev+1] = 0;
386			return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
387		}
388		return (0);
389next:		;
390	}
391	return (0);
392}
393
394int
395nop(char *ep, char *d, char *a, int lev)
396{
397
398	return (0);
399}
400
401int
402strip(char *ep, char *d, char *a, int lev)
403{
404
405	return (trypref(ep, a, lev) || suffix(ep, lev));
406}
407
408int
409s(char *ep, char *d, char *a, int lev)
410{
411
412	if (lev > DLEV + 1)
413		return (0);
414	if (*ep == 's' && ep[-1] == 's')
415		return (0);
416	return (strip(ep, d, a, lev));
417}
418
419int
420an(char *ep, char *d, char *a, int lev)
421{
422
423	if (!isupper((unsigned char)*word))	/* must be proper name */
424		return (0);
425	return (trypref(ep,a,lev));
426}
427
428int
429ize(char *ep, char *d, char *a, int lev)
430{
431
432	*ep++ = 'e';
433	return (strip(ep ,"", d, lev));
434}
435
436int
437y_to_e(char *ep, char *d, char *a, int lev)
438{
439	char c = *ep;
440
441	*ep++ = 'e';
442	if (strip(ep, "", d, lev))
443		return (1);
444	ep[-1] = c;
445	return (0);
446}
447
448int
449ily(char *ep, char *d, char *a, int lev)
450{
451
452	if (ep[-1] == 'i')
453		return (i_to_y(ep, d, a, lev));
454	else
455		return (strip(ep, d, a, lev));
456}
457
458int
459ncy(char *ep, char *d, char *a, int lev)
460{
461
462	if (skipv(skipv(ep-1)) < word)
463		return (0);
464	ep[-1] = 't';
465	return (strip(ep, d, a, lev));
466}
467
468int
469bility(char *ep, char *d, char *a, int lev)
470{
471
472	*ep++ = 'l';
473	return (y_to_e(ep, d, a, lev));
474}
475
476int
477i_to_y(char *ep, char *d, char *a, int lev)
478{
479
480	if (ep[-1] == 'i') {
481		ep[-1] = 'y';
482		a = d;
483	}
484	return (strip(ep, "", a, lev));
485}
486
487int
488es(char *ep, char *d, char *a, int lev)
489{
490
491	if (lev > DLEV)
492		return (0);
493
494	switch (ep[-1]) {
495	default:
496		return (0);
497	case 'i':
498		return (i_to_y(ep, d, a, lev));
499	case 's':
500	case 'h':
501	case 'z':
502	case 'x':
503		return (strip(ep, d, a, lev));
504	}
505}
506
507int
508metry(char *ep, char *d, char *a, int lev)
509{
510
511	ep[-2] = 'e';
512	ep[-1] = 'r';
513	return (strip(ep, d, a, lev));
514}
515
516int
517tion(char *ep, char *d, char *a, int lev)
518{
519
520	switch (ep[-2]) {
521	case 'c':
522	case 'r':
523		return (trypref(ep, a, lev));
524	case 'a':
525		return (y_to_e(ep, d, a, lev));
526	}
527	return (0);
528}
529
530/*
531 * Possible consonant-consonant-e ending.
532 */
533int
534CCe(char *ep, char *d, char *a, int lev)
535{
536
537	switch (ep[-1]) {
538	case 'l':
539		if (vowel(ep[-2]))
540			break;
541		switch (ep[-2]) {
542		case 'l':
543		case 'r':
544		case 'w':
545			break;
546		default:
547			return (y_to_e(ep, d, a, lev));
548		}
549		break;
550	case 's':
551		if (ep[-2] == 's')
552			break;
553	case 'c':
554	case 'g':
555		if (*ep == 'a')
556			return (0);
557	case 'v':
558	case 'z':
559		if (vowel(ep[-2]))
560			break;
561	case 'u':
562		if (y_to_e(ep, d, a, lev))
563			return (1);
564		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
565			return (0);
566	}
567	return (VCe(ep, d, a, lev));
568}
569
570/*
571 * Possible consonant-vowel-consonant-e ending.
572 */
573int
574VCe(char *ep, char *d, char *a, int lev)
575{
576	char c;
577
578	c = ep[-1];
579	if (c == 'e')
580		return (0);
581	if (!vowel(c) && vowel(ep[-2])) {
582		c = *ep;
583		*ep++ = 'e';
584		if (trypref(ep, d, lev) || suffix(ep, lev))
585			return (1);
586		ep--;
587		*ep = c;
588	}
589	return (strip(ep, d, a, lev));
590}
591
592char *
593lookuppref(char **wp, char *ep)
594{
595	char **sp;
596	char *bp,*cp;
597
598	for (sp = preftab; *sp; sp++) {
599		bp = *wp;
600		for (cp = *sp; *cp; cp++, bp++) {
601			if (tolower((unsigned char)*bp) != *cp)
602				goto next;
603		}
604		for (cp = bp; cp < ep; cp++) {
605			if (vowel(*cp)) {
606				*wp = bp;
607				return (*sp);
608			}
609		}
610next:		;
611	}
612	return (0);
613}
614
615/*
616 * If the word is not in the dictionary, try stripping off prefixes
617 * until the word is found or we run out of prefixes to check.
618 */
619int
620trypref(char *ep, char *a, int lev)
621{
622	char *cp;
623	char *bp;
624	char *pp;
625	int val = 0;
626	char space[20];
627
628	deriv[lev] = a;
629	if (tryword(word, ep, lev))
630		return (1);
631	bp = word;
632	pp = space;
633	deriv[lev+1] = pp;
634	while ((cp = lookuppref(&bp, ep))) {
635		*pp++ = '+';
636		while ((*pp = *cp++))
637			pp++;
638		if (tryword(bp, ep, lev+1)) {
639			val = 1;
640			break;
641		}
642		if (pp - space >= sizeof(space))
643			return (0);
644	}
645	deriv[lev+1] = deriv[lev+2] = 0;
646	return (val);
647}
648
649int
650tryword(char *bp, char *ep, int lev)
651{
652	int i, j;
653	char duple[3];
654
655	if (ep-bp <= 1)
656		return (0);
657	if (vowel(*ep) && monosyl(bp, ep))
658		return (0);
659
660	i = dict(bp, ep);
661	if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
662		ep--;
663		deriv[++lev] = duple;
664		duple[0] = '+';
665		duple[1] = *ep;
666		duple[2] = '\0';
667		i = dict(bp, ep);
668	}
669	if (vflag == 0 || i == 0)
670		return (i);
671
672	/* Also tack on possible derivations. (XXX - warn on truncation?) */
673	for (j = lev; j > 0; j--) {
674		if (deriv[j])
675			strlcat(affix, deriv[j], sizeof(affix));
676	}
677	return (i);
678}
679
680int
681monosyl(char *bp, char *ep)
682{
683
684	if (ep < bp + 2)
685		return (0);
686	if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w')
687		return (0);
688	while (--ep >= bp)
689		if (vowel(*ep))
690			return (0);
691	return (1);
692}
693
694char *
695skipv(char *s)
696{
697
698	if (s >= word && vowel(*s))
699		s--;
700	while (s >= word && !vowel(*s))
701		s--;
702	return (s);
703}
704
705int
706vowel(unsigned char c)
707{
708
709	switch (tolower(c)) {
710	case 'a':
711	case 'e':
712	case 'i':
713	case 'o':
714	case 'u':
715	case 'y':
716		return (1);
717	}
718	return (0);
719}
720
721/*
722 * Crummy way to Britishise.
723 */
724void
725ise(void)
726{
727	struct suftab *tab;
728
729	for (tab = suftab; tab->suf; tab++) {
730		/* Assume that suffix will contain 'z' if a1 or d1 do */
731		if (strchr(tab->suf, 'z')) {
732			tab->suf = estrdup(tab->suf);
733			ztos(tab->suf);
734			if (strchr(tab->d1, 'z')) {
735				tab->d1 = estrdup(tab->d1);
736				ztos(tab->d1);
737			}
738			if (strchr(tab->a1, 'z')) {
739				tab->a1 = estrdup(tab->a1);
740				ztos(tab->a1);
741			}
742		}
743	}
744}
745
746void
747ztos(char *s)
748{
749
750	for (; *s; s++)
751		if (*s == 'z')
752			*s = 's';
753}
754
755char *
756estrdup(const char *s)
757{
758	char *d;
759
760	if ((d = strdup(s)) == NULL)
761		err(1, "strdup");
762	return (d);
763}
764
765/*
766 * Look up a word in the dictionary.
767 * Returns 1 if found, 0 if not.
768 */
769int
770dict(char *bp, char *ep)
771{
772	char c;
773	int i, rval;
774
775	c = *ep;
776	*ep = '\0';
777	if (xflag)
778		printf("=%s\n", bp);
779	for (i = rval = 0; wlists[i].fd != -1; i++) {
780		if ((rval = look((unsigned char *)bp, wlists[i].front,
781		    wlists[i].back)) == 1)
782			break;
783	}
784	*ep = c;
785	return (rval);
786}
787
788static void __dead
789usage(void)
790{
791	extern char *__progname;
792
793	fprintf(stderr, "usage: %s [-bvx] [-o found-words] word-list ...\n",
794	    __progname);
795	exit(1);
796}
797