spell/spellprog/spellprog.c

/*	$NetBSD: spellprog.c,v 1.4 2005/07/17 17:08:25 christos Exp $	*/

/* derived from OpenBSD: spellprog.c,v 1.4 2003/06/03 02:56:16 millert Exp */

/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spell.h	8.1 (Berkeley) 6/6/93
 */
/*
 * Copyright (C) Caldera International Inc.  2001-2002.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code and documentation must retain the above
 *    copyright notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed or owned by Caldera
 *	International, Inc.
 * 4. Neither the name of Caldera International, Inc. nor the names of other
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
 * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
 * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef lint
static const char copyright[] =
"@(#) Copyright (c) 1991, 1993\n\
	The Regents of the University of California.  All rights reserved.\n";
#endif /* not lint */

#ifndef lint
#if 0
static const char sccsid[] = "@(#)spell.c	8.1 (Berkeley) 6/6/93";
#else
#endif
static const char rcsid[] = "$OpenBSD: spellprog.c,v 1.4 2003/06/03 02:56:16 millert Exp $";
#endif /* not lint */

#include <sys/param.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "extern.h"

#define DLEV 2

static int	 dict(char *, char *);
static int	 trypref(char *, const char *, size_t);
static int	 tryword(char *, char *, size_t);
static int	 suffix(char *, size_t);
static int	 vowel(int);
static const char *lookuppref(char **, char *);
static char	*skipv(char *);
static char	*estrdup(const char *);
static void	 ise(void);
static void	 print_word(FILE *);
static void	 ztos(char *);
static int	 monosyl(char *, char *);
static void 	 usage(void) __attribute__((__noreturn__));
static void	 getderiv(size_t);

static int	 an(char *, const char *, const char *, size_t);
static int	 bility(char *, const char *, const char *, size_t);
static int	 es(char *, const char *, const char *, size_t);
static int	 i_to_y(char *, const char *, const char *, size_t);
static int	 ily(char *, const char *, const char *, size_t);
static int	 ize(char *, const char *, const char *, size_t);
static int	 metry(char *, const char *, const char *, size_t);
static int	 ncy(char *, const char *, const char *, size_t);
static int	 nop(char *, const char *, const char *, size_t);
static int	 s(char *, const char *, const char *, size_t);
static int	 strip(char *, const char *, const char *, size_t);
static int	 tion(char *, const char *, const char *, size_t);
static int	 y_to_e(char *, const char *, const char *, size_t);
static int	 CCe(char *, const char *, const char *, size_t);
static int	 VCe(char *, const char *, const char *, size_t);

/*
 * This cannot be const because we modify it when we choose british
 * spelling.
 */
static struct suftab {
	const char *suf;
	int (*p1)(char *, const char *, const char *, size_t);
	int n1;
	const char *d1;
	const char *a1;
	int (*p2)(char *, const char *, const char *, size_t);
	int n2;
	const char *d2;
	const char *a2;
} suftab[] = {
	{"ssen",     ily,    4, "-y+iness", "+ness" },
	{"ssel",     ily,    4, "-y+i+less", "+less" },
	{"se",       s,      1, "", "+s", es, 2, "-y+ies", "+es" },
	{"s'",       s,      2, "", "+'s"},
	{"s",        s,      1, "", "+s"},
	{"ecn",      ncy,    1, "", "-t+ce"},
	{"ycn",      ncy,    1, "", "-cy+t"},
	{"ytilb",    nop,    0, "", ""},
	{"ytilib",   bility, 5, "-le+ility", ""},
	{"elbaif",   i_to_y, 4, "-y+iable", ""},
	{"elba",     CCe,    4, "-e+able", "+able"},
	{"yti",      CCe,    3, "-e+ity", "+ity"},
	{"ylb",      y_to_e, 1, "-e+y", ""},
	{"yl",       ily,    2, "-y+ily", "+ly"},
	{"laci",     strip,  2, "", "+al"},
	{"latnem",   strip,  2, "", "+al"},
	{"lanoi",    strip,  2, "", "+al"},
	{"tnem",     strip,  4, "", "+ment"},
	{"gni",      CCe,    3, "-e+ing", "+ing"},
	{"reta",     nop,    0, "", ""},
	{"re",       strip,  1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
	{"de",       strip,  1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
	{"citsi",    strip,  2, "", "+ic"},
	{"cihparg",  i_to_y, 1, "-y+ic", ""},
	{"tse",      strip,  2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
	{"cirtem",   i_to_y, 1, "-y+ic", ""},
	{"yrtem",    metry,  0, "-ry+er", ""},
	{"cigol",    i_to_y, 1, "-y+ic", ""},
	{"tsigol",   i_to_y, 2, "-y+ist", ""},
	{"tsi",      VCe,    3, "-e+ist", "+ist"},
	{"msi",      VCe,    3, "-e+ism", "+ist"},
	{"noitacif", i_to_y, 6, "-y+ication", ""},
	{"noitazi",  ize,    5, "-e+ation", ""},
	{"rota",     tion,   2, "-e+or", ""},
	{"noit",     tion,   3, "-e+ion", "+ion"},
	{"naino",    an,     3, "", "+ian"},
	{"na",       an,     1, "", "+n"},
	{"evit",     tion,   3, "-e+ive", "+ive"},
	{"ezi",      CCe,    3, "-e+ize", "+ize"},
	{"pihs",     strip,  4, "", "+ship"},
	{"dooh",     ily,    4, "-y+hood", "+hood"},
	{"ekil",     strip,  4, "", "+like"},
	{ NULL, }
};

static const char *preftab[] = {
	"anti",
	"bio",
	"dis",
	"electro",
	"en",
	"fore",
	"hyper",
	"intra",
	"inter",
	"iso",
	"kilo",
	"magneto",
	"meta",
	"micro",
	"milli",
	"mis",
	"mono",
	"multi",
	"non",
	"out",
	"over",
	"photo",
	"poly",
	"pre",
	"pseudo",
	"re",
	"semi",
	"stereo",
	"sub",
	"super",
	"thermo",
	"ultra",
	"under",	/* must precede un */
	"un",
	NULL
};

static struct wlist {
	int fd;
	unsigned char *front;
	unsigned char *back;
} *wlists;

static int vflag;
static int xflag;
static char word[LINE_MAX];
static char original[LINE_MAX];
static char affix[LINE_MAX];
static struct {
	const char **buf;
	size_t maxlev;
} deriv;

/*
 * The spellprog utility accepts a newline-delimited list of words
 * on stdin.  For arguments it expects the path to a word list and
 * the path to a file in which to store found words.
 *
 * In normal usage, spell is called twice.  The first time it is
 * called with a stop list to flag commonly mispelled words.  The
 * remaining words are then passed to spell again, this time with
 * the dictionary file as the first (non-flag) argument.
 *
 * Unlike historic versions of spellprog, this one does not use
 * hashed files.  Instead it simply requires that files be sorted
 * lexigraphically and uses the same algorithm as the look utility.
 *
 * Note that spellprog should be called via the spell shell script
 * and is not meant to be invoked directly by the user.
 */

int
main(int argc, char **argv)
{
	char *ep, *cp, *dp;
	char *outfile;
	int ch, fold, i;
	struct stat sb;
	FILE *file, *found;

	setlocale(LC_ALL, "");

	outfile = NULL;
	while ((ch = getopt(argc, argv, "bvxo:")) != -1) {
		switch (ch) {
		case 'b':
			/* Use British dictionary and convert ize -> ise. */
			ise();
			break;
		case 'o':
			outfile = optarg;
			break;
		case 'v':
			/* Also write derivations to "found" file. */
			vflag++;
			break;
		case 'x':
			/* Print plausible stems to stdout. */
			xflag++;
			break;
		default:
			usage();
		}

	}
	argc -= optind;
	argv += optind;
	if (argc < 1)
		usage();

	/* Open and mmap the word/stop lists. */
	if ((wlists = malloc(sizeof(struct wlist) * (argc + 1))) == NULL)
		err(1, "malloc");

	for (i = 0; argc--; i++) {
		wlists[i].fd = open(argv[i], O_RDONLY, 0);
		if (wlists[i].fd == -1 || fstat(wlists[i].fd, &sb) != 0)
			err(1, "%s", argv[i]);
		if (sb.st_size > SIZE_T_MAX)
			errx(1, "%s: %s", argv[i], strerror(EFBIG));
		wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ,
		    MAP_PRIVATE, wlists[i].fd, (off_t)0);
		if (wlists[i].front == MAP_FAILED)
			err(1, "%s", argv[i]);
		wlists[i].back = wlists[i].front + (size_t)sb.st_size;
	}
	wlists[i].fd = -1;

	/* Open file where found words are to be saved. */
	if (outfile == NULL)
		found = NULL;
	else if ((found = fopen(outfile, "w")) == NULL)
		err(1, "cannot open %s", outfile);

	for (;; print_word(file)) {
		affix[0] = '\0';
		file = found;
		for (ep = word; (*ep = ch = getchar()) != '\n'; ep++) {
			if (ep - word == sizeof(word) - 1) {
				*ep = '\0';
				warnx("word too long (%s)", word);
				while ((ch = getchar()) != '\n')
					;	/* slurp until EOL */
			}
			if (ch == EOF) {
				if (found != NULL)
					fclose(found);
				exit(0);
			}
		}
		for (cp = word, dp = original; cp < ep; )
			*dp++ = *cp++;
		*dp = '\0';
		fold = 0;
		for (cp = word; cp < ep; cp++)
			if (islower((unsigned char)*cp))
				goto lcase;
		if (trypref(ep, ".", 0))
			continue;
		++fold;
		for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++)
			*dp = tolower((unsigned char)*cp);
lcase:
		if (trypref(ep, ".", 0) || suffix(ep, 0))
			continue;
		if (isupper((unsigned char)word[0])) {
			for (cp = original, dp = word; (*dp = *cp++); dp++) {
				if (fold)
					*dp = tolower((unsigned char)*dp);
			}
			word[0] = tolower((unsigned char)word[0]);
			goto lcase;
		}
		file = stdout;
	}
}

static void
print_word(FILE *f)
{

	if (f != NULL) {
		if (vflag && affix[0] != '\0' && affix[0] != '.')
			fprintf(f, "%s\t%s\n", affix, original);
		else
			fprintf(f, "%s\n", original);
	}
}

/*
 * For each matching suffix in suftab, call the function associated
 * with that suffix (p1 and p2).
 */
static int
suffix(char *ep, size_t lev)
{
	const struct suftab *t;
	char *cp;
	const char *sp;

	lev += DLEV;
	getderiv(lev + 1);
	deriv.buf[lev] = deriv.buf[lev - 1] = 0;
	for (t = suftab; (sp = t->suf) != NULL; t++) {
		cp = ep;
		while (*sp) {
			if (*--cp != *sp++)
				goto next;
		}
		for (sp = cp; --sp >= word && !vowel(*sp);)
			;	/* nothing */
		if (sp < word)
			return 0;
		if ((*t->p1)(ep - t->n1, t->d1, t->a1, lev + 1))
			return 1;
		if (t->p2 != NULL) {
			deriv.buf[lev] = deriv.buf[lev + 1] = '\0';
			return (*t->p2)(ep - t->n2, t->d2, t->a2, lev);
		}
		return 0;
next:		;
	}
	return 0;
}

static int
/*ARGSUSED*/
nop(char *ep, const char *d, const char *a, size_t lev)
{

	return 0;
}

static int
/*ARGSUSED*/
strip(char *ep, const char *d, const char *a, size_t lev)
{

	return trypref(ep, a, lev) || suffix(ep, lev);
}

static int
s(char *ep, const char *d, const char *a, const size_t lev)
{

	if (lev > DLEV + 1)
		return 0;
	if (*ep == 's' && ep[-1] == 's')
		return 0;
	return strip(ep, d, a, lev);
}

static int
/*ARGSUSED*/
an(char *ep, const char *d, const char *a, size_t lev)
{

	if (!isupper((unsigned char)*word))	/* must be proper name */
		return 0;
	return trypref(ep, a, lev);
}

static int
/*ARGSUSED*/
ize(char *ep, const char *d, const char *a, size_t lev)
{

	*ep++ = 'e';
	return strip(ep ,"", d, lev);
}

static int
/*ARGSUSED*/
y_to_e(char *ep, const char *d, const char *a, size_t lev)
{
	char c = *ep;

	*ep++ = 'e';
	if (strip(ep, "", d, lev))
		return 1;
	ep[-1] = c;
	return 0;
}

static int
ily(char *ep, const char *d, const char *a, size_t lev)
{

	if (ep[-1] == 'i')
		return i_to_y(ep, d, a, lev);
	else
		return strip(ep, d, a, lev);
}

static int
ncy(char *ep, const char *d, const char *a, size_t lev)
{

	if (skipv(skipv(ep - 1)) < word)
		return 0;
	ep[-1] = 't';
	return strip(ep, d, a, lev);
}

static int
bility(char *ep, const char *d, const char *a, size_t lev)
{

	*ep++ = 'l';
	return y_to_e(ep, d, a, lev);
}

static int
i_to_y(char *ep, const char *d, const char *a, size_t lev)
{

	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		a = d;
	}
	return strip(ep, "", a, lev);
}

static int
es(char *ep, const char *d, const char *a, size_t lev)
{

	if (lev > DLEV)
		return 0;

	switch (ep[-1]) {
	default:
		return 0;
	case 'i':
		return i_to_y(ep, d, a, lev);
	case 's':
	case 'h':
	case 'z':
	case 'x':
		return strip(ep, d, a, lev);
	}
}

static int
metry(char *ep, const char *d, const char *a, size_t lev)
{

	ep[-2] = 'e';
	ep[-1] = 'r';
	return strip(ep, d, a, lev);
}

static int
tion(char *ep, const char *d, const char *a, size_t lev)
{

	switch (ep[-2]) {
	case 'c':
	case 'r':
		return trypref(ep, a, lev);
	case 'a':
		return y_to_e(ep, d, a, lev);
	}
	return 0;
}

/*
 * Possible consonant-consonant-e ending.
 */
static int
CCe(char *ep, const char *d, const char *a, size_t lev)
{

	switch (ep[-1]) {
	case 'l':
		if (vowel(ep[-2]))
			break;
		switch (ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return y_to_e(ep, d, a, lev);
		}
		break;
	case 's':
		if (ep[-2] == 's')
			break;
		/*FALLTHROUGH*/
	case 'c':
	case 'g':
		if (*ep == 'a')
			return 0;
		/*FALLTHROUGH*/
	case 'v':
	case 'z':
		if (vowel(ep[-2]))
			break;
		/*FALLTHROUGH*/
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return 1;
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return 0;
	}
	return VCe(ep, d, a, lev);
}

/*
 * Possible consonant-vowel-consonant-e ending.
 */
static int
VCe(char *ep, const char *d, const char *a, size_t lev)
{
	char c;

	c = ep[-1];
	if (c == 'e')
		return 0;
	if (!vowel(c) && vowel(ep[-2])) {
		c = *ep;
		*ep++ = 'e';
		if (trypref(ep, d, lev) || suffix(ep, lev))
			return 1;
		ep--;
		*ep = c;
	}
	return strip(ep, d, a, lev);
}

static const char *
lookuppref(char **wp, char *ep)
{
	const char **sp, *cp;
	char *bp;

	for (sp = preftab; *sp; sp++) {
		bp = *wp;
		for (cp = *sp; *cp; cp++, bp++) {
			if (tolower((unsigned char)*bp) != *cp)
				goto next;
		}
		for (cp = bp; cp < ep; cp++) {
			if (vowel(*cp)) {
				*wp = bp;
				return *sp;
			}
		}
next:		;
	}
	return 0;
}

/*
 * If the word is not in the dictionary, try stripping off prefixes
 * until the word is found or we run out of prefixes to check.
 */
static int
trypref(char *ep, const char *a, size_t lev)
{
	const char *cp;
	char *bp;
	char *pp;
	int val = 0;
	char space[20];

	getderiv(lev + 2);
	deriv.buf[lev] = a;
	if (tryword(word, ep, lev))
		return 1;
	bp = word;
	pp = space;
	deriv.buf[lev + 1] = pp;
	while ((cp = lookuppref(&bp, ep)) != NULL) {
		*pp++ = '+';
		while ((*pp = *cp++))
			pp++;
		if (tryword(bp, ep, lev + 1)) {
			val = 1;
			break;
		}
		if (pp - space >= sizeof(space))
			return 0;
	}
	deriv.buf[lev + 1] = deriv.buf[lev + 2] = '\0';
	return val;
}

static int
tryword(char *bp, char *ep, size_t lev)
{
	size_t i, j;
	char duple[3];

	if (ep-bp <= 1)
		return 0;
	if (vowel(*ep) && monosyl(bp, ep))
		return 0;

	i = dict(bp, ep);
	if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] &&
	    monosyl(bp, ep - 1)) {
		ep--;
		getderiv(++lev);
		deriv.buf[lev] = duple;
		duple[0] = '+';
		duple[1] = *ep;
		duple[2] = '\0';
		i = dict(bp, ep);
	}
	if (vflag == 0 || i == 0)
		return i;

	/* Also tack on possible derivations. (XXX - warn on truncation?) */
	for (j = lev; j > 0; j--) {
		if (deriv.buf[j])
			(void)strlcat(affix, deriv.buf[j], sizeof(affix));
	}
	return i;
}

static int
monosyl(char *bp, char *ep)
{

	if (ep < bp + 2)
		return 0;
	if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w')
		return 0;
	while (--ep >= bp)
		if (vowel(*ep))
			return 0;
	return 1;
}

static char *
skipv(char *st)
{

	if (st >= word && vowel(*st))
		st--;
	while (st >= word && !vowel(*st))
		st--;
	return st;
}

static int
vowel(int c)
{

	switch (tolower(c)) {
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
	case 'y':
		return 1;
	}
	return 0;
}

/*
 * Crummy way to Britishise.
 */
static void
ise(void)
{
	struct suftab *tab;
	char *cp;

	for (tab = suftab; tab->suf; tab++) {
		/* Assume that suffix will contain 'z' if a1 or d1 do */
		if (strchr(tab->suf, 'z')) {
			tab->suf = cp = estrdup(tab->suf);
			ztos(cp);
			if (strchr(tab->d1, 'z')) {
				tab->d1 = cp = estrdup(tab->d1);
				ztos(cp);
			}
			if (strchr(tab->a1, 'z')) {
				tab->a1 = cp = estrdup(tab->a1);
				ztos(cp);
			}
		}
	}
}

static void
ztos(char *st)
{

	for (; *st; st++)
		if (*st == 'z')
			*st = 's';
}

static char *
estrdup(const char *st)
{
	char *d;

	if ((d = strdup(st)) == NULL)
		err(1, "strdup");
	return d;
}

/*
 * Look up a word in the dictionary.
 * Returns 1 if found, 0 if not.
 */
static int
dict(char *bp, char *ep)
{
	char c;
	int i, rval;

	c = *ep;
	*ep = '\0';
	if (xflag)
		printf("=%s\n", bp);
	for (i = rval = 0; wlists[i].fd != -1; i++) {
		if ((rval = look((unsigned char *)bp, wlists[i].front,
		    wlists[i].back)) == 1)
			break;
	}
	*ep = c;
	return rval;
}

static void
getderiv(size_t lev)
{
	if (deriv.maxlev < lev) {
		void *p = realloc(deriv.buf, sizeof(*deriv.buf) * lev);
		if (p == NULL)
			err(1, "Cannot grow array");
		deriv.buf = p;
		deriv.maxlev = lev;
	}
}


static void
usage(void)
{
	(void)fprintf(stderr,
	    "Usage: %s [-bvx] [-o found-words] word-list ...\n",
	    getprogname());
	exit(1);
}