/*
 * Copyright (c) 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Ken Arnold.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

/*
 * Identification strings embedded in the object file.  Both are
 * compiled out when the source is run through lint.
 */
#ifndef lint
char copyright[] =
"@(#) Copyright (c) 1989 The Regents of the University of California.\n\
 All rights reserved.\n";
#endif /* not lint */

#ifndef lint
static char sccsid[] = "@(#)strfile.c	5.5 (Berkeley) 09/05/89";
#endif /* not lint */

# include	<sys/param.h>
# include	<sys/types.h>
# include	<ctype.h>
# include	<stdio.h>
# include	<stdlib.h>
# include	<string.h>
# include	<unistd.h>
# include	"strfile.h"

# ifndef MAXPATHLEN
# define	MAXPATHLEN	1024
# endif	/* MAXPATHLEN */

/*
 *	This program takes a file composed of strings separated by
 * lines consisting of a single delimiting character (default
 * character is '%') and creates another file which consists of a table
 * describing the file (structure from "strfile.h"), a table of seek
 * pointers to the start of the strings, and the strings, each terminated
 * by a null byte.  Usage:
 *
 *	% strfile [-iors] [-c char] sourcefile [datafile]
 *
 *	c - Change delimiting character from '%' to the given char
 *	s - Silent.  Give no summary of data processed at the end of
 *	    the run.
 *	v - Verbose.  Give summary of data processed.  (Default)
 *	o - Order the strings in alphabetic order
 *	i - If ordering, ignore case
 *	r - Randomize the order of the strings
 *
 * NOTE(review): the option parser in this file accepts only c, i, o, r
 * and s; there is no 'v' option letter, verbose output is simply what
 * happens when -s is absent.
 * NOTE(review): the code in this file writes only the header table and
 * the seek pointers to the data file; it does not appear to copy the
 * strings themselves -- confirm against the reader of the data file.
 *
 *		Ken Arnold	Sept. 7, 1978 --
 *
 *	Added ordering options.
 */

# define	TRUE	1
# define	FALSE	0

/* true when the seek pointers must be kept in memory to be reordered */
# define	STORING_PTRS	(Oflag || Rflag)
/* number of elements by which ALLOC() grows an array */
# define	CHUNKSIZE	512

/*
 * ALWAYS is a constant-true condition; under lint it is spelled as a
 * function call so that lint does not complain about a constant in a
 * conditional context.
 */
#ifdef lint
# define	ALWAYS	atoi("1")
#else
# define	ALWAYS	1
#endif

/*
 * ALLOC(ptr, sz):
 *	Make sure the array `ptr' has room for `sz' elements.  The first
 *	call (ptr == NULL) allocates CHUNKSIZE elements; afterwards the
 *	array is grown to sz + CHUNKSIZE elements whenever sz + 1 is a
 *	multiple of CHUNKSIZE.  Because growth is triggered only at those
 *	sizes, the macro must be invoked for every successive value of sz.
 *	Prints "out of space" and exits with status 1 if memory runs out.
 *
 *	The trailing `else' swallows the caller's semicolon, so the macro
 *	behaves as a single statement.  Arguments are evaluated more than
 *	once.
 */
# define	ALLOC(ptr,sz)	if (ALWAYS) { \
			if ((ptr) == NULL) \
				(ptr) = malloc(CHUNKSIZE * sizeof *(ptr)); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				(ptr) = realloc((void *) (ptr), \
				    ((size_t) (sz) + CHUNKSIZE) * sizeof *(ptr)); \
			if ((ptr) == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} else

/* compilers without a void type get char instead */
#ifdef NO_VOID
# define	void	char
#endif

/*
 * Sort record for one string, filled in by main() when ordering.
 */
typedef struct {
	char	first;		/* first alphanumeric char of the string */
	off_t	pos;		/* seek pointer of the string in the source */
} STR;

9530270Sbostic char	*Infile		= NULL,		/* input file name */
9639012Sbostic 	Outfile[MAXPATHLEN] = "",	/* output file name */
97*39033Sbostic 	Delimch		= '%';		/* delimiting character */
9830270Sbostic 
9930270Sbostic int	Sflag		= FALSE;	/* silent run flag */
10030270Sbostic int	Oflag		= FALSE;	/* ordering flag */
10130270Sbostic int	Iflag		= FALSE;	/* ignore case flag */
10230270Sbostic int	Rflag		= FALSE;	/* randomize order flag */
10339012Sbostic int	Num_pts		= 0;		/* number of pointers/strings */
10430270Sbostic 
10539012Sbostic off_t	*Seekpts;
10630270Sbostic 
10730270Sbostic FILE	*Sort_1, *Sort_2;		/* pointers for sorting */
10830270Sbostic 
10930270Sbostic STRFILE	Tbl;				/* statistics table */
11030270Sbostic 
11130270Sbostic STR	*Firstch;			/* first chars of each string */
11230270Sbostic 
11339012Sbostic char	*fgets(), *strcpy(), *strcat();
11430270Sbostic 
11539012Sbostic void	*malloc(), *realloc();
11630270Sbostic 
11739012Sbostic /*
11839012Sbostic  * main:
11939012Sbostic  *	Drive the sucker.  There are two main modes -- either we store
12039012Sbostic  *	the seek pointers, if the table is to be sorted or randomized,
12139012Sbostic  *	or we write the pointer directly to the file, if we are to stay
12239012Sbostic  *	in file order.  If the former, we allocate and re-allocate in
12339012Sbostic  *	CHUNKSIZE blocks; if the latter, we just write each pointer,
12439012Sbostic  *	and then seek back to the beginning to write in the table.
12539012Sbostic  */
12630270Sbostic main(ac, av)
12730270Sbostic int	ac;
12830270Sbostic char	**av;
12930270Sbostic {
13030270Sbostic 	register char		*sp, dc;
13130270Sbostic 	register FILE		*inf, *outf;
13239012Sbostic 	register off_t		last_off, length, pos;
13330270Sbostic 	register int		first;
13430270Sbostic 	register char		*nsp;
13530270Sbostic 	register STR		*fp;
13630270Sbostic 	static char		string[257];
13730270Sbostic 
13830270Sbostic 	getargs(ac, av);		/* evalute arguments */
13930270Sbostic 	dc = Delimch;
14030270Sbostic 	if ((inf = fopen(Infile, "r")) == NULL) {
14130270Sbostic 		perror(Infile);
14230270Sbostic 		exit(-1);
14330270Sbostic 	}
14430270Sbostic 
14530270Sbostic 	if ((outf = fopen(Outfile, "w")) == NULL) {
14630270Sbostic 		perror(Outfile);
14730270Sbostic 		exit(-1);
14830270Sbostic 	}
14939012Sbostic 	if (!STORING_PTRS)
15039012Sbostic 		(void) fseek(outf, sizeof Tbl, 0);
15130270Sbostic 
15230270Sbostic 	/*
15339012Sbostic 	 * Write the strings onto the file
15430270Sbostic 	 */
15530270Sbostic 
15630270Sbostic 	Tbl.str_longlen = 0;
15730270Sbostic 	Tbl.str_shortlen = (unsigned int) 0xffffffff;
15839012Sbostic 	Tbl.str_delim = dc;
15930270Sbostic 	first = Oflag;
16039012Sbostic 	add_offset(outf, ftell(inf));
16139012Sbostic 	last_off = 0;
16230270Sbostic 	do {
16339012Sbostic 		if (Num_pts > 508)
16439012Sbostic 			atoi("1");
16530270Sbostic 		sp = fgets(string, 256, inf);
166*39033Sbostic 		if (sp == NULL || sp[0] == dc && sp[1] == '\n') {
16739012Sbostic 			pos = ftell(inf);
16839012Sbostic 			length = pos - last_off - strlen(sp);
16939012Sbostic 			last_off = pos;
170*39033Sbostic 			if (!length)
171*39033Sbostic 				continue;
172*39033Sbostic 			add_offset(outf, pos);
17339012Sbostic 			if (Tbl.str_longlen < length)
17439012Sbostic 				Tbl.str_longlen = length;
17539012Sbostic 			if (Tbl.str_shortlen > length)
17639012Sbostic 				Tbl.str_shortlen = length;
17730270Sbostic 			first = Oflag;
17830270Sbostic 		}
179*39033Sbostic 		else if (first) {
180*39033Sbostic 			for (nsp = sp; !isalnum(*nsp); nsp++)
181*39033Sbostic 				continue;
182*39033Sbostic 			ALLOC(Firstch, Num_pts);
183*39033Sbostic 			fp = &Firstch[Num_pts - 1];
184*39033Sbostic 			if (Iflag && isupper(*nsp))
185*39033Sbostic 				fp->first = tolower(*nsp);
186*39033Sbostic 			else
187*39033Sbostic 				fp->first = *nsp;
188*39033Sbostic 			fp->pos = Seekpts[Num_pts - 1];
189*39033Sbostic 			first = FALSE;
19030270Sbostic 		}
19130270Sbostic 	} while (sp != NULL);
19230270Sbostic 
19330270Sbostic 	/*
19430270Sbostic 	 * write the tables in
19530270Sbostic 	 */
19630270Sbostic 
19730270Sbostic 	(void) fclose(inf);
19839012Sbostic 	Tbl.str_numstr = Num_pts - 1;
19930270Sbostic 
20030270Sbostic 	if (Oflag)
20139012Sbostic 		do_order();
20230270Sbostic 	else if (Rflag)
20339012Sbostic 		randomize();
20430270Sbostic 
20539012Sbostic 	(void) fseek(outf, (off_t) 0, 0);
20630270Sbostic 	(void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
20739012Sbostic 	if (STORING_PTRS)
20839012Sbostic 		(void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
20930270Sbostic 	(void) fclose(outf);
21030270Sbostic 
21130270Sbostic 	if (!Sflag) {
21239012Sbostic 		printf("\"%s\" created\n", Outfile);
21339012Sbostic 		if (Num_pts == 2)
21430270Sbostic 			puts("There was 1 string");
21530270Sbostic 		else
21639021Sbostic 			printf("There were %d strings\n", Num_pts - 1);
21739021Sbostic 		printf("Longest string: %lu byte%s\n", Tbl.str_longlen,
21830270Sbostic 		       Tbl.str_longlen == 1 ? "" : "s");
21939021Sbostic 		printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen,
22030270Sbostic 		       Tbl.str_shortlen == 1 ? "" : "s");
22130270Sbostic 	}
22230270Sbostic 	exit(0);
22339012Sbostic 	/* NOTREACHED */
22430270Sbostic }
22530270Sbostic 
22630270Sbostic /*
22730270Sbostic  *	This routine evaluates arguments from the command line
22830270Sbostic  */
229*39033Sbostic getargs(argc, argv)
230*39033Sbostic int	argc;
231*39033Sbostic char	**argv;
23230270Sbostic {
233*39033Sbostic 	extern char	*optarg;
234*39033Sbostic 	extern int	optind;
235*39033Sbostic 	int	ch;
23630270Sbostic 
237*39033Sbostic 	while ((ch = getopt(argc, argv, "c:iors")) != EOF)
238*39033Sbostic 		switch(ch) {
239*39033Sbostic 		case 'c':			/* new delimiting char */
240*39033Sbostic 			Delimch = *optarg;
241*39033Sbostic 			if (!isascii(Delimch)) {
242*39033Sbostic 				printf("bad delimiting character: '\\%o\n'",
243*39033Sbostic 				       Delimch);
244*39033Sbostic 			}
245*39033Sbostic 			break;
246*39033Sbostic 		case 'i':			/* ignore case in ordering */
247*39033Sbostic 			Iflag++;
248*39033Sbostic 			break;
249*39033Sbostic 		case 'o':			/* order strings */
250*39033Sbostic 			Oflag++;
251*39033Sbostic 			break;
252*39033Sbostic 		case 'r':			/* ignore case in ordering */
253*39033Sbostic 			Rflag++;
254*39033Sbostic 			break;
255*39033Sbostic 		case 's':			/* silent */
256*39033Sbostic 			Sflag++;
257*39033Sbostic 			break;
258*39033Sbostic 		case '?':
259*39033Sbostic 		default:
260*39033Sbostic 			usage();
26130270Sbostic 		}
262*39033Sbostic 	argv += optind;
263*39033Sbostic 
264*39033Sbostic 	if (*argv) {
265*39033Sbostic 		Infile = *argv;
266*39033Sbostic 		if (*++argv)
267*39033Sbostic 			(void) strcpy(Outfile, *argv);
268*39033Sbostic 	}
26930270Sbostic 	if (!Infile) {
27030270Sbostic 		puts("No input file name");
271*39033Sbostic 		usage();
27230270Sbostic 	}
273*39033Sbostic 	if (*Outfile == '\0') {
27430270Sbostic 		(void) strcpy(Outfile, Infile);
27530270Sbostic 		(void) strcat(Outfile, ".dat");
27630270Sbostic 	}
27730270Sbostic }
27830270Sbostic 
/*
 * usage:
 *	Print the command synopsis on stderr and exit with status 1.
 */
int
usage()
{
	(void) fprintf(stderr,
	    "strfile [-iors] [-c char] sourcefile [datafile]\n");
	exit(1);
	/* NOTREACHED */
}

28630270Sbostic /*
28739012Sbostic  * add_offset:
28839012Sbostic  *	Add an offset to the list, or write it out, as appropriate.
28939012Sbostic  */
29039012Sbostic add_offset(fp, off)
29139012Sbostic FILE	*fp;
29239012Sbostic off_t	off;
29339012Sbostic {
29439012Sbostic 	if (!STORING_PTRS)
29539012Sbostic 		fwrite(&off, 1, sizeof off, fp);
29639012Sbostic 	else {
29739012Sbostic 		ALLOC(Seekpts, Num_pts + 1);
29839012Sbostic 		Seekpts[Num_pts] = off;
29939012Sbostic 	}
30039012Sbostic 	Num_pts++;
30139012Sbostic }
30239012Sbostic 
30339012Sbostic /*
30430270Sbostic  * do_order:
30530270Sbostic  *	Order the strings alphabetically (possibly ignoring case).
30630270Sbostic  */
30739012Sbostic do_order()
30830270Sbostic {
30930270Sbostic 	register int	i;
31039012Sbostic 	register off_t	*lp;
31130270Sbostic 	register STR	*fp;
31230270Sbostic 	extern int	cmp_str();
31330270Sbostic 
31439012Sbostic 	Sort_1 = fopen(Infile, "r");
31539012Sbostic 	Sort_2 = fopen(Infile, "r");
31639012Sbostic 	qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
31730270Sbostic 	i = Tbl.str_numstr;
31839012Sbostic 	lp = Seekpts;
31930270Sbostic 	fp = Firstch;
32030270Sbostic 	while (i--)
32130270Sbostic 		*lp++ = fp++->pos;
32230270Sbostic 	(void) fclose(Sort_1);
32330270Sbostic 	(void) fclose(Sort_2);
32430270Sbostic 	Tbl.str_flags |= STR_ORDERED;
32530270Sbostic }
32630270Sbostic 
/*
 * unctrl:
 *	Return a printable spelling of the character c: the character
 *	itself if it is printable, "^?" for DEL (0177), and otherwise
 *	'^' followed by c + 'A' - 1 (so 001 becomes "^A").  The result
 *	lives in a static buffer that is overwritten by the next call.
 *	Not called from the other routines visible in this file.
 */
char *
unctrl(c)
char c;
{
	static char	buf[3];

	/* <ctype.h> functions need a value in unsigned char range */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else if (c == 0177) {
		buf[0] = '^';
		buf[1] = '?';
		buf[2] = '\0';
	}
	else {
		buf[0] = '^';
		buf[1] = c + 'A' - 1;
		buf[2] = '\0';
	}
	return buf;
}

35230270Sbostic cmp_str(p1, p2)
35330270Sbostic STR	*p1, *p2;
35430270Sbostic {
35530270Sbostic 	register int	c1, c2;
35639012Sbostic 	register int	n1, n2;
35730270Sbostic 
35839012Sbostic # define	SET_N(nf,ch)	(nf = (ch == '\n'))
35939012Sbostic # define	IS_END(ch,nf)	(ch == Delimch && nf)
36039012Sbostic 
36130270Sbostic 	c1 = p1->first;
36230270Sbostic 	c2 = p2->first;
36330270Sbostic 	if (c1 != c2)
36430270Sbostic 		return c1 - c2;
36530270Sbostic 
36630270Sbostic 	(void) fseek(Sort_1, p1->pos, 0);
36730270Sbostic 	(void) fseek(Sort_2, p2->pos, 0);
36830270Sbostic 
36939012Sbostic 	n1 = FALSE;
37039012Sbostic 	n2 = FALSE;
37130270Sbostic 	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
37239012Sbostic 		SET_N(n1, c1);
37330270Sbostic 	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
37439012Sbostic 		SET_N(n2, c2);
37530270Sbostic 
37639012Sbostic 	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
37730270Sbostic 		if (Iflag) {
37830270Sbostic 			if (isupper(c1))
37930270Sbostic 				c1 = tolower(c1);
38030270Sbostic 			if (isupper(c2))
38130270Sbostic 				c2 = tolower(c2);
38230270Sbostic 		}
38330270Sbostic 		if (c1 != c2)
38430270Sbostic 			return c1 - c2;
38539012Sbostic 		if (c1 == '\n' || c2 == '\n')
38639012Sbostic 			atoi("1");
38739012Sbostic 		SET_N(n1, c1);
38839012Sbostic 		SET_N(n2, c2);
38930270Sbostic 		c1 = getc(Sort_1);
39030270Sbostic 		c2 = getc(Sort_2);
39130270Sbostic 	}
39239012Sbostic 	if (IS_END(c1, n1))
39339012Sbostic 		c1 = 0;
39439012Sbostic 	if (IS_END(c2, n2))
39539012Sbostic 		c2 = 0;
39630270Sbostic 	return c1 - c2;
39730270Sbostic }
39830270Sbostic 
39930270Sbostic /*
40030270Sbostic  * randomize:
40130270Sbostic  *	Randomize the order of the string table.  We must be careful
40230270Sbostic  *	not to randomize across delimiter boundaries.  All
40330270Sbostic  *	randomization is done within each block.
40430270Sbostic  */
40539012Sbostic randomize()
40630270Sbostic {
40739012Sbostic 	register int	cnt, i;
40839012Sbostic 	register off_t	tmp;
40939012Sbostic 	register off_t	*sp;
41039012Sbostic 	extern time_t	time();
41130270Sbostic 
41230270Sbostic 	Tbl.str_flags |= STR_RANDOM;
41339012Sbostic 	cnt = Tbl.str_numstr;
41430270Sbostic 
41539012Sbostic 	/*
41639012Sbostic 	 * move things around randomly
41739012Sbostic 	 */
41830270Sbostic 
41939012Sbostic 	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
42039021Sbostic 		i = random() % cnt;
42139012Sbostic 		tmp = sp[0];
42239012Sbostic 		sp[0] = sp[i];
42339012Sbostic 		sp[i] = tmp;
42430270Sbostic 	}
42530270Sbostic }
426