1*39012Sbostic /* $Header: strfile.c,v 1.19 89/05/05 16:07:46 arnold Exp $ */
230270Sbostic 
330270Sbostic # include	<stdio.h>
430270Sbostic # include	<ctype.h>
5*39012Sbostic # include	<sys/types.h>
6*39012Sbostic # include	<sys/param.h>
730270Sbostic # include	"strfile.h"
8*39012Sbostic # include	"random.h"
930270Sbostic 
10*39012Sbostic # ifndef MAXPATHLEN
11*39012Sbostic # define	MAXPATHLEN	1024
12*39012Sbostic # endif	/* MAXPATHLEN */
13*39012Sbostic 
1430270Sbostic /*
 *	This program takes a file composed of strings separated by
 * lines starting with two consecutive delimiting characters (default
 * character is '%') and creates another file which consists of a table
 * describing the file (structure from "strfile.h"), a table of seek
 * pointers to the start of the strings, and the strings, each terminated
 * by a null byte.  Usage:
2130270Sbostic  *
2230270Sbostic  *	% strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ]
2330270Sbostic  *
2430270Sbostic  *	- - Give a usage summary useful for jogging the memory
2530270Sbostic  *	c - Change delimiting character from '%' to 'C'
2630270Sbostic  *	s - Silent.  Give no summary of data processed at the end of
2730270Sbostic  *	    the run.
2830270Sbostic  *	v - Verbose.  Give summary of data processed.  (Default)
2930270Sbostic  *	o - order the strings in alphabetic order
3030270Sbostic  *	i - if ordering, ignore case
3130270Sbostic  *	r - randomize the order of the strings
3230270Sbostic  *
3330270Sbostic  *		Ken Arnold	Sept. 7, 1978 --
3430270Sbostic  *
3530270Sbostic  *	Added method to indicate dividers.  A "%-" will cause the address
3630270Sbostic  * to be added to the structure in one of the pointer elements.
3730270Sbostic  *
3830270Sbostic  *		Ken Arnold	Nov., 1984 --
3930270Sbostic  *
4030270Sbostic  *	Added ordering options.
4130270Sbostic  */
4230270Sbostic 
4330270Sbostic # define	TRUE	1
4430270Sbostic # define	FALSE	0
4530270Sbostic 
# define	STORING_PTRS	(Oflag || Rflag)
# define	CHUNKSIZE	512

/*
 * ALWAYS is a constant-true expression that lint cannot see through,
 * so it does not complain about a constant in a conditional context.
 */
#ifdef lint
# define	ALWAYS	atoi("1")
#else
# define	ALWAYS	1
#endif

/*
 * ALLOC(ptr, sz):
 *	Make sure the array "ptr" has room for the element about to be
 *	stored.  The first call (ptr == NULL) allocates CHUNKSIZE
 *	elements; afterwards the array is grown to sz + CHUNKSIZE
 *	elements whenever sz + 1 is a multiple of CHUNKSIZE.  Callers
 *	must therefore invoke it for every successive value of sz, or a
 *	growth step is missed.  Prints "out of space" and exits with
 *	status 1 if the allocation fails.
 *
 *	Written as a single do/while statement so that it needs its
 *	trailing semicolon and nests safely inside an unbraced if/else;
 *	the loop test is !ALWAYS, so the body runs exactly once.
 */
# define	ALLOC(ptr,sz)	do { \
			if ((ptr) == NULL) \
				(ptr) = malloc((unsigned int) (CHUNKSIZE * sizeof *(ptr))); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				(ptr) = realloc((void *) (ptr), ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *(ptr))); \
			if ((ptr) == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} while (!ALWAYS)
64*39012Sbostic 
65*39012Sbostic #ifdef NO_VOID
66*39012Sbostic # define	void	char
67*39012Sbostic #endif
68*39012Sbostic 
6930270Sbostic typedef struct {
7030270Sbostic 	char	first;
71*39012Sbostic 	off_t	pos;
7230270Sbostic } STR;
7330270Sbostic 
char	*Infile		= NULL;		/* input file name */
char	Outfile[MAXPATHLEN] = "";	/* output file name */
char	Delimch		= '%';		/* delimiting character */

/*
 * Usage summary, one line per entry.  The list is terminated by a
 * NULL pointer because getargs() prints entries until it reaches one.
 */
char	*Usage[]	= {
       "usage:	strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]",
       "	- - Give this usage summary",
       "	c - Replace delimiting character with 'C'",
       "	s - Silent.  Give no summary",
       "	v - Verbose.  Give summary.  (default)",
       "	o - order strings alphabetically",
       "	i - ignore case in ordering",
       "	r - randomize the order of the strings",
       "	Default \"datafile\" is inputfile.dat",
	NULL				/* end-of-list sentinel */
	};
8830270Sbostic 
8930270Sbostic int	Sflag		= FALSE;	/* silent run flag */
9030270Sbostic int	Oflag		= FALSE;	/* ordering flag */
9130270Sbostic int	Iflag		= FALSE;	/* ignore case flag */
9230270Sbostic int	Rflag		= FALSE;	/* randomize order flag */
93*39012Sbostic int	Num_pts		= 0;		/* number of pointers/strings */
9430270Sbostic 
95*39012Sbostic off_t	*Seekpts;
9630270Sbostic 
9730270Sbostic FILE	*Sort_1, *Sort_2;		/* pointers for sorting */
9830270Sbostic 
9930270Sbostic STRFILE	Tbl;				/* statistics table */
10030270Sbostic 
10130270Sbostic STR	*Firstch;			/* first chars of each string */
10230270Sbostic 
103*39012Sbostic char	*fgets(), *strcpy(), *strcat();
10430270Sbostic 
105*39012Sbostic void	*malloc(), *realloc();
10630270Sbostic 
107*39012Sbostic /*
108*39012Sbostic  * main:
109*39012Sbostic  *	Drive the sucker.  There are two main modes -- either we store
110*39012Sbostic  *	the seek pointers, if the table is to be sorted or randomized,
111*39012Sbostic  *	or we write the pointer directly to the file, if we are to stay
112*39012Sbostic  *	in file order.  If the former, we allocate and re-allocate in
113*39012Sbostic  *	CHUNKSIZE blocks; if the latter, we just write each pointer,
114*39012Sbostic  *	and then seek back to the beginning to write in the table.
115*39012Sbostic  */
11630270Sbostic main(ac, av)
11730270Sbostic int	ac;
11830270Sbostic char	**av;
11930270Sbostic {
12030270Sbostic 	register char		*sp, dc;
12130270Sbostic 	register FILE		*inf, *outf;
122*39012Sbostic 	register off_t		last_off, length, pos;
12330270Sbostic 	register int		first;
12430270Sbostic 	register char		*nsp;
12530270Sbostic 	register STR		*fp;
12630270Sbostic 	static char		string[257];
12730270Sbostic 
12830270Sbostic 	getargs(ac, av);		/* evalute arguments */
12930270Sbostic 	dc = Delimch;
13030270Sbostic 	if ((inf = fopen(Infile, "r")) == NULL) {
13130270Sbostic 		perror(Infile);
13230270Sbostic 		exit(-1);
13330270Sbostic 	}
13430270Sbostic 
13530270Sbostic 	if ((outf = fopen(Outfile, "w")) == NULL) {
13630270Sbostic 		perror(Outfile);
13730270Sbostic 		exit(-1);
13830270Sbostic 	}
139*39012Sbostic 	if (!STORING_PTRS)
140*39012Sbostic 		(void) fseek(outf, sizeof Tbl, 0);
14130270Sbostic 
14230270Sbostic 	/*
143*39012Sbostic 	 * Write the strings onto the file
14430270Sbostic 	 */
14530270Sbostic 
14630270Sbostic 	Tbl.str_longlen = 0;
14730270Sbostic 	Tbl.str_shortlen = (unsigned int) 0xffffffff;
148*39012Sbostic 	Tbl.str_delim = dc;
14930270Sbostic 	first = Oflag;
150*39012Sbostic 	add_offset(outf, ftell(inf));
151*39012Sbostic 	last_off = 0;
15230270Sbostic 	do {
153*39012Sbostic 		if (Num_pts > 508)
154*39012Sbostic 			atoi("1");
15530270Sbostic 		sp = fgets(string, 256, inf);
156*39012Sbostic 		if (sp == NULL || (sp[0] == dc && sp[1] == dc)) {
157*39012Sbostic 			pos = ftell(inf);
158*39012Sbostic 			add_offset(outf, pos);
159*39012Sbostic 			length = pos - last_off - strlen(sp);
160*39012Sbostic 			last_off = pos;
161*39012Sbostic 			if (Tbl.str_longlen < length)
162*39012Sbostic 				Tbl.str_longlen = length;
163*39012Sbostic 			if (Tbl.str_shortlen > length)
164*39012Sbostic 				Tbl.str_shortlen = length;
16530270Sbostic 			first = Oflag;
16630270Sbostic 		}
16730270Sbostic 		else {
16830270Sbostic 			if (first) {
16930270Sbostic 				for (nsp = sp; !isalnum(*nsp); nsp++)
17030270Sbostic 					continue;
171*39012Sbostic 				ALLOC(Firstch, Num_pts);
172*39012Sbostic 				fp = &Firstch[Num_pts - 1];
17330270Sbostic 				if (Iflag && isupper(*nsp))
17430270Sbostic 					fp->first = tolower(*nsp);
17530270Sbostic 				else
17630270Sbostic 					fp->first = *nsp;
177*39012Sbostic 				fp->pos = Seekpts[Num_pts - 1];
17830270Sbostic 				first = FALSE;
17930270Sbostic 			}
18030270Sbostic 		}
18130270Sbostic 	} while (sp != NULL);
18230270Sbostic 
18330270Sbostic 	/*
18430270Sbostic 	 * write the tables in
18530270Sbostic 	 */
18630270Sbostic 
18730270Sbostic 	(void) fclose(inf);
188*39012Sbostic 	Tbl.str_numstr = Num_pts - 1;
18930270Sbostic 
19030270Sbostic 	if (Oflag)
191*39012Sbostic 		do_order();
19230270Sbostic 	else if (Rflag)
193*39012Sbostic 		randomize();
19430270Sbostic 
195*39012Sbostic 	(void) fseek(outf, (off_t) 0, 0);
19630270Sbostic 	(void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
197*39012Sbostic 	if (STORING_PTRS)
198*39012Sbostic 		(void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
19930270Sbostic 	(void) fclose(outf);
20030270Sbostic 
20130270Sbostic 	if (!Sflag) {
202*39012Sbostic 		printf("\"%s\" created\n", Outfile);
203*39012Sbostic 		if (Num_pts == 2)
20430270Sbostic 			puts("There was 1 string");
20530270Sbostic 		else
206*39012Sbostic 			printf("There were %u strings\n", Num_pts - 1);
20730270Sbostic 		printf("Longest string: %u byte%s\n", Tbl.str_longlen,
20830270Sbostic 		       Tbl.str_longlen == 1 ? "" : "s");
20930270Sbostic 		printf("Shortest string: %u byte%s\n", Tbl.str_shortlen,
21030270Sbostic 		       Tbl.str_shortlen == 1 ? "" : "s");
21130270Sbostic 	}
21230270Sbostic 	exit(0);
213*39012Sbostic 	/* NOTREACHED */
21430270Sbostic }
21530270Sbostic 
21630270Sbostic /*
21730270Sbostic  *	This routine evaluates arguments from the command line
21830270Sbostic  */
21930270Sbostic getargs(ac, av)
22030270Sbostic register int	ac;
22130270Sbostic register char	**av;
22230270Sbostic {
22330270Sbostic 	register char	*sp;
22430270Sbostic 	register int	i;
22530270Sbostic 	register int	bad, j;
22630270Sbostic 
22730270Sbostic 	bad = 0;
22830270Sbostic 	for (i = 1; i < ac; i++)
22930270Sbostic 		if (*av[i] == '-' && av[i][1]) {
23030270Sbostic 			for (sp = &av[i][1]; *sp; sp++)
23130270Sbostic 				switch (*sp) {
23230270Sbostic 				  case 'c': /* new delimiting char */
23330270Sbostic 					if ((Delimch = *++sp) == '\0') {
23430270Sbostic 						--sp;
23530270Sbostic 						Delimch = *av[++i];
23630270Sbostic 					}
237*39012Sbostic 					if (!isascii(Delimch)) {
23830270Sbostic 						printf("bad delimiting character: '\\%o\n'",
23930270Sbostic 						       Delimch);
24030270Sbostic 						bad++;
24130270Sbostic 					}
24230270Sbostic 					break;
24330270Sbostic 				  case 's':	/* silent */
24430270Sbostic 					Sflag++;
24530270Sbostic 					break;
24630270Sbostic 				  case 'v':	/* verbose */
24730270Sbostic 					Sflag = 0;
24830270Sbostic 					break;
24930270Sbostic 				  case 'o':	/* order strings */
25030270Sbostic 					Oflag++;
25130270Sbostic 					break;
25230270Sbostic 				  case 'i':	/* ignore case in ordering */
25330270Sbostic 					Iflag++;
25430270Sbostic 					break;
25530270Sbostic 				  case 'r':	/* ignore case in ordering */
25630270Sbostic 					Rflag++;
25730270Sbostic 					break;
25830270Sbostic 				  default:	/* unknown flag */
25930270Sbostic 					bad++;
26030270Sbostic 					printf("bad flag: '%c'\n", *sp);
26130270Sbostic 					break;
26230270Sbostic 				}
26330270Sbostic 		}
26430270Sbostic 		else if (*av[i] == '-') {
26530270Sbostic 			for (j = 0; Usage[j]; j++)
26630270Sbostic 				puts(Usage[j]);
26730270Sbostic 			exit(0);
26830270Sbostic 		}
26930270Sbostic 		else if (Infile)
27030270Sbostic 			(void) strcpy(Outfile, av[i]);
27130270Sbostic 		else
27230270Sbostic 			Infile = av[i];
27330270Sbostic 	if (!Infile) {
27430270Sbostic 		bad++;
27530270Sbostic 		puts("No input file name");
27630270Sbostic 	}
27730270Sbostic 	if (*Outfile == '\0' && !bad) {
27830270Sbostic 		(void) strcpy(Outfile, Infile);
27930270Sbostic 		(void) strcat(Outfile, ".dat");
28030270Sbostic 	}
28130270Sbostic 	if (bad) {
28230270Sbostic 		puts("use \"strfile -\" to get usage");
28330270Sbostic 		exit(-1);
28430270Sbostic 	}
28530270Sbostic }
28630270Sbostic 
28730270Sbostic /*
288*39012Sbostic  * add_offset:
289*39012Sbostic  *	Add an offset to the list, or write it out, as appropriate.
290*39012Sbostic  */
291*39012Sbostic add_offset(fp, off)
292*39012Sbostic FILE	*fp;
293*39012Sbostic off_t	off;
294*39012Sbostic {
295*39012Sbostic 	if (!STORING_PTRS)
296*39012Sbostic 		fwrite(&off, 1, sizeof off, fp);
297*39012Sbostic 	else {
298*39012Sbostic 		ALLOC(Seekpts, Num_pts + 1);
299*39012Sbostic 		Seekpts[Num_pts] = off;
300*39012Sbostic 	}
301*39012Sbostic 	Num_pts++;
302*39012Sbostic }
303*39012Sbostic 
304*39012Sbostic /*
30530270Sbostic  * do_order:
30630270Sbostic  *	Order the strings alphabetically (possibly ignoring case).
30730270Sbostic  */
308*39012Sbostic do_order()
30930270Sbostic {
31030270Sbostic 	register int	i;
311*39012Sbostic 	register off_t	*lp;
31230270Sbostic 	register STR	*fp;
31330270Sbostic 	extern int	cmp_str();
31430270Sbostic 
315*39012Sbostic 	Sort_1 = fopen(Infile, "r");
316*39012Sbostic 	Sort_2 = fopen(Infile, "r");
317*39012Sbostic 	qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
31830270Sbostic 	i = Tbl.str_numstr;
319*39012Sbostic 	lp = Seekpts;
32030270Sbostic 	fp = Firstch;
32130270Sbostic 	while (i--)
32230270Sbostic 		*lp++ = fp++->pos;
32330270Sbostic 	(void) fclose(Sort_1);
32430270Sbostic 	(void) fclose(Sort_2);
32530270Sbostic 	Tbl.str_flags |= STR_ORDERED;
32630270Sbostic }
32730270Sbostic 
/*
 * unctrl:
 *	Return a printable representation of the character "c": the
 *	character itself if it is printable, "^?" for DEL (0177), and
 *	otherwise '^' followed by c + 'A' - 1.  The result lives in a
 *	static buffer that is overwritten by the next call.
 */
char *
unctrl(c)
char c;
{
	static char	buf[3];

	/*
	 * The <ctype.h> functions are only defined for values that fit
	 * in an unsigned char (or EOF), and a plain char may be negative.
	 */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else if (c == 0177) {
		buf[0] = '^';
		buf[1] = '?';
	}
	else {
		buf[0] = '^';
		buf[1] = c + 'A' - 1;
	}
	buf[2] = '\0';
	return buf;
}
352*39012Sbostic 
35330270Sbostic cmp_str(p1, p2)
35430270Sbostic STR	*p1, *p2;
35530270Sbostic {
35630270Sbostic 	register int	c1, c2;
357*39012Sbostic 	register int	n1, n2;
35830270Sbostic 
359*39012Sbostic # define	SET_N(nf,ch)	(nf = (ch == '\n'))
360*39012Sbostic # define	IS_END(ch,nf)	(ch == Delimch && nf)
361*39012Sbostic 
36230270Sbostic 	c1 = p1->first;
36330270Sbostic 	c2 = p2->first;
36430270Sbostic 	if (c1 != c2)
36530270Sbostic 		return c1 - c2;
36630270Sbostic 
36730270Sbostic 	(void) fseek(Sort_1, p1->pos, 0);
36830270Sbostic 	(void) fseek(Sort_2, p2->pos, 0);
36930270Sbostic 
370*39012Sbostic 	n1 = FALSE;
371*39012Sbostic 	n2 = FALSE;
37230270Sbostic 	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
373*39012Sbostic 		SET_N(n1, c1);
37430270Sbostic 	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
375*39012Sbostic 		SET_N(n2, c2);
37630270Sbostic 
377*39012Sbostic 	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
37830270Sbostic 		if (Iflag) {
37930270Sbostic 			if (isupper(c1))
38030270Sbostic 				c1 = tolower(c1);
38130270Sbostic 			if (isupper(c2))
38230270Sbostic 				c2 = tolower(c2);
38330270Sbostic 		}
38430270Sbostic 		if (c1 != c2)
38530270Sbostic 			return c1 - c2;
386*39012Sbostic 		if (c1 == '\n' || c2 == '\n')
387*39012Sbostic 			atoi("1");
388*39012Sbostic 		SET_N(n1, c1);
389*39012Sbostic 		SET_N(n2, c2);
39030270Sbostic 		c1 = getc(Sort_1);
39130270Sbostic 		c2 = getc(Sort_2);
39230270Sbostic 	}
393*39012Sbostic 	if (IS_END(c1, n1))
394*39012Sbostic 		c1 = 0;
395*39012Sbostic 	if (IS_END(c2, n2))
396*39012Sbostic 		c2 = 0;
39730270Sbostic 	return c1 - c2;
39830270Sbostic }
39930270Sbostic 
40030270Sbostic /*
40130270Sbostic  * randomize:
40230270Sbostic  *	Randomize the order of the string table.  We must be careful
40330270Sbostic  *	not to randomize across delimiter boundaries.  All
40430270Sbostic  *	randomization is done within each block.
40530270Sbostic  */
406*39012Sbostic randomize()
40730270Sbostic {
408*39012Sbostic 	register int	cnt, i;
409*39012Sbostic 	register off_t	tmp;
410*39012Sbostic 	register off_t	*sp;
411*39012Sbostic 	extern time_t	time();
41230270Sbostic 
41330270Sbostic 	Tbl.str_flags |= STR_RANDOM;
414*39012Sbostic 	cnt = Tbl.str_numstr;
41530270Sbostic 
416*39012Sbostic 	/*
417*39012Sbostic 	 * move things around randomly
418*39012Sbostic 	 */
41930270Sbostic 
420*39012Sbostic 	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
421*39012Sbostic 		i = rnd((long) cnt);
422*39012Sbostic 		tmp = sp[0];
423*39012Sbostic 		sp[0] = sp[i];
424*39012Sbostic 		sp[i] = tmp;
42530270Sbostic 	}
42630270Sbostic }
427