1*39026Sbostic /*
2*39026Sbostic  * Copyright (c) 1989 The Regents of the University of California.
3*39026Sbostic  * All rights reserved.
4*39026Sbostic  *
5*39026Sbostic  * This code is derived from software contributed to Berkeley by
6*39026Sbostic  * Ken Arnold.
7*39026Sbostic  *
8*39026Sbostic  * Redistribution and use in source and binary forms are permitted
9*39026Sbostic  * provided that the above copyright notice and this paragraph are
10*39026Sbostic  * duplicated in all such forms and that any documentation,
11*39026Sbostic  * advertising materials, and other materials related to such
12*39026Sbostic  * distribution and use acknowledge that the software was developed
13*39026Sbostic  * by the University of California, Berkeley.  The name of the
14*39026Sbostic  * University may not be used to endorse or promote products derived
15*39026Sbostic  * from this software without specific prior written permission.
16*39026Sbostic  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17*39026Sbostic  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18*39026Sbostic  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19*39026Sbostic  */
2030270Sbostic 
21*39026Sbostic #ifndef lint
22*39026Sbostic char copyright[] =
23*39026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\
24*39026Sbostic  All rights reserved.\n";
25*39026Sbostic #endif /* not lint */
26*39026Sbostic 
27*39026Sbostic #ifndef lint
28*39026Sbostic static char sccsid[] = "@(#)strfile.c	5.4 (Berkeley) 09/05/89";
29*39026Sbostic #endif /* not lint */
30*39026Sbostic 
3139021Sbostic # include	<sys/param.h>
3239021Sbostic # include	<sys/types.h>
3330270Sbostic # include	<stdio.h>
3430270Sbostic # include	<ctype.h>
3530270Sbostic # include	"strfile.h"
3630270Sbostic 
3739012Sbostic # ifndef MAXPATHLEN
3839012Sbostic # define	MAXPATHLEN	1024
3939012Sbostic # endif	/* MAXPATHLEN */
4039012Sbostic 
4130270Sbostic /*
4230270Sbostic  *	This program takes a file composed of strings separated by
4330270Sbostic  * lines starting with two consecutive delimiting characters (default
4430270Sbostic  * character is '%') and creates another file which consists of a table
4530270Sbostic  * describing the file (structure from "strfile.h"), a table of seek
4630270Sbostic  * pointers to the start of the strings, and the strings, each terminated
4730270Sbostic  * by a null byte.  Usage:
4830270Sbostic  *
4930270Sbostic  *	% strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ]
5030270Sbostic  *
5130270Sbostic  *	- - Give a usage summary useful for jogging the memory
5230270Sbostic  *	c - Change delimiting character from '%' to 'C'
5330270Sbostic  *	s - Silent.  Give no summary of data processed at the end of
5430270Sbostic  *	    the run.
5530270Sbostic  *	v - Verbose.  Give summary of data processed.  (Default)
5630270Sbostic  *	o - order the strings in alphabetic order
5730270Sbostic  *	i - if ordering, ignore case
5830270Sbostic  *	r - randomize the order of the strings
5930270Sbostic  *
6030270Sbostic  *		Ken Arnold	Sept. 7, 1978 --
6130270Sbostic  *
6230270Sbostic  *	Added method to indicate dividers.  A "%-" will cause the address
6330270Sbostic  * to be added to the structure in one of the pointer elements.
6430270Sbostic  *
6530270Sbostic  *		Ken Arnold	Nov., 1984 --
6630270Sbostic  *
6730270Sbostic  *	Added ordering options.
6830270Sbostic  */
6930270Sbostic 
7030270Sbostic # define	TRUE	1
7130270Sbostic # define	FALSE	0
7230270Sbostic 
/* Non-zero when -o or -r was given, i.e. when add_offset() keeps the seek pointers in Seekpts. */
7339012Sbostic # define	STORING_PTRS	(Oflag || Rflag)
/* Number of elements ALLOC allocates at first, and the step by which it grows a table. */
7439012Sbostic # define	CHUNKSIZE	512
7530270Sbostic 
/*
 * ALWAYS is a condition that is always true.  Under lint it is spelled
 * as a function call (presumably so that lint does not flag a constant
 * condition -- confirm).
 */
7639012Sbostic #ifdef lint
7739012Sbostic # define	ALWAYS	atoi("1")
7839012Sbostic #else
7939012Sbostic # define	ALWAYS	1
8039012Sbostic #endif
/*
 * ALLOC(ptr, sz):
 *	Allocate or grow the table `ptr' in CHUNKSIZE steps.  If ptr is
 *	NULL, CHUNKSIZE elements are allocated; otherwise, when sz + 1 is
 *	a multiple of CHUNKSIZE, the table is reallocated to hold
 *	sz + CHUNKSIZE elements.  On failure "out of space" is printed on
 *	stderr and the program exits with status 1.
 *
 *	The table is only grown on a call where (sz + 1) % CHUNKSIZE is 0,
 *	so callers are expected to invoke it with sz increasing by one
 *	each time; a skipped value of sz means a skipped growth step.
 *	Both arguments are evaluated more than once.  The expansion is an
 *	if/else with an empty else arm, so the invocation must be followed
 *	by `;'.
 */
8139012Sbostic # define	ALLOC(ptr,sz)	if (ALWAYS) { \
8239012Sbostic 			if (ptr == NULL) \
8339012Sbostic 				ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \
8439012Sbostic 			else if (((sz) + 1) % CHUNKSIZE == 0) \
8539012Sbostic 				ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \
8639012Sbostic 			if (ptr == NULL) { \
8739012Sbostic 				fprintf(stderr, "out of space\n"); \
8839012Sbostic 				exit(1); \
8939012Sbostic 			} \
9039012Sbostic 		} else
9139012Sbostic 
/* With NO_VOID defined, every later use of "void" in this file is compiled as "char". */
9239012Sbostic #ifdef NO_VOID
9339012Sbostic # define	void	char
9439012Sbostic #endif
9539012Sbostic 
/*
 * Sort key for one string.  main() fills one of these per string when
 * ordering was requested (-o); do_order() sorts them with cmp_str().
 */
9630270Sbostic typedef struct {
9730270Sbostic 	char	first;		/* first alphanumeric char found (lowercased under -i) */
9839012Sbostic 	off_t	pos;		/* seek pointer of the string, copied from Seekpts */
9930270Sbostic } STR;
10030270Sbostic 
/*
 * Command-line state, filled in by getargs().
 */
char	*Infile		= NULL;		/* input file name */
char	Outfile[MAXPATHLEN] = "";	/* output file name */
char	Delimch		= '%';		/* delimiting character */

/*
 * Usage summary, one output line per entry.  The list ends with a NULL
 * pointer: getargs() prints entries until it reaches one, so without
 * the terminator it would read past the end of the array.
 */
char	*Usage[]	= {
       "usage:	strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]",
       "	- - Give this usage summary",
       "	c - Replace delimiting character with 'C'",
       "	s - Silent.  Give no summary",
       "	v - Verbose.  Give summary.  (default)",
       "	o - order strings alphabetically",
       "	i - ignore case in ordering",
       "	r - randomize the order of the strings",
       "	Default \"datafile\" is inputfile.dat",
	NULL
	};
11530270Sbostic 
11630270Sbostic int	Sflag		= FALSE;	/* silent run flag */
11730270Sbostic int	Oflag		= FALSE;	/* ordering flag */
11830270Sbostic int	Iflag		= FALSE;	/* ignore case flag */
11930270Sbostic int	Rflag		= FALSE;	/* randomize order flag */
12039012Sbostic int	Num_pts		= 0;		/* number of pointers/strings */
12130270Sbostic 
12239012Sbostic off_t	*Seekpts;			/* seek pointers; filled by add_offset() only when STORING_PTRS */
12330270Sbostic 
12430270Sbostic FILE	*Sort_1, *Sort_2;		/* two read streams on the input file, used by cmp_str() */
12530270Sbostic 
12630270Sbostic STRFILE	Tbl;				/* statistics table, written at the start of the output file */
12730270Sbostic 
12830270Sbostic STR	*Firstch;			/* sort keys (first char and position) of each string */
12930270Sbostic 
/* Library routines declared by hand, old style (no parameter lists). */
13039012Sbostic char	*fgets(), *strcpy(), *strcat();
13130270Sbostic 
13239012Sbostic void	*malloc(), *realloc();
13330270Sbostic 
13439012Sbostic /*
13539012Sbostic  * main:
13639012Sbostic  *	Drive the sucker.  There are two main modes -- either we store
13739012Sbostic  *	the seek pointers, if the table is to be sorted or randomized,
13839012Sbostic  *	or we write the pointer directly to the file, if we are to stay
13939012Sbostic  *	in file order.  If the former, we allocate and re-allocate in
14039012Sbostic  *	CHUNKSIZE blocks; if the latter, we just write each pointer,
14139012Sbostic  *	and then seek back to the beginning to write in the table.
14239012Sbostic  */
14330270Sbostic main(ac, av)
14430270Sbostic int	ac;
14530270Sbostic char	**av;
14630270Sbostic {
14730270Sbostic 	register char		*sp, dc;
14830270Sbostic 	register FILE		*inf, *outf;
14939012Sbostic 	register off_t		last_off, length, pos;
15030270Sbostic 	register int		first;
15130270Sbostic 	register char		*nsp;
15230270Sbostic 	register STR		*fp;
15330270Sbostic 	static char		string[257];
15430270Sbostic 
15530270Sbostic 	getargs(ac, av);		/* evalute arguments */
15630270Sbostic 	dc = Delimch;
15730270Sbostic 	if ((inf = fopen(Infile, "r")) == NULL) {
15830270Sbostic 		perror(Infile);
15930270Sbostic 		exit(-1);
16030270Sbostic 	}
16130270Sbostic 
16230270Sbostic 	if ((outf = fopen(Outfile, "w")) == NULL) {
16330270Sbostic 		perror(Outfile);
16430270Sbostic 		exit(-1);
16530270Sbostic 	}
16639012Sbostic 	if (!STORING_PTRS)
16739012Sbostic 		(void) fseek(outf, sizeof Tbl, 0);
16830270Sbostic 
16930270Sbostic 	/*
17039012Sbostic 	 * Write the strings onto the file
17130270Sbostic 	 */
17230270Sbostic 
17330270Sbostic 	Tbl.str_longlen = 0;
17430270Sbostic 	Tbl.str_shortlen = (unsigned int) 0xffffffff;
17539012Sbostic 	Tbl.str_delim = dc;
17630270Sbostic 	first = Oflag;
17739012Sbostic 	add_offset(outf, ftell(inf));
17839012Sbostic 	last_off = 0;
17930270Sbostic 	do {
18039012Sbostic 		if (Num_pts > 508)
18139012Sbostic 			atoi("1");
18230270Sbostic 		sp = fgets(string, 256, inf);
18339012Sbostic 		if (sp == NULL || (sp[0] == dc && sp[1] == dc)) {
18439012Sbostic 			pos = ftell(inf);
18539012Sbostic 			add_offset(outf, pos);
18639012Sbostic 			length = pos - last_off - strlen(sp);
18739012Sbostic 			last_off = pos;
18839012Sbostic 			if (Tbl.str_longlen < length)
18939012Sbostic 				Tbl.str_longlen = length;
19039012Sbostic 			if (Tbl.str_shortlen > length)
19139012Sbostic 				Tbl.str_shortlen = length;
19230270Sbostic 			first = Oflag;
19330270Sbostic 		}
19430270Sbostic 		else {
19530270Sbostic 			if (first) {
19630270Sbostic 				for (nsp = sp; !isalnum(*nsp); nsp++)
19730270Sbostic 					continue;
19839012Sbostic 				ALLOC(Firstch, Num_pts);
19939012Sbostic 				fp = &Firstch[Num_pts - 1];
20030270Sbostic 				if (Iflag && isupper(*nsp))
20130270Sbostic 					fp->first = tolower(*nsp);
20230270Sbostic 				else
20330270Sbostic 					fp->first = *nsp;
20439012Sbostic 				fp->pos = Seekpts[Num_pts - 1];
20530270Sbostic 				first = FALSE;
20630270Sbostic 			}
20730270Sbostic 		}
20830270Sbostic 	} while (sp != NULL);
20930270Sbostic 
21030270Sbostic 	/*
21130270Sbostic 	 * write the tables in
21230270Sbostic 	 */
21330270Sbostic 
21430270Sbostic 	(void) fclose(inf);
21539012Sbostic 	Tbl.str_numstr = Num_pts - 1;
21630270Sbostic 
21730270Sbostic 	if (Oflag)
21839012Sbostic 		do_order();
21930270Sbostic 	else if (Rflag)
22039012Sbostic 		randomize();
22130270Sbostic 
22239012Sbostic 	(void) fseek(outf, (off_t) 0, 0);
22330270Sbostic 	(void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
22439012Sbostic 	if (STORING_PTRS)
22539012Sbostic 		(void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
22630270Sbostic 	(void) fclose(outf);
22730270Sbostic 
22830270Sbostic 	if (!Sflag) {
22939012Sbostic 		printf("\"%s\" created\n", Outfile);
23039012Sbostic 		if (Num_pts == 2)
23130270Sbostic 			puts("There was 1 string");
23230270Sbostic 		else
23339021Sbostic 			printf("There were %d strings\n", Num_pts - 1);
23439021Sbostic 		printf("Longest string: %lu byte%s\n", Tbl.str_longlen,
23530270Sbostic 		       Tbl.str_longlen == 1 ? "" : "s");
23639021Sbostic 		printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen,
23730270Sbostic 		       Tbl.str_shortlen == 1 ? "" : "s");
23830270Sbostic 	}
23930270Sbostic 	exit(0);
24039012Sbostic 	/* NOTREACHED */
24130270Sbostic }
24230270Sbostic 
24330270Sbostic /*
24430270Sbostic  *	This routine evaluates arguments from the command line
24530270Sbostic  */
24630270Sbostic getargs(ac, av)
24730270Sbostic register int	ac;
24830270Sbostic register char	**av;
24930270Sbostic {
25030270Sbostic 	register char	*sp;
25130270Sbostic 	register int	i;
25230270Sbostic 	register int	bad, j;
25330270Sbostic 
25430270Sbostic 	bad = 0;
25530270Sbostic 	for (i = 1; i < ac; i++)
25630270Sbostic 		if (*av[i] == '-' && av[i][1]) {
25730270Sbostic 			for (sp = &av[i][1]; *sp; sp++)
25830270Sbostic 				switch (*sp) {
25930270Sbostic 				  case 'c': /* new delimiting char */
26030270Sbostic 					if ((Delimch = *++sp) == '\0') {
26130270Sbostic 						--sp;
26230270Sbostic 						Delimch = *av[++i];
26330270Sbostic 					}
26439012Sbostic 					if (!isascii(Delimch)) {
26530270Sbostic 						printf("bad delimiting character: '\\%o\n'",
26630270Sbostic 						       Delimch);
26730270Sbostic 						bad++;
26830270Sbostic 					}
26930270Sbostic 					break;
27030270Sbostic 				  case 's':	/* silent */
27130270Sbostic 					Sflag++;
27230270Sbostic 					break;
27330270Sbostic 				  case 'v':	/* verbose */
27430270Sbostic 					Sflag = 0;
27530270Sbostic 					break;
27630270Sbostic 				  case 'o':	/* order strings */
27730270Sbostic 					Oflag++;
27830270Sbostic 					break;
27930270Sbostic 				  case 'i':	/* ignore case in ordering */
28030270Sbostic 					Iflag++;
28130270Sbostic 					break;
28230270Sbostic 				  case 'r':	/* ignore case in ordering */
28330270Sbostic 					Rflag++;
28430270Sbostic 					break;
28530270Sbostic 				  default:	/* unknown flag */
28630270Sbostic 					bad++;
28730270Sbostic 					printf("bad flag: '%c'\n", *sp);
28830270Sbostic 					break;
28930270Sbostic 				}
29030270Sbostic 		}
29130270Sbostic 		else if (*av[i] == '-') {
29230270Sbostic 			for (j = 0; Usage[j]; j++)
29330270Sbostic 				puts(Usage[j]);
29430270Sbostic 			exit(0);
29530270Sbostic 		}
29630270Sbostic 		else if (Infile)
29730270Sbostic 			(void) strcpy(Outfile, av[i]);
29830270Sbostic 		else
29930270Sbostic 			Infile = av[i];
30030270Sbostic 	if (!Infile) {
30130270Sbostic 		bad++;
30230270Sbostic 		puts("No input file name");
30330270Sbostic 	}
30430270Sbostic 	if (*Outfile == '\0' && !bad) {
30530270Sbostic 		(void) strcpy(Outfile, Infile);
30630270Sbostic 		(void) strcat(Outfile, ".dat");
30730270Sbostic 	}
30830270Sbostic 	if (bad) {
30930270Sbostic 		puts("use \"strfile -\" to get usage");
31030270Sbostic 		exit(-1);
31130270Sbostic 	}
31230270Sbostic }
31330270Sbostic 
31430270Sbostic /*
31539012Sbostic  * add_offset:
31639012Sbostic  *	Add an offset to the list, or write it out, as appropriate.
31739012Sbostic  */
31839012Sbostic add_offset(fp, off)
31939012Sbostic FILE	*fp;
32039012Sbostic off_t	off;
32139012Sbostic {
32239012Sbostic 	if (!STORING_PTRS)
32339012Sbostic 		fwrite(&off, 1, sizeof off, fp);
32439012Sbostic 	else {
32539012Sbostic 		ALLOC(Seekpts, Num_pts + 1);
32639012Sbostic 		Seekpts[Num_pts] = off;
32739012Sbostic 	}
32839012Sbostic 	Num_pts++;
32939012Sbostic }
33039012Sbostic 
33139012Sbostic /*
33230270Sbostic  * do_order:
33330270Sbostic  *	Order the strings alphabetically (possibly ignoring case).
33430270Sbostic  */
33539012Sbostic do_order()
33630270Sbostic {
33730270Sbostic 	register int	i;
33839012Sbostic 	register off_t	*lp;
33930270Sbostic 	register STR	*fp;
34030270Sbostic 	extern int	cmp_str();
34130270Sbostic 
34239012Sbostic 	Sort_1 = fopen(Infile, "r");
34339012Sbostic 	Sort_2 = fopen(Infile, "r");
34439012Sbostic 	qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
34530270Sbostic 	i = Tbl.str_numstr;
34639012Sbostic 	lp = Seekpts;
34730270Sbostic 	fp = Firstch;
34830270Sbostic 	while (i--)
34930270Sbostic 		*lp++ = fp++->pos;
35030270Sbostic 	(void) fclose(Sort_1);
35130270Sbostic 	(void) fclose(Sort_2);
35230270Sbostic 	Tbl.str_flags |= STR_ORDERED;
35330270Sbostic }
35430270Sbostic 
/*
 * unctrl:
 *	Return a printable representation of the character c: the
 *	character itself if it is printable, "^?" for DEL (0177), and
 *	'^' followed by c + 'A' - 1 otherwise (so 1 gives "^A").  The
 *	result lives in a static buffer that is overwritten by the next
 *	call.  Not referenced by the other routines visible in this file.
 */
char *
unctrl(c)
char c;
{
	static char	buf[3];

	/* ctype functions are only defined for unsigned char values and EOF */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else {
		buf[0] = '^';
		buf[1] = (c == 0177) ? '?' : c + 'A' - 1;
		buf[2] = '\0';
	}
	return buf;
}
37939012Sbostic 
38030270Sbostic cmp_str(p1, p2)
38130270Sbostic STR	*p1, *p2;
38230270Sbostic {
38330270Sbostic 	register int	c1, c2;
38439012Sbostic 	register int	n1, n2;
38530270Sbostic 
38639012Sbostic # define	SET_N(nf,ch)	(nf = (ch == '\n'))
38739012Sbostic # define	IS_END(ch,nf)	(ch == Delimch && nf)
38839012Sbostic 
38930270Sbostic 	c1 = p1->first;
39030270Sbostic 	c2 = p2->first;
39130270Sbostic 	if (c1 != c2)
39230270Sbostic 		return c1 - c2;
39330270Sbostic 
39430270Sbostic 	(void) fseek(Sort_1, p1->pos, 0);
39530270Sbostic 	(void) fseek(Sort_2, p2->pos, 0);
39630270Sbostic 
39739012Sbostic 	n1 = FALSE;
39839012Sbostic 	n2 = FALSE;
39930270Sbostic 	while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
40039012Sbostic 		SET_N(n1, c1);
40130270Sbostic 	while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
40239012Sbostic 		SET_N(n2, c2);
40330270Sbostic 
40439012Sbostic 	while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
40530270Sbostic 		if (Iflag) {
40630270Sbostic 			if (isupper(c1))
40730270Sbostic 				c1 = tolower(c1);
40830270Sbostic 			if (isupper(c2))
40930270Sbostic 				c2 = tolower(c2);
41030270Sbostic 		}
41130270Sbostic 		if (c1 != c2)
41230270Sbostic 			return c1 - c2;
41339012Sbostic 		if (c1 == '\n' || c2 == '\n')
41439012Sbostic 			atoi("1");
41539012Sbostic 		SET_N(n1, c1);
41639012Sbostic 		SET_N(n2, c2);
41730270Sbostic 		c1 = getc(Sort_1);
41830270Sbostic 		c2 = getc(Sort_2);
41930270Sbostic 	}
42039012Sbostic 	if (IS_END(c1, n1))
42139012Sbostic 		c1 = 0;
42239012Sbostic 	if (IS_END(c2, n2))
42339012Sbostic 		c2 = 0;
42430270Sbostic 	return c1 - c2;
42530270Sbostic }
42630270Sbostic 
42730270Sbostic /*
42830270Sbostic  * randomize:
42930270Sbostic  *	Randomize the order of the string table.  We must be careful
43030270Sbostic  *	not to randomize across delimiter boundaries.  All
43130270Sbostic  *	randomization is done within each block.
43230270Sbostic  */
43339012Sbostic randomize()
43430270Sbostic {
43539012Sbostic 	register int	cnt, i;
43639012Sbostic 	register off_t	tmp;
43739012Sbostic 	register off_t	*sp;
43839012Sbostic 	extern time_t	time();
43930270Sbostic 
44030270Sbostic 	Tbl.str_flags |= STR_RANDOM;
44139012Sbostic 	cnt = Tbl.str_numstr;
44230270Sbostic 
44339012Sbostic 	/*
44439012Sbostic 	 * move things around randomly
44539012Sbostic 	 */
44630270Sbostic 
44739012Sbostic 	for (sp = Seekpts; cnt > 0; cnt--, sp++) {
44839021Sbostic 		i = random() % cnt;
44939012Sbostic 		tmp = sp[0];
45039012Sbostic 		sp[0] = sp[i];
45139012Sbostic 		sp[i] = tmp;
45230270Sbostic 	}
45330270Sbostic }
454