147857Sbostic /*-
2*60797Sbostic * Copyright (c) 1989, 1993
3*60797Sbostic * The Regents of the University of California. All rights reserved.
439026Sbostic *
539026Sbostic * This code is derived from software contributed to Berkeley by
639026Sbostic * Ken Arnold.
739026Sbostic *
847857Sbostic * %sccs.include.redist.c%
939026Sbostic */
1030270Sbostic
1139026Sbostic #ifndef lint
12*60797Sbostic static char copyright[] =
13*60797Sbostic "@(#) Copyright (c) 1989, 1993\n\
14*60797Sbostic The Regents of the University of California. All rights reserved.\n";
1539026Sbostic #endif /* not lint */
1639026Sbostic
1739026Sbostic #ifndef lint
18*60797Sbostic static char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 05/31/93";
1939026Sbostic #endif /* not lint */
2039026Sbostic
# include	<machine/endian.h>
# include	<sys/param.h>
# include	<stdio.h>
# include	<ctype.h>
# include	<stdlib.h>
# include	<string.h>
# include	"strfile.h"
2630270Sbostic
2739012Sbostic # ifndef MAXPATHLEN
2839012Sbostic # define MAXPATHLEN 1024
2939012Sbostic # endif /* MAXPATHLEN */
3039012Sbostic
3130270Sbostic /*
 *	This program takes a file composed of strings separated by
 * lines consisting of a single delimiting character (default
 * character is '%') and creates another file which consists of a table
 * describing the file (structure from "strfile.h"), a table of seek
 * pointers to the start of the strings, and the strings, each terminated
 * by a null byte.  Usage:
3830270Sbostic *
3939067Sbostic * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ]
4030270Sbostic *
4130270Sbostic * c - Change delimiting character from '%' to 'C'
4230270Sbostic * s - Silent. Give no summary of data processed at the end of
4330270Sbostic * the run.
4430270Sbostic * o - order the strings in alphabetic order
4530270Sbostic * i - if ordering, ignore case
4630270Sbostic * r - randomize the order of the strings
4739067Sbostic * x - set rotated bit
4830270Sbostic *
4930270Sbostic * Ken Arnold Sept. 7, 1978 --
5030270Sbostic *
5130270Sbostic * Added ordering options.
5230270Sbostic */
5330270Sbostic
# define	TRUE	1
# define	FALSE	0

/* True when the seek pointers are kept in memory (ordering or randomizing). */
# define	STORING_PTRS	(Oflag || Rflag)
/* Number of elements by which ALLOC'd arrays grow. */
# define	CHUNKSIZE	512

/*
 * ALWAYS is a condition that is always true.  Under lint it is spelled
 * as a function call rather than a literal (presumably to keep lint from
 * complaining about a constant in a conditional context).
 */
#ifdef lint
# define	ALWAYS	atoi("1")
#else
# define	ALWAYS	1
#endif

/*
 * ALLOC:
 *	Make sure the array "ptr" has room for the element about to be
 *	stored.  "sz" is the number of elements that will be in use once
 *	that store has been done.  A NULL "ptr" gets a fresh array of
 *	CHUNKSIZE elements; after that, whenever sz + 1 reaches a multiple
 *	of CHUNKSIZE, the array is grown to sz + CHUNKSIZE elements.
 *	If no memory is available, "out of space" is printed on stderr
 *	and the program exits with status 1.
 *
 *	"ptr" is evaluated several times and is assigned to, so it must
 *	be a side-effect-free lvalue.  The macro expands to one statement
 *	and must be followed by a semicolon.
 */
# define	ALLOC(ptr,sz)	do { \
			if ((ptr) == NULL) \
				(ptr) = malloc((unsigned int) (CHUNKSIZE * sizeof *(ptr))); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				(ptr) = realloc((void *) (ptr), ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *(ptr))); \
			if ((ptr) == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} while (!ALWAYS)

/* Compilers without a void type get char in its place. */
#ifdef	NO_VOID
# define	void	char
#endif
7939012Sbostic
/*
 * STR:
 *	Sort record for one string: the character the string is first
 *	compared on, and the offset of the string in the input file.
 */
typedef struct {
	char	first;		/* leading comparison character */
	off_t	pos;		/* seek pointer of the string */
} STR;
8430270Sbostic
8530270Sbostic char *Infile = NULL, /* input file name */
8639012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */
8739033Sbostic Delimch = '%'; /* delimiting character */
8830270Sbostic
8930270Sbostic int Sflag = FALSE; /* silent run flag */
9030270Sbostic int Oflag = FALSE; /* ordering flag */
9130270Sbostic int Iflag = FALSE; /* ignore case flag */
9230270Sbostic int Rflag = FALSE; /* randomize order flag */
9339067Sbostic int Xflag = FALSE; /* set rotated bit */
9439070Sbostic long Num_pts = 0; /* number of pointers/strings */
9530270Sbostic
9639012Sbostic off_t *Seekpts;
9730270Sbostic
9830270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */
9930270Sbostic
10030270Sbostic STRFILE Tbl; /* statistics table */
10130270Sbostic
10230270Sbostic STR *Firstch; /* first chars of each string */
10330270Sbostic
10439012Sbostic char *fgets(), *strcpy(), *strcat();
10530270Sbostic
10639012Sbostic void *malloc(), *realloc();
10730270Sbostic
10839012Sbostic /*
10939012Sbostic * main:
11039012Sbostic * Drive the sucker. There are two main modes -- either we store
11139012Sbostic * the seek pointers, if the table is to be sorted or randomized,
11239012Sbostic * or we write the pointer directly to the file, if we are to stay
11339012Sbostic * in file order. If the former, we allocate and re-allocate in
11439012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer,
11539012Sbostic * and then seek back to the beginning to write in the table.
11639012Sbostic */
main(ac,av)11730270Sbostic main(ac, av)
11830270Sbostic int ac;
11930270Sbostic char **av;
12030270Sbostic {
12130270Sbostic register char *sp, dc;
12230270Sbostic register FILE *inf, *outf;
12339070Sbostic register off_t last_off, length, pos, *p;
12439070Sbostic register int first, cnt;
12530270Sbostic register char *nsp;
12630270Sbostic register STR *fp;
12730270Sbostic static char string[257];
12830270Sbostic
12930270Sbostic getargs(ac, av); /* evalute arguments */
13030270Sbostic dc = Delimch;
13130270Sbostic if ((inf = fopen(Infile, "r")) == NULL) {
13230270Sbostic perror(Infile);
13339037Sbostic exit(1);
13430270Sbostic }
13530270Sbostic
13630270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) {
13730270Sbostic perror(Outfile);
13839037Sbostic exit(1);
13930270Sbostic }
14039012Sbostic if (!STORING_PTRS)
14139012Sbostic (void) fseek(outf, sizeof Tbl, 0);
14230270Sbostic
14330270Sbostic /*
14439012Sbostic * Write the strings onto the file
14530270Sbostic */
14630270Sbostic
14730270Sbostic Tbl.str_longlen = 0;
14830270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff;
14939012Sbostic Tbl.str_delim = dc;
15039070Sbostic Tbl.str_version = VERSION;
15130270Sbostic first = Oflag;
15239012Sbostic add_offset(outf, ftell(inf));
15339012Sbostic last_off = 0;
15430270Sbostic do {
15530270Sbostic sp = fgets(string, 256, inf);
15639033Sbostic if (sp == NULL || sp[0] == dc && sp[1] == '\n') {
15739012Sbostic pos = ftell(inf);
15839721Sbostic length = pos - last_off - (sp ? strlen(sp) : 0);
15939012Sbostic last_off = pos;
16039033Sbostic if (!length)
16139033Sbostic continue;
16239033Sbostic add_offset(outf, pos);
16339012Sbostic if (Tbl.str_longlen < length)
16439012Sbostic Tbl.str_longlen = length;
16539012Sbostic if (Tbl.str_shortlen > length)
16639012Sbostic Tbl.str_shortlen = length;
16730270Sbostic first = Oflag;
16830270Sbostic }
16939033Sbostic else if (first) {
17039033Sbostic for (nsp = sp; !isalnum(*nsp); nsp++)
17139033Sbostic continue;
17239033Sbostic ALLOC(Firstch, Num_pts);
17339033Sbostic fp = &Firstch[Num_pts - 1];
17439033Sbostic if (Iflag && isupper(*nsp))
17539033Sbostic fp->first = tolower(*nsp);
17639033Sbostic else
17739033Sbostic fp->first = *nsp;
17839033Sbostic fp->pos = Seekpts[Num_pts - 1];
17939033Sbostic first = FALSE;
18030270Sbostic }
18130270Sbostic } while (sp != NULL);
18230270Sbostic
18330270Sbostic /*
18430270Sbostic * write the tables in
18530270Sbostic */
18630270Sbostic
18730270Sbostic (void) fclose(inf);
18830270Sbostic
18930270Sbostic if (Oflag)
19039012Sbostic do_order();
19130270Sbostic else if (Rflag)
19239012Sbostic randomize();
19330270Sbostic
19439067Sbostic if (Xflag)
19539067Sbostic Tbl.str_flags |= STR_ROTATED;
19639067Sbostic
19730270Sbostic if (!Sflag) {
19839012Sbostic printf("\"%s\" created\n", Outfile);
19939012Sbostic if (Num_pts == 2)
20030270Sbostic puts("There was 1 string");
20130270Sbostic else
20239021Sbostic printf("There were %d strings\n", Num_pts - 1);
20339021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen,
20430270Sbostic Tbl.str_longlen == 1 ? "" : "s");
20539021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen,
20630270Sbostic Tbl.str_shortlen == 1 ? "" : "s");
20730270Sbostic }
20839070Sbostic
20939070Sbostic (void) fseek(outf, (off_t) 0, 0);
21039070Sbostic Tbl.str_version = htonl(Tbl.str_version);
21139070Sbostic Tbl.str_numstr = htonl(Num_pts - 1);
21239070Sbostic Tbl.str_longlen = htonl(Tbl.str_longlen);
21339070Sbostic Tbl.str_shortlen = htonl(Tbl.str_shortlen);
21439070Sbostic Tbl.str_flags = htonl(Tbl.str_flags);
21539070Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
21639070Sbostic if (STORING_PTRS) {
21739070Sbostic for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
21839070Sbostic *p = htonl(*p);
21939070Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
22039070Sbostic }
22139070Sbostic (void) fclose(outf);
22230270Sbostic exit(0);
22330270Sbostic }
22430270Sbostic
22530270Sbostic /*
22630270Sbostic * This routine evaluates arguments from the command line
22730270Sbostic */
getargs(argc,argv)22839033Sbostic getargs(argc, argv)
22939033Sbostic int argc;
23039033Sbostic char **argv;
23130270Sbostic {
23239033Sbostic extern char *optarg;
23339033Sbostic extern int optind;
23439033Sbostic int ch;
23530270Sbostic
23639067Sbostic while ((ch = getopt(argc, argv, "c:iorsx")) != EOF)
23739033Sbostic switch(ch) {
23839033Sbostic case 'c': /* new delimiting char */
23939033Sbostic Delimch = *optarg;
24039033Sbostic if (!isascii(Delimch)) {
24139033Sbostic printf("bad delimiting character: '\\%o\n'",
24239033Sbostic Delimch);
24339033Sbostic }
24439033Sbostic break;
24539033Sbostic case 'i': /* ignore case in ordering */
24639033Sbostic Iflag++;
24739033Sbostic break;
24839033Sbostic case 'o': /* order strings */
24939033Sbostic Oflag++;
25039033Sbostic break;
25139067Sbostic case 'r': /* randomize pointers */
25239033Sbostic Rflag++;
25339033Sbostic break;
25439033Sbostic case 's': /* silent */
25539033Sbostic Sflag++;
25639033Sbostic break;
25739067Sbostic case 'x': /* set the rotated bit */
25839067Sbostic Xflag++;
25939067Sbostic break;
26039033Sbostic case '?':
26139033Sbostic default:
26239033Sbostic usage();
26330270Sbostic }
26439033Sbostic argv += optind;
26539033Sbostic
26639033Sbostic if (*argv) {
26739033Sbostic Infile = *argv;
26839033Sbostic if (*++argv)
26939033Sbostic (void) strcpy(Outfile, *argv);
27039033Sbostic }
27130270Sbostic if (!Infile) {
27230270Sbostic puts("No input file name");
27339033Sbostic usage();
27430270Sbostic }
27539033Sbostic if (*Outfile == '\0') {
27630270Sbostic (void) strcpy(Outfile, Infile);
27730270Sbostic (void) strcat(Outfile, ".dat");
27830270Sbostic }
27930270Sbostic }
28030270Sbostic
/*
 * usage:
 *	Print the command synopsis on stderr and exit with status 1.
 */
usage()
{
	static char	synopsis[] =
	    "strfile [-iorsx] [-c char] sourcefile [datafile]\n";

	(void) fputs(synopsis, stderr);
	exit(1);
}
28739033Sbostic
28830270Sbostic /*
28939012Sbostic * add_offset:
29039012Sbostic * Add an offset to the list, or write it out, as appropriate.
29139012Sbostic */
add_offset(fp,off)29239012Sbostic add_offset(fp, off)
29339012Sbostic FILE *fp;
29439012Sbostic off_t off;
29539012Sbostic {
29639070Sbostic off_t net;
29739070Sbostic
29839070Sbostic if (!STORING_PTRS) {
29939070Sbostic net = htonl(off);
30039070Sbostic fwrite(&net, 1, sizeof net, fp);
30139070Sbostic } else {
30239012Sbostic ALLOC(Seekpts, Num_pts + 1);
30339012Sbostic Seekpts[Num_pts] = off;
30439012Sbostic }
30539012Sbostic Num_pts++;
30639012Sbostic }
30739012Sbostic
30839012Sbostic /*
30930270Sbostic * do_order:
31030270Sbostic * Order the strings alphabetically (possibly ignoring case).
31130270Sbostic */
do_order()31239012Sbostic do_order()
31330270Sbostic {
31430270Sbostic register int i;
31539012Sbostic register off_t *lp;
31630270Sbostic register STR *fp;
31730270Sbostic extern int cmp_str();
31830270Sbostic
31939012Sbostic Sort_1 = fopen(Infile, "r");
32039012Sbostic Sort_2 = fopen(Infile, "r");
32139012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
32230270Sbostic i = Tbl.str_numstr;
32339012Sbostic lp = Seekpts;
32430270Sbostic fp = Firstch;
32530270Sbostic while (i--)
32630270Sbostic *lp++ = fp++->pos;
32730270Sbostic (void) fclose(Sort_1);
32830270Sbostic (void) fclose(Sort_2);
32930270Sbostic Tbl.str_flags |= STR_ORDERED;
33030270Sbostic }
33130270Sbostic
/*
 * unctrl:
 *	Return a printable spelling of the character c.  A printing
 *	character is returned as itself, DEL (0177) as "^?", and anything
 *	else as '^' followed by c + 'A' - 1 (so 001 becomes "^A").
 *	The result lives in a static buffer that the next call overwrites.
 */
char *
unctrl(c)
char	c;
{
	static char	buf[3];

	/* ctype routines need an unsigned char value, not a negative char */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else if (c == 0177) {
		buf[0] = '^';
		buf[1] = '?';
	}
	else {
		buf[0] = '^';
		buf[1] = c + 'A' - 1;
	}
	buf[2] = '\0';
	return buf;
}
35639012Sbostic
cmp_str(p1,p2)35730270Sbostic cmp_str(p1, p2)
35830270Sbostic STR *p1, *p2;
35930270Sbostic {
36030270Sbostic register int c1, c2;
36139012Sbostic register int n1, n2;
36230270Sbostic
36339012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n'))
36439012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf)
36539012Sbostic
36630270Sbostic c1 = p1->first;
36730270Sbostic c2 = p2->first;
36830270Sbostic if (c1 != c2)
36930270Sbostic return c1 - c2;
37030270Sbostic
37130270Sbostic (void) fseek(Sort_1, p1->pos, 0);
37230270Sbostic (void) fseek(Sort_2, p2->pos, 0);
37330270Sbostic
37439012Sbostic n1 = FALSE;
37539012Sbostic n2 = FALSE;
37630270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
37739012Sbostic SET_N(n1, c1);
37830270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
37939012Sbostic SET_N(n2, c2);
38030270Sbostic
38139012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
38230270Sbostic if (Iflag) {
38330270Sbostic if (isupper(c1))
38430270Sbostic c1 = tolower(c1);
38530270Sbostic if (isupper(c2))
38630270Sbostic c2 = tolower(c2);
38730270Sbostic }
38830270Sbostic if (c1 != c2)
38930270Sbostic return c1 - c2;
39039012Sbostic SET_N(n1, c1);
39139012Sbostic SET_N(n2, c2);
39230270Sbostic c1 = getc(Sort_1);
39330270Sbostic c2 = getc(Sort_2);
39430270Sbostic }
39539012Sbostic if (IS_END(c1, n1))
39639012Sbostic c1 = 0;
39739012Sbostic if (IS_END(c2, n2))
39839012Sbostic c2 = 0;
39930270Sbostic return c1 - c2;
40030270Sbostic }
40130270Sbostic
40230270Sbostic /*
40330270Sbostic * randomize:
40430270Sbostic * Randomize the order of the string table. We must be careful
40530270Sbostic * not to randomize across delimiter boundaries. All
40630270Sbostic * randomization is done within each block.
40730270Sbostic */
randomize()40839012Sbostic randomize()
40930270Sbostic {
41039012Sbostic register int cnt, i;
41139012Sbostic register off_t tmp;
41239012Sbostic register off_t *sp;
41339012Sbostic extern time_t time();
41430270Sbostic
41539077Sbostic srandom((int)(time((time_t *) NULL) + getpid()));
41639077Sbostic
41730270Sbostic Tbl.str_flags |= STR_RANDOM;
41839012Sbostic cnt = Tbl.str_numstr;
41930270Sbostic
42039012Sbostic /*
42139012Sbostic * move things around randomly
42239012Sbostic */
42330270Sbostic
42439012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) {
42539021Sbostic i = random() % cnt;
42639012Sbostic tmp = sp[0];
42739012Sbostic sp[0] = sp[i];
42839012Sbostic sp[i] = tmp;
42930270Sbostic }
43030270Sbostic }
431