1*47857Sbostic /*- 239026Sbostic * Copyright (c) 1989 The Regents of the University of California. 339026Sbostic * All rights reserved. 439026Sbostic * 539026Sbostic * This code is derived from software contributed to Berkeley by 639026Sbostic * Ken Arnold. 739026Sbostic * 8*47857Sbostic * %sccs.include.redist.c% 939026Sbostic */ 1030270Sbostic 1139026Sbostic #ifndef lint 1239026Sbostic char copyright[] = 1339026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 1439026Sbostic All rights reserved.\n"; 1539026Sbostic #endif /* not lint */ 1639026Sbostic 1739026Sbostic #ifndef lint 18*47857Sbostic static char sccsid[] = "@(#)strfile.c 5.12 (Berkeley) 04/08/91"; 1939026Sbostic #endif /* not lint */ 2039026Sbostic 2139721Sbostic # include <machine/endian.h> 2239021Sbostic # include <sys/param.h> 2330270Sbostic # include <stdio.h> 2430270Sbostic # include <ctype.h> 2530270Sbostic # include "strfile.h" 2630270Sbostic 2739012Sbostic # ifndef MAXPATHLEN 2839012Sbostic # define MAXPATHLEN 1024 2939012Sbostic # endif /* MAXPATHLEN */ 3039012Sbostic 3130270Sbostic /* 3230270Sbostic * This program takes a file composed of strings seperated by 3330270Sbostic * lines starting with two consecutive delimiting character (default 3430270Sbostic * character is '%') and creates another file which consists of a table 3530270Sbostic * describing the file (structure from "strfile.h"), a table of seek 3639033Sbostic * pointers to the start of the strings, and the strings, each terminated 3730270Sbostic * by a null byte. Usage: 3830270Sbostic * 3939067Sbostic * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 4030270Sbostic * 4130270Sbostic * c - Change delimiting character from '%' to 'C' 4230270Sbostic * s - Silent. Give no summary of data processed at the end of 4330270Sbostic * the run. 4430270Sbostic * o - order the strings in alphabetic order 4530270Sbostic * i - if ordering, ignore case 4630270Sbostic * r - randomize the order of the strings 4739067Sbostic * x - set rotated bit 4830270Sbostic * 4930270Sbostic * Ken Arnold Sept. 7, 1978 -- 5030270Sbostic * 5130270Sbostic * Added ordering options. 5230270Sbostic */ 5330270Sbostic 5430270Sbostic # define TRUE 1 5530270Sbostic # define FALSE 0 5630270Sbostic 5739012Sbostic # define STORING_PTRS (Oflag || Rflag) 5839012Sbostic # define CHUNKSIZE 512 5930270Sbostic 6039012Sbostic #ifdef lint 6139012Sbostic # define ALWAYS atoi("1") 6239012Sbostic #else 6339012Sbostic # define ALWAYS 1 6439012Sbostic #endif 6539012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 6639012Sbostic if (ptr == NULL) \ 6739012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 6839012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 6939012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 7039012Sbostic if (ptr == NULL) { \ 7139012Sbostic fprintf(stderr, "out of space\n"); \ 7239012Sbostic exit(1); \ 7339012Sbostic } \ 7439012Sbostic } else 7539012Sbostic 7639012Sbostic #ifdef NO_VOID 7739012Sbostic # define void char 7839012Sbostic #endif 7939012Sbostic 8030270Sbostic typedef struct { 8130270Sbostic char first; 8239012Sbostic off_t pos; 8330270Sbostic } STR; 8430270Sbostic 8530270Sbostic char *Infile = NULL, /* input file name */ 8639012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 8739033Sbostic Delimch = '%'; /* delimiting character */ 8830270Sbostic 8930270Sbostic int Sflag = FALSE; /* silent run flag */ 9030270Sbostic int Oflag = FALSE; /* ordering flag */ 9130270Sbostic int Iflag = FALSE; /* ignore case flag */ 9230270Sbostic int Rflag = FALSE; /* randomize order flag */ 9339067Sbostic int Xflag = FALSE; /* set rotated bit */ 9439070Sbostic long Num_pts = 0; /* number of pointers/strings */ 9530270Sbostic 9639012Sbostic off_t *Seekpts; 9730270Sbostic 9830270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 9930270Sbostic 10030270Sbostic STRFILE Tbl; /* statistics table */ 10130270Sbostic 10230270Sbostic STR *Firstch; /* first chars of each string */ 10330270Sbostic 10439012Sbostic char *fgets(), *strcpy(), *strcat(); 10530270Sbostic 10639012Sbostic void *malloc(), *realloc(); 10730270Sbostic 10839012Sbostic /* 10939012Sbostic * main: 11039012Sbostic * Drive the sucker. There are two main modes -- either we store 11139012Sbostic * the seek pointers, if the table is to be sorted or randomized, 11239012Sbostic * or we write the pointer directly to the file, if we are to stay 11339012Sbostic * in file order. If the former, we allocate and re-allocate in 11439012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 11539012Sbostic * and then seek back to the beginning to write in the table. 11639012Sbostic */ 11730270Sbostic main(ac, av) 11830270Sbostic int ac; 11930270Sbostic char **av; 12030270Sbostic { 12130270Sbostic register char *sp, dc; 12230270Sbostic register FILE *inf, *outf; 12339070Sbostic register off_t last_off, length, pos, *p; 12439070Sbostic register int first, cnt; 12530270Sbostic register char *nsp; 12630270Sbostic register STR *fp; 12730270Sbostic static char string[257]; 12830270Sbostic 12930270Sbostic getargs(ac, av); /* evalute arguments */ 13030270Sbostic dc = Delimch; 13130270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 13230270Sbostic perror(Infile); 13339037Sbostic exit(1); 13430270Sbostic } 13530270Sbostic 13630270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 13730270Sbostic perror(Outfile); 13839037Sbostic exit(1); 13930270Sbostic } 14039012Sbostic if (!STORING_PTRS) 14139012Sbostic (void) fseek(outf, sizeof Tbl, 0); 14230270Sbostic 14330270Sbostic /* 14439012Sbostic * Write the strings onto the file 14530270Sbostic */ 14630270Sbostic 14730270Sbostic Tbl.str_longlen = 0; 14830270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 14939012Sbostic Tbl.str_delim = dc; 15039070Sbostic Tbl.str_version = VERSION; 15130270Sbostic first = Oflag; 15239012Sbostic add_offset(outf, ftell(inf)); 15339012Sbostic last_off = 0; 15430270Sbostic do { 15530270Sbostic sp = fgets(string, 256, inf); 15639033Sbostic if (sp == NULL || sp[0] == dc && sp[1] == '\n') { 15739012Sbostic pos = ftell(inf); 15839721Sbostic length = pos - last_off - (sp ? strlen(sp) : 0); 15939012Sbostic last_off = pos; 16039033Sbostic if (!length) 16139033Sbostic continue; 16239033Sbostic add_offset(outf, pos); 16339012Sbostic if (Tbl.str_longlen < length) 16439012Sbostic Tbl.str_longlen = length; 16539012Sbostic if (Tbl.str_shortlen > length) 16639012Sbostic Tbl.str_shortlen = length; 16730270Sbostic first = Oflag; 16830270Sbostic } 16939033Sbostic else if (first) { 17039033Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 17139033Sbostic continue; 17239033Sbostic ALLOC(Firstch, Num_pts); 17339033Sbostic fp = &Firstch[Num_pts - 1]; 17439033Sbostic if (Iflag && isupper(*nsp)) 17539033Sbostic fp->first = tolower(*nsp); 17639033Sbostic else 17739033Sbostic fp->first = *nsp; 17839033Sbostic fp->pos = Seekpts[Num_pts - 1]; 17939033Sbostic first = FALSE; 18030270Sbostic } 18130270Sbostic } while (sp != NULL); 18230270Sbostic 18330270Sbostic /* 18430270Sbostic * write the tables in 18530270Sbostic */ 18630270Sbostic 18730270Sbostic (void) fclose(inf); 18830270Sbostic 18930270Sbostic if (Oflag) 19039012Sbostic do_order(); 19130270Sbostic else if (Rflag) 19239012Sbostic randomize(); 19330270Sbostic 19439067Sbostic if (Xflag) 19539067Sbostic Tbl.str_flags |= STR_ROTATED; 19639067Sbostic 19730270Sbostic if (!Sflag) { 19839012Sbostic printf("\"%s\" created\n", Outfile); 19939012Sbostic if (Num_pts == 2) 20030270Sbostic puts("There was 1 string"); 20130270Sbostic else 20239021Sbostic printf("There were %d strings\n", Num_pts - 1); 20339021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 20430270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 20539021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 20630270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 20730270Sbostic } 20839070Sbostic 20939070Sbostic (void) fseek(outf, (off_t) 0, 0); 21039070Sbostic Tbl.str_version = htonl(Tbl.str_version); 21139070Sbostic Tbl.str_numstr = htonl(Num_pts - 1); 21239070Sbostic Tbl.str_longlen = htonl(Tbl.str_longlen); 21339070Sbostic Tbl.str_shortlen = htonl(Tbl.str_shortlen); 21439070Sbostic Tbl.str_flags = htonl(Tbl.str_flags); 21539070Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 21639070Sbostic if (STORING_PTRS) { 21739070Sbostic for (p = Seekpts, cnt = Num_pts; cnt--; ++p) 21839070Sbostic *p = htonl(*p); 21939070Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 22039070Sbostic } 22139070Sbostic (void) fclose(outf); 22230270Sbostic exit(0); 22330270Sbostic } 22430270Sbostic 22530270Sbostic /* 22630270Sbostic * This routine evaluates arguments from the command line 22730270Sbostic */ 22839033Sbostic getargs(argc, argv) 22939033Sbostic int argc; 23039033Sbostic char **argv; 23130270Sbostic { 23239033Sbostic extern char *optarg; 23339033Sbostic extern int optind; 23439033Sbostic int ch; 23530270Sbostic 23639067Sbostic while ((ch = getopt(argc, argv, "c:iorsx")) != EOF) 23739033Sbostic switch(ch) { 23839033Sbostic case 'c': /* new delimiting char */ 23939033Sbostic Delimch = *optarg; 24039033Sbostic if (!isascii(Delimch)) { 24139033Sbostic printf("bad delimiting character: '\\%o\n'", 24239033Sbostic Delimch); 24339033Sbostic } 24439033Sbostic break; 24539033Sbostic case 'i': /* ignore case in ordering */ 24639033Sbostic Iflag++; 24739033Sbostic break; 24839033Sbostic case 'o': /* order strings */ 24939033Sbostic Oflag++; 25039033Sbostic break; 25139067Sbostic case 'r': /* randomize pointers */ 25239033Sbostic Rflag++; 25339033Sbostic break; 25439033Sbostic case 's': /* silent */ 25539033Sbostic Sflag++; 25639033Sbostic break; 25739067Sbostic case 'x': /* set the rotated bit */ 25839067Sbostic Xflag++; 25939067Sbostic break; 26039033Sbostic case '?': 26139033Sbostic default: 26239033Sbostic usage(); 26330270Sbostic } 26439033Sbostic argv += optind; 26539033Sbostic 26639033Sbostic if (*argv) { 26739033Sbostic Infile = *argv; 26839033Sbostic if (*++argv) 26939033Sbostic (void) strcpy(Outfile, *argv); 27039033Sbostic } 27130270Sbostic if (!Infile) { 27230270Sbostic puts("No input file name"); 27339033Sbostic usage(); 27430270Sbostic } 27539033Sbostic if (*Outfile == '\0') { 27630270Sbostic (void) strcpy(Outfile, Infile); 27730270Sbostic (void) strcat(Outfile, ".dat"); 27830270Sbostic } 27930270Sbostic } 28030270Sbostic 28139033Sbostic usage() 28239033Sbostic { 28339033Sbostic (void) fprintf(stderr, 28439067Sbostic "strfile [-iorsx] [-c char] sourcefile [datafile]\n"); 28539033Sbostic exit(1); 28639033Sbostic } 28739033Sbostic 28830270Sbostic /* 28939012Sbostic * add_offset: 29039012Sbostic * Add an offset to the list, or write it out, as appropriate. 29139012Sbostic */ 29239012Sbostic add_offset(fp, off) 29339012Sbostic FILE *fp; 29439012Sbostic off_t off; 29539012Sbostic { 29639070Sbostic off_t net; 29739070Sbostic 29839070Sbostic if (!STORING_PTRS) { 29939070Sbostic net = htonl(off); 30039070Sbostic fwrite(&net, 1, sizeof net, fp); 30139070Sbostic } else { 30239012Sbostic ALLOC(Seekpts, Num_pts + 1); 30339012Sbostic Seekpts[Num_pts] = off; 30439012Sbostic } 30539012Sbostic Num_pts++; 30639012Sbostic } 30739012Sbostic 30839012Sbostic /* 30930270Sbostic * do_order: 31030270Sbostic * Order the strings alphabetically (possibly ignoring case). 31130270Sbostic */ 31239012Sbostic do_order() 31330270Sbostic { 31430270Sbostic register int i; 31539012Sbostic register off_t *lp; 31630270Sbostic register STR *fp; 31730270Sbostic extern int cmp_str(); 31830270Sbostic 31939012Sbostic Sort_1 = fopen(Infile, "r"); 32039012Sbostic Sort_2 = fopen(Infile, "r"); 32139012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 32230270Sbostic i = Tbl.str_numstr; 32339012Sbostic lp = Seekpts; 32430270Sbostic fp = Firstch; 32530270Sbostic while (i--) 32630270Sbostic *lp++ = fp++->pos; 32730270Sbostic (void) fclose(Sort_1); 32830270Sbostic (void) fclose(Sort_2); 32930270Sbostic Tbl.str_flags |= STR_ORDERED; 33030270Sbostic } 33130270Sbostic 33230270Sbostic /* 33330270Sbostic * cmp_str: 33430270Sbostic * Compare two strings in the file 33530270Sbostic */ 33639012Sbostic char * 33739012Sbostic unctrl(c) 33839012Sbostic char c; 33939012Sbostic { 34039012Sbostic static char buf[3]; 34139012Sbostic 34239012Sbostic if (isprint(c)) { 34339012Sbostic buf[0] = c; 34439012Sbostic buf[1] = '\0'; 34539012Sbostic } 34639012Sbostic else if (c == 0177) { 34739012Sbostic buf[0] = '^'; 34839012Sbostic buf[1] = '?'; 34939012Sbostic } 35039012Sbostic else { 35139012Sbostic buf[0] = '^'; 35239012Sbostic buf[1] = c + 'A' - 1; 35339012Sbostic } 35439012Sbostic return buf; 35539012Sbostic } 35639012Sbostic 35730270Sbostic cmp_str(p1, p2) 35830270Sbostic STR *p1, *p2; 35930270Sbostic { 36030270Sbostic register int c1, c2; 36139012Sbostic register int n1, n2; 36230270Sbostic 36339012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 36439012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 36539012Sbostic 36630270Sbostic c1 = p1->first; 36730270Sbostic c2 = p2->first; 36830270Sbostic if (c1 != c2) 36930270Sbostic return c1 - c2; 37030270Sbostic 37130270Sbostic (void) fseek(Sort_1, p1->pos, 0); 37230270Sbostic (void) fseek(Sort_2, p2->pos, 0); 37330270Sbostic 37439012Sbostic n1 = FALSE; 37539012Sbostic n2 = FALSE; 37630270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 37739012Sbostic SET_N(n1, c1); 37830270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 37939012Sbostic SET_N(n2, c2); 38030270Sbostic 38139012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 38230270Sbostic if (Iflag) { 38330270Sbostic if (isupper(c1)) 38430270Sbostic c1 = tolower(c1); 38530270Sbostic if (isupper(c2)) 38630270Sbostic c2 = tolower(c2); 38730270Sbostic } 38830270Sbostic if (c1 != c2) 38930270Sbostic return c1 - c2; 39039012Sbostic SET_N(n1, c1); 39139012Sbostic SET_N(n2, c2); 39230270Sbostic c1 = getc(Sort_1); 39330270Sbostic c2 = getc(Sort_2); 39430270Sbostic } 39539012Sbostic if (IS_END(c1, n1)) 39639012Sbostic c1 = 0; 39739012Sbostic if (IS_END(c2, n2)) 39839012Sbostic c2 = 0; 39930270Sbostic return c1 - c2; 40030270Sbostic } 40130270Sbostic 40230270Sbostic /* 40330270Sbostic * randomize: 40430270Sbostic * Randomize the order of the string table. We must be careful 40530270Sbostic * not to randomize across delimiter boundaries. All 40630270Sbostic * randomization is done within each block. 40730270Sbostic */ 40839012Sbostic randomize() 40930270Sbostic { 41039012Sbostic register int cnt, i; 41139012Sbostic register off_t tmp; 41239012Sbostic register off_t *sp; 41339012Sbostic extern time_t time(); 41430270Sbostic 41539077Sbostic srandom((int)(time((time_t *) NULL) + getpid())); 41639077Sbostic 41730270Sbostic Tbl.str_flags |= STR_RANDOM; 41839012Sbostic cnt = Tbl.str_numstr; 41930270Sbostic 42039012Sbostic /* 42139012Sbostic * move things around randomly 42239012Sbostic */ 42330270Sbostic 42439012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 42539021Sbostic i = random() % cnt; 42639012Sbostic tmp = sp[0]; 42739012Sbostic sp[0] = sp[i]; 42839012Sbostic sp[i] = tmp; 42930270Sbostic } 43030270Sbostic } 431