139026Sbostic /* 239026Sbostic * Copyright (c) 1989 The Regents of the University of California. 339026Sbostic * All rights reserved. 439026Sbostic * 539026Sbostic * This code is derived from software contributed to Berkeley by 639026Sbostic * Ken Arnold. 739026Sbostic * 839026Sbostic * Redistribution and use in source and binary forms are permitted 939026Sbostic * provided that the above copyright notice and this paragraph are 1039026Sbostic * duplicated in all such forms and that any documentation, 1139026Sbostic * advertising materials, and other materials related to such 1239026Sbostic * distribution and use acknowledge that the software was developed 1339026Sbostic * by the University of California, Berkeley. The name of the 1439026Sbostic * University may not be used to endorse or promote products derived 1539026Sbostic * from this software without specific prior written permission. 1639026Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1739026Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1839026Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1939026Sbostic */ 2030270Sbostic 2139026Sbostic #ifndef lint 2239026Sbostic char copyright[] = 2339026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 2439026Sbostic All rights reserved.\n"; 2539026Sbostic #endif /* not lint */ 2639026Sbostic 2739026Sbostic #ifndef lint 28*39077Sbostic static char sccsid[] = "@(#)strfile.c 5.10 (Berkeley) 09/07/89"; 2939026Sbostic #endif /* not lint */ 3039026Sbostic 3139021Sbostic # include <sys/param.h> 3239021Sbostic # include <sys/types.h> 3330270Sbostic # include <stdio.h> 3430270Sbostic # include <ctype.h> 3530270Sbostic # include "strfile.h" 3630270Sbostic 3739012Sbostic # ifndef MAXPATHLEN 3839012Sbostic # define MAXPATHLEN 1024 3939012Sbostic # endif /* MAXPATHLEN */ 4039012Sbostic 4130270Sbostic /* 4230270Sbostic * This program takes a file composed of strings seperated by 4330270Sbostic * lines starting with two consecutive delimiting character (default 4430270Sbostic * character is '%') and creates another file which consists of a table 4530270Sbostic * describing the file (structure from "strfile.h"), a table of seek 4639033Sbostic * pointers to the start of the strings, and the strings, each terminated 4730270Sbostic * by a null byte. Usage: 4830270Sbostic * 4939067Sbostic * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 5030270Sbostic * 5130270Sbostic * c - Change delimiting character from '%' to 'C' 5230270Sbostic * s - Silent. Give no summary of data processed at the end of 5330270Sbostic * the run. 5430270Sbostic * o - order the strings in alphabetic order 5530270Sbostic * i - if ordering, ignore case 5630270Sbostic * r - randomize the order of the strings 5739067Sbostic * x - set rotated bit 5830270Sbostic * 5930270Sbostic * Ken Arnold Sept. 7, 1978 -- 6030270Sbostic * 6130270Sbostic * Added ordering options. 6230270Sbostic */ 6330270Sbostic 6430270Sbostic # define TRUE 1 6530270Sbostic # define FALSE 0 6630270Sbostic 6739012Sbostic # define STORING_PTRS (Oflag || Rflag) 6839012Sbostic # define CHUNKSIZE 512 6930270Sbostic 7039012Sbostic #ifdef lint 7139012Sbostic # define ALWAYS atoi("1") 7239012Sbostic #else 7339012Sbostic # define ALWAYS 1 7439012Sbostic #endif 7539012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 7639012Sbostic if (ptr == NULL) \ 7739012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 7839012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 7939012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 8039012Sbostic if (ptr == NULL) { \ 8139012Sbostic fprintf(stderr, "out of space\n"); \ 8239012Sbostic exit(1); \ 8339012Sbostic } \ 8439012Sbostic } else 8539012Sbostic 8639012Sbostic #ifdef NO_VOID 8739012Sbostic # define void char 8839012Sbostic #endif 8939012Sbostic 9030270Sbostic typedef struct { 9130270Sbostic char first; 9239012Sbostic off_t pos; 9330270Sbostic } STR; 9430270Sbostic 9530270Sbostic char *Infile = NULL, /* input file name */ 9639012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 9739033Sbostic Delimch = '%'; /* delimiting character */ 9830270Sbostic 9930270Sbostic int Sflag = FALSE; /* silent run flag */ 10030270Sbostic int Oflag = FALSE; /* ordering flag */ 10130270Sbostic int Iflag = FALSE; /* ignore case flag */ 10230270Sbostic int Rflag = FALSE; /* randomize order flag */ 10339067Sbostic int Xflag = FALSE; /* set rotated bit */ 10439070Sbostic long Num_pts = 0; /* number of pointers/strings */ 10530270Sbostic 10639012Sbostic off_t *Seekpts; 10730270Sbostic 10830270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 10930270Sbostic 11030270Sbostic STRFILE Tbl; /* statistics table */ 11130270Sbostic 11230270Sbostic STR *Firstch; /* first chars of each string */ 11330270Sbostic 11439012Sbostic char *fgets(), *strcpy(), *strcat(); 11530270Sbostic 11639012Sbostic void *malloc(), *realloc(); 11730270Sbostic 11839012Sbostic /* 11939012Sbostic * main: 12039012Sbostic * Drive the sucker. There are two main modes -- either we store 12139012Sbostic * the seek pointers, if the table is to be sorted or randomized, 12239012Sbostic * or we write the pointer directly to the file, if we are to stay 12339012Sbostic * in file order. If the former, we allocate and re-allocate in 12439012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 12539012Sbostic * and then seek back to the beginning to write in the table. 12639012Sbostic */ 12730270Sbostic main(ac, av) 12830270Sbostic int ac; 12930270Sbostic char **av; 13030270Sbostic { 13130270Sbostic register char *sp, dc; 13230270Sbostic register FILE *inf, *outf; 13339070Sbostic register off_t last_off, length, pos, *p; 13439070Sbostic register int first, cnt; 13530270Sbostic register char *nsp; 13630270Sbostic register STR *fp; 13730270Sbostic static char string[257]; 13830270Sbostic 13930270Sbostic getargs(ac, av); /* evalute arguments */ 14030270Sbostic dc = Delimch; 14130270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 14230270Sbostic perror(Infile); 14339037Sbostic exit(1); 14430270Sbostic } 14530270Sbostic 14630270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 14730270Sbostic perror(Outfile); 14839037Sbostic exit(1); 14930270Sbostic } 15039012Sbostic if (!STORING_PTRS) 15139012Sbostic (void) fseek(outf, sizeof Tbl, 0); 15230270Sbostic 15330270Sbostic /* 15439012Sbostic * Write the strings onto the file 15530270Sbostic */ 15630270Sbostic 15730270Sbostic Tbl.str_longlen = 0; 15830270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 15939012Sbostic Tbl.str_delim = dc; 16039070Sbostic Tbl.str_version = VERSION; 16130270Sbostic first = Oflag; 16239012Sbostic add_offset(outf, ftell(inf)); 16339012Sbostic last_off = 0; 16430270Sbostic do { 16530270Sbostic sp = fgets(string, 256, inf); 16639033Sbostic if (sp == NULL || sp[0] == dc && sp[1] == '\n') { 16739012Sbostic pos = ftell(inf); 16839012Sbostic length = pos - last_off - strlen(sp); 16939012Sbostic last_off = pos; 17039033Sbostic if (!length) 17139033Sbostic continue; 17239033Sbostic add_offset(outf, pos); 17339012Sbostic if (Tbl.str_longlen < length) 17439012Sbostic Tbl.str_longlen = length; 17539012Sbostic if (Tbl.str_shortlen > length) 17639012Sbostic Tbl.str_shortlen = length; 17730270Sbostic first = Oflag; 17830270Sbostic } 17939033Sbostic else if (first) { 18039033Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 18139033Sbostic continue; 18239033Sbostic ALLOC(Firstch, Num_pts); 18339033Sbostic fp = &Firstch[Num_pts - 1]; 18439033Sbostic if (Iflag && isupper(*nsp)) 18539033Sbostic fp->first = tolower(*nsp); 18639033Sbostic else 18739033Sbostic fp->first = *nsp; 18839033Sbostic fp->pos = Seekpts[Num_pts - 1]; 18939033Sbostic first = FALSE; 19030270Sbostic } 19130270Sbostic } while (sp != NULL); 19230270Sbostic 19330270Sbostic /* 19430270Sbostic * write the tables in 19530270Sbostic */ 19630270Sbostic 19730270Sbostic (void) fclose(inf); 19830270Sbostic 19930270Sbostic if (Oflag) 20039012Sbostic do_order(); 20130270Sbostic else if (Rflag) 20239012Sbostic randomize(); 20330270Sbostic 20439067Sbostic if (Xflag) 20539067Sbostic Tbl.str_flags |= STR_ROTATED; 20639067Sbostic 20730270Sbostic if (!Sflag) { 20839012Sbostic printf("\"%s\" created\n", Outfile); 20939012Sbostic if (Num_pts == 2) 21030270Sbostic puts("There was 1 string"); 21130270Sbostic else 21239021Sbostic printf("There were %d strings\n", Num_pts - 1); 21339021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 21430270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 21539021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 21630270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 21730270Sbostic } 21839070Sbostic 21939070Sbostic (void) fseek(outf, (off_t) 0, 0); 22039070Sbostic Tbl.str_version = htonl(Tbl.str_version); 22139070Sbostic Tbl.str_numstr = htonl(Num_pts - 1); 22239070Sbostic Tbl.str_longlen = htonl(Tbl.str_longlen); 22339070Sbostic Tbl.str_shortlen = htonl(Tbl.str_shortlen); 22439070Sbostic Tbl.str_flags = htonl(Tbl.str_flags); 22539070Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 22639070Sbostic if (STORING_PTRS) { 22739070Sbostic for (p = Seekpts, cnt = Num_pts; cnt--; ++p) 22839070Sbostic *p = htonl(*p); 22939070Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 23039070Sbostic } 23139070Sbostic (void) fclose(outf); 23230270Sbostic exit(0); 23330270Sbostic } 23430270Sbostic 23530270Sbostic /* 23630270Sbostic * This routine evaluates arguments from the command line 23730270Sbostic */ 23839033Sbostic getargs(argc, argv) 23939033Sbostic int argc; 24039033Sbostic char **argv; 24130270Sbostic { 24239033Sbostic extern char *optarg; 24339033Sbostic extern int optind; 24439033Sbostic int ch; 24530270Sbostic 24639067Sbostic while ((ch = getopt(argc, argv, "c:iorsx")) != EOF) 24739033Sbostic switch(ch) { 24839033Sbostic case 'c': /* new delimiting char */ 24939033Sbostic Delimch = *optarg; 25039033Sbostic if (!isascii(Delimch)) { 25139033Sbostic printf("bad delimiting character: '\\%o\n'", 25239033Sbostic Delimch); 25339033Sbostic } 25439033Sbostic break; 25539033Sbostic case 'i': /* ignore case in ordering */ 25639033Sbostic Iflag++; 25739033Sbostic break; 25839033Sbostic case 'o': /* order strings */ 25939033Sbostic Oflag++; 26039033Sbostic break; 26139067Sbostic case 'r': /* randomize pointers */ 26239033Sbostic Rflag++; 26339033Sbostic break; 26439033Sbostic case 's': /* silent */ 26539033Sbostic Sflag++; 26639033Sbostic break; 26739067Sbostic case 'x': /* set the rotated bit */ 26839067Sbostic Xflag++; 26939067Sbostic break; 27039033Sbostic case '?': 27139033Sbostic default: 27239033Sbostic usage(); 27330270Sbostic } 27439033Sbostic argv += optind; 27539033Sbostic 27639033Sbostic if (*argv) { 27739033Sbostic Infile = *argv; 27839033Sbostic if (*++argv) 27939033Sbostic (void) strcpy(Outfile, *argv); 28039033Sbostic } 28130270Sbostic if (!Infile) { 28230270Sbostic puts("No input file name"); 28339033Sbostic usage(); 28430270Sbostic } 28539033Sbostic if (*Outfile == '\0') { 28630270Sbostic (void) strcpy(Outfile, Infile); 28730270Sbostic (void) strcat(Outfile, ".dat"); 28830270Sbostic } 28930270Sbostic } 29030270Sbostic 29139033Sbostic usage() 29239033Sbostic { 29339033Sbostic (void) fprintf(stderr, 29439067Sbostic "strfile [-iorsx] [-c char] sourcefile [datafile]\n"); 29539033Sbostic exit(1); 29639033Sbostic } 29739033Sbostic 29830270Sbostic /* 29939012Sbostic * add_offset: 30039012Sbostic * Add an offset to the list, or write it out, as appropriate. 30139012Sbostic */ 30239012Sbostic add_offset(fp, off) 30339012Sbostic FILE *fp; 30439012Sbostic off_t off; 30539012Sbostic { 30639070Sbostic off_t net; 30739070Sbostic 30839070Sbostic if (!STORING_PTRS) { 30939070Sbostic net = htonl(off); 31039070Sbostic fwrite(&net, 1, sizeof net, fp); 31139070Sbostic } else { 31239012Sbostic ALLOC(Seekpts, Num_pts + 1); 31339012Sbostic Seekpts[Num_pts] = off; 31439012Sbostic } 31539012Sbostic Num_pts++; 31639012Sbostic } 31739012Sbostic 31839012Sbostic /* 31930270Sbostic * do_order: 32030270Sbostic * Order the strings alphabetically (possibly ignoring case). 32130270Sbostic */ 32239012Sbostic do_order() 32330270Sbostic { 32430270Sbostic register int i; 32539012Sbostic register off_t *lp; 32630270Sbostic register STR *fp; 32730270Sbostic extern int cmp_str(); 32830270Sbostic 32939012Sbostic Sort_1 = fopen(Infile, "r"); 33039012Sbostic Sort_2 = fopen(Infile, "r"); 33139012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 33230270Sbostic i = Tbl.str_numstr; 33339012Sbostic lp = Seekpts; 33430270Sbostic fp = Firstch; 33530270Sbostic while (i--) 33630270Sbostic *lp++ = fp++->pos; 33730270Sbostic (void) fclose(Sort_1); 33830270Sbostic (void) fclose(Sort_2); 33930270Sbostic Tbl.str_flags |= STR_ORDERED; 34030270Sbostic } 34130270Sbostic 34230270Sbostic /* 34330270Sbostic * cmp_str: 34430270Sbostic * Compare two strings in the file 34530270Sbostic */ 34639012Sbostic char * 34739012Sbostic unctrl(c) 34839012Sbostic char c; 34939012Sbostic { 35039012Sbostic static char buf[3]; 35139012Sbostic 35239012Sbostic if (isprint(c)) { 35339012Sbostic buf[0] = c; 35439012Sbostic buf[1] = '\0'; 35539012Sbostic } 35639012Sbostic else if (c == 0177) { 35739012Sbostic buf[0] = '^'; 35839012Sbostic buf[1] = '?'; 35939012Sbostic } 36039012Sbostic else { 36139012Sbostic buf[0] = '^'; 36239012Sbostic buf[1] = c + 'A' - 1; 36339012Sbostic } 36439012Sbostic return buf; 36539012Sbostic } 36639012Sbostic 36730270Sbostic cmp_str(p1, p2) 36830270Sbostic STR *p1, *p2; 36930270Sbostic { 37030270Sbostic register int c1, c2; 37139012Sbostic register int n1, n2; 37230270Sbostic 37339012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 37439012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 37539012Sbostic 37630270Sbostic c1 = p1->first; 37730270Sbostic c2 = p2->first; 37830270Sbostic if (c1 != c2) 37930270Sbostic return c1 - c2; 38030270Sbostic 38130270Sbostic (void) fseek(Sort_1, p1->pos, 0); 38230270Sbostic (void) fseek(Sort_2, p2->pos, 0); 38330270Sbostic 38439012Sbostic n1 = FALSE; 38539012Sbostic n2 = FALSE; 38630270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 38739012Sbostic SET_N(n1, c1); 38830270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 38939012Sbostic SET_N(n2, c2); 39030270Sbostic 39139012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 39230270Sbostic if (Iflag) { 39330270Sbostic if (isupper(c1)) 39430270Sbostic c1 = tolower(c1); 39530270Sbostic if (isupper(c2)) 39630270Sbostic c2 = tolower(c2); 39730270Sbostic } 39830270Sbostic if (c1 != c2) 39930270Sbostic return c1 - c2; 40039012Sbostic SET_N(n1, c1); 40139012Sbostic SET_N(n2, c2); 40230270Sbostic c1 = getc(Sort_1); 40330270Sbostic c2 = getc(Sort_2); 40430270Sbostic } 40539012Sbostic if (IS_END(c1, n1)) 40639012Sbostic c1 = 0; 40739012Sbostic if (IS_END(c2, n2)) 40839012Sbostic c2 = 0; 40930270Sbostic return c1 - c2; 41030270Sbostic } 41130270Sbostic 41230270Sbostic /* 41330270Sbostic * randomize: 41430270Sbostic * Randomize the order of the string table. We must be careful 41530270Sbostic * not to randomize across delimiter boundaries. All 41630270Sbostic * randomization is done within each block. 41730270Sbostic */ 41839012Sbostic randomize() 41930270Sbostic { 42039012Sbostic register int cnt, i; 42139012Sbostic register off_t tmp; 42239012Sbostic register off_t *sp; 42339012Sbostic extern time_t time(); 42430270Sbostic 425*39077Sbostic srandom((int)(time((time_t *) NULL) + getpid())); 426*39077Sbostic 42730270Sbostic Tbl.str_flags |= STR_RANDOM; 42839012Sbostic cnt = Tbl.str_numstr; 42930270Sbostic 43039012Sbostic /* 43139012Sbostic * move things around randomly 43239012Sbostic */ 43330270Sbostic 43439012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 43539021Sbostic i = random() % cnt; 43639012Sbostic tmp = sp[0]; 43739012Sbostic sp[0] = sp[i]; 43839012Sbostic sp[i] = tmp; 43930270Sbostic } 44030270Sbostic } 441