139026Sbostic /* 239026Sbostic * Copyright (c) 1989 The Regents of the University of California. 339026Sbostic * All rights reserved. 439026Sbostic * 539026Sbostic * This code is derived from software contributed to Berkeley by 639026Sbostic * Ken Arnold. 739026Sbostic * 839026Sbostic * Redistribution and use in source and binary forms are permitted 939026Sbostic * provided that the above copyright notice and this paragraph are 1039026Sbostic * duplicated in all such forms and that any documentation, 1139026Sbostic * advertising materials, and other materials related to such 1239026Sbostic * distribution and use acknowledge that the software was developed 1339026Sbostic * by the University of California, Berkeley. The name of the 1439026Sbostic * University may not be used to endorse or promote products derived 1539026Sbostic * from this software without specific prior written permission. 1639026Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1739026Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1839026Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1939026Sbostic */ 2030270Sbostic 2139026Sbostic #ifndef lint 2239026Sbostic char copyright[] = 2339026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 2439026Sbostic All rights reserved.\n"; 2539026Sbostic #endif /* not lint */ 2639026Sbostic 2739026Sbostic #ifndef lint 28*39067Sbostic static char sccsid[] = "@(#)strfile.c 5.8 (Berkeley) 09/06/89"; 2939026Sbostic #endif /* not lint */ 3039026Sbostic 3139021Sbostic # include <sys/param.h> 3239021Sbostic # include <sys/types.h> 3330270Sbostic # include <stdio.h> 3430270Sbostic # include <ctype.h> 3530270Sbostic # include "strfile.h" 3630270Sbostic 3739012Sbostic # ifndef MAXPATHLEN 3839012Sbostic # define MAXPATHLEN 1024 3939012Sbostic # endif /* MAXPATHLEN */ 4039012Sbostic 4130270Sbostic /* 4230270Sbostic * This program takes a file composed of strings seperated by 4330270Sbostic * lines starting with two consecutive delimiting character (default 4430270Sbostic * character is '%') and creates another file which consists of a table 4530270Sbostic * describing the file (structure from "strfile.h"), a table of seek 4639033Sbostic * pointers to the start of the strings, and the strings, each terminated 4730270Sbostic * by a null byte. Usage: 4830270Sbostic * 49*39067Sbostic * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 5030270Sbostic * 5130270Sbostic * c - Change delimiting character from '%' to 'C' 5230270Sbostic * s - Silent. Give no summary of data processed at the end of 5330270Sbostic * the run. 5430270Sbostic * o - order the strings in alphabetic order 5530270Sbostic * i - if ordering, ignore case 5630270Sbostic * r - randomize the order of the strings 57*39067Sbostic * x - set rotated bit 5830270Sbostic * 5930270Sbostic * Ken Arnold Sept. 7, 1978 -- 6030270Sbostic * 6130270Sbostic * Added ordering options. 6230270Sbostic */ 6330270Sbostic 6430270Sbostic # define TRUE 1 6530270Sbostic # define FALSE 0 6630270Sbostic 6739012Sbostic # define STORING_PTRS (Oflag || Rflag) 6839012Sbostic # define CHUNKSIZE 512 6930270Sbostic 7039012Sbostic #ifdef lint 7139012Sbostic # define ALWAYS atoi("1") 7239012Sbostic #else 7339012Sbostic # define ALWAYS 1 7439012Sbostic #endif 7539012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 7639012Sbostic if (ptr == NULL) \ 7739012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 7839012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 7939012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 8039012Sbostic if (ptr == NULL) { \ 8139012Sbostic fprintf(stderr, "out of space\n"); \ 8239012Sbostic exit(1); \ 8339012Sbostic } \ 8439012Sbostic } else 8539012Sbostic 8639012Sbostic #ifdef NO_VOID 8739012Sbostic # define void char 8839012Sbostic #endif 8939012Sbostic 9030270Sbostic typedef struct { 9130270Sbostic char first; 9239012Sbostic off_t pos; 9330270Sbostic } STR; 9430270Sbostic 9530270Sbostic char *Infile = NULL, /* input file name */ 9639012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 9739033Sbostic Delimch = '%'; /* delimiting character */ 9830270Sbostic 9930270Sbostic int Sflag = FALSE; /* silent run flag */ 10030270Sbostic int Oflag = FALSE; /* ordering flag */ 10130270Sbostic int Iflag = FALSE; /* ignore case flag */ 10230270Sbostic int Rflag = FALSE; /* randomize order flag */ 103*39067Sbostic int Xflag = FALSE; /* set rotated bit */ 10439012Sbostic int Num_pts = 0; /* number of pointers/strings */ 10530270Sbostic 10639012Sbostic off_t *Seekpts; 10730270Sbostic 10830270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 10930270Sbostic 11030270Sbostic STRFILE Tbl; /* statistics table */ 11130270Sbostic 11230270Sbostic STR *Firstch; /* first chars of each string */ 11330270Sbostic 11439012Sbostic char *fgets(), *strcpy(), *strcat(); 11530270Sbostic 11639012Sbostic void *malloc(), *realloc(); 11730270Sbostic 11839012Sbostic /* 11939012Sbostic * main: 12039012Sbostic * Drive the sucker. There are two main modes -- either we store 12139012Sbostic * the seek pointers, if the table is to be sorted or randomized, 12239012Sbostic * or we write the pointer directly to the file, if we are to stay 12339012Sbostic * in file order. If the former, we allocate and re-allocate in 12439012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 12539012Sbostic * and then seek back to the beginning to write in the table. 12639012Sbostic */ 12730270Sbostic main(ac, av) 12830270Sbostic int ac; 12930270Sbostic char **av; 13030270Sbostic { 13130270Sbostic register char *sp, dc; 13230270Sbostic register FILE *inf, *outf; 13339012Sbostic register off_t last_off, length, pos; 13430270Sbostic register int first; 13530270Sbostic register char *nsp; 13630270Sbostic register STR *fp; 13730270Sbostic static char string[257]; 13830270Sbostic 13930270Sbostic getargs(ac, av); /* evalute arguments */ 14030270Sbostic dc = Delimch; 14130270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 14230270Sbostic perror(Infile); 14339037Sbostic exit(1); 14430270Sbostic } 14530270Sbostic 14630270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 14730270Sbostic perror(Outfile); 14839037Sbostic exit(1); 14930270Sbostic } 15039012Sbostic if (!STORING_PTRS) 15139012Sbostic (void) fseek(outf, sizeof Tbl, 0); 15230270Sbostic 15330270Sbostic /* 15439012Sbostic * Write the strings onto the file 15530270Sbostic */ 15630270Sbostic 15730270Sbostic Tbl.str_longlen = 0; 15830270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 15939012Sbostic Tbl.str_delim = dc; 16030270Sbostic first = Oflag; 16139012Sbostic add_offset(outf, ftell(inf)); 16239012Sbostic last_off = 0; 16330270Sbostic do { 16430270Sbostic sp = fgets(string, 256, inf); 16539033Sbostic if (sp == NULL || sp[0] == dc && sp[1] == '\n') { 16639012Sbostic pos = ftell(inf); 16739012Sbostic length = pos - last_off - strlen(sp); 16839012Sbostic last_off = pos; 16939033Sbostic if (!length) 17039033Sbostic continue; 17139033Sbostic add_offset(outf, pos); 17239012Sbostic if (Tbl.str_longlen < length) 17339012Sbostic Tbl.str_longlen = length; 17439012Sbostic if (Tbl.str_shortlen > length) 17539012Sbostic Tbl.str_shortlen = length; 17630270Sbostic first = Oflag; 17730270Sbostic } 17839033Sbostic else if (first) { 17939033Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 18039033Sbostic continue; 18139033Sbostic ALLOC(Firstch, Num_pts); 18239033Sbostic fp = &Firstch[Num_pts - 1]; 18339033Sbostic if (Iflag && isupper(*nsp)) 18439033Sbostic fp->first = tolower(*nsp); 18539033Sbostic else 18639033Sbostic fp->first = *nsp; 18739033Sbostic fp->pos = Seekpts[Num_pts - 1]; 18839033Sbostic first = FALSE; 18930270Sbostic } 19030270Sbostic } while (sp != NULL); 19130270Sbostic 19230270Sbostic /* 19330270Sbostic * write the tables in 19430270Sbostic */ 19530270Sbostic 19630270Sbostic (void) fclose(inf); 19739012Sbostic Tbl.str_numstr = Num_pts - 1; 19830270Sbostic 19930270Sbostic if (Oflag) 20039012Sbostic do_order(); 20130270Sbostic else if (Rflag) 20239012Sbostic randomize(); 20330270Sbostic 204*39067Sbostic if (Xflag) 205*39067Sbostic Tbl.str_flags |= STR_ROTATED; 206*39067Sbostic 20739012Sbostic (void) fseek(outf, (off_t) 0, 0); 20830270Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 20939012Sbostic if (STORING_PTRS) 21039012Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 21130270Sbostic (void) fclose(outf); 21230270Sbostic 21330270Sbostic if (!Sflag) { 21439012Sbostic printf("\"%s\" created\n", Outfile); 21539012Sbostic if (Num_pts == 2) 21630270Sbostic puts("There was 1 string"); 21730270Sbostic else 21839021Sbostic printf("There were %d strings\n", Num_pts - 1); 21939021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 22030270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 22139021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 22230270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 22330270Sbostic } 22430270Sbostic exit(0); 22539012Sbostic /* NOTREACHED */ 22630270Sbostic } 22730270Sbostic 22830270Sbostic /* 22930270Sbostic * This routine evaluates arguments from the command line 23030270Sbostic */ 23139033Sbostic getargs(argc, argv) 23239033Sbostic int argc; 23339033Sbostic char **argv; 23430270Sbostic { 23539033Sbostic extern char *optarg; 23639033Sbostic extern int optind; 23739033Sbostic int ch; 23830270Sbostic 239*39067Sbostic while ((ch = getopt(argc, argv, "c:iorsx")) != EOF) 24039033Sbostic switch(ch) { 24139033Sbostic case 'c': /* new delimiting char */ 24239033Sbostic Delimch = *optarg; 24339033Sbostic if (!isascii(Delimch)) { 24439033Sbostic printf("bad delimiting character: '\\%o\n'", 24539033Sbostic Delimch); 24639033Sbostic } 24739033Sbostic break; 24839033Sbostic case 'i': /* ignore case in ordering */ 24939033Sbostic Iflag++; 25039033Sbostic break; 25139033Sbostic case 'o': /* order strings */ 25239033Sbostic Oflag++; 25339033Sbostic break; 254*39067Sbostic case 'r': /* randomize pointers */ 25539033Sbostic Rflag++; 25639033Sbostic break; 25739033Sbostic case 's': /* silent */ 25839033Sbostic Sflag++; 25939033Sbostic break; 260*39067Sbostic case 'x': /* set the rotated bit */ 261*39067Sbostic Xflag++; 262*39067Sbostic break; 26339033Sbostic case '?': 26439033Sbostic default: 26539033Sbostic usage(); 26630270Sbostic } 26739033Sbostic argv += optind; 26839033Sbostic 26939033Sbostic if (*argv) { 27039033Sbostic Infile = *argv; 27139033Sbostic if (*++argv) 27239033Sbostic (void) strcpy(Outfile, *argv); 27339033Sbostic } 27430270Sbostic if (!Infile) { 27530270Sbostic puts("No input file name"); 27639033Sbostic usage(); 27730270Sbostic } 27839033Sbostic if (*Outfile == '\0') { 27930270Sbostic (void) strcpy(Outfile, Infile); 28030270Sbostic (void) strcat(Outfile, ".dat"); 28130270Sbostic } 28230270Sbostic } 28330270Sbostic 28439033Sbostic usage() 28539033Sbostic { 28639033Sbostic (void) fprintf(stderr, 287*39067Sbostic "strfile [-iorsx] [-c char] sourcefile [datafile]\n"); 28839033Sbostic exit(1); 28939033Sbostic } 29039033Sbostic 29130270Sbostic /* 29239012Sbostic * add_offset: 29339012Sbostic * Add an offset to the list, or write it out, as appropriate. 29439012Sbostic */ 29539012Sbostic add_offset(fp, off) 29639012Sbostic FILE *fp; 29739012Sbostic off_t off; 29839012Sbostic { 29939012Sbostic if (!STORING_PTRS) 30039012Sbostic fwrite(&off, 1, sizeof off, fp); 30139012Sbostic else { 30239012Sbostic ALLOC(Seekpts, Num_pts + 1); 30339012Sbostic Seekpts[Num_pts] = off; 30439012Sbostic } 30539012Sbostic Num_pts++; 30639012Sbostic } 30739012Sbostic 30839012Sbostic /* 30930270Sbostic * do_order: 31030270Sbostic * Order the strings alphabetically (possibly ignoring case). 31130270Sbostic */ 31239012Sbostic do_order() 31330270Sbostic { 31430270Sbostic register int i; 31539012Sbostic register off_t *lp; 31630270Sbostic register STR *fp; 31730270Sbostic extern int cmp_str(); 31830270Sbostic 31939012Sbostic Sort_1 = fopen(Infile, "r"); 32039012Sbostic Sort_2 = fopen(Infile, "r"); 32139012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 32230270Sbostic i = Tbl.str_numstr; 32339012Sbostic lp = Seekpts; 32430270Sbostic fp = Firstch; 32530270Sbostic while (i--) 32630270Sbostic *lp++ = fp++->pos; 32730270Sbostic (void) fclose(Sort_1); 32830270Sbostic (void) fclose(Sort_2); 32930270Sbostic Tbl.str_flags |= STR_ORDERED; 33030270Sbostic } 33130270Sbostic 33230270Sbostic /* 33330270Sbostic * cmp_str: 33430270Sbostic * Compare two strings in the file 33530270Sbostic */ 33639012Sbostic char * 33739012Sbostic unctrl(c) 33839012Sbostic char c; 33939012Sbostic { 34039012Sbostic static char buf[3]; 34139012Sbostic 34239012Sbostic if (isprint(c)) { 34339012Sbostic buf[0] = c; 34439012Sbostic buf[1] = '\0'; 34539012Sbostic } 34639012Sbostic else if (c == 0177) { 34739012Sbostic buf[0] = '^'; 34839012Sbostic buf[1] = '?'; 34939012Sbostic } 35039012Sbostic else { 35139012Sbostic buf[0] = '^'; 35239012Sbostic buf[1] = c + 'A' - 1; 35339012Sbostic } 35439012Sbostic return buf; 35539012Sbostic } 35639012Sbostic 35730270Sbostic cmp_str(p1, p2) 35830270Sbostic STR *p1, *p2; 35930270Sbostic { 36030270Sbostic register int c1, c2; 36139012Sbostic register int n1, n2; 36230270Sbostic 36339012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 36439012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 36539012Sbostic 36630270Sbostic c1 = p1->first; 36730270Sbostic c2 = p2->first; 36830270Sbostic if (c1 != c2) 36930270Sbostic return c1 - c2; 37030270Sbostic 37130270Sbostic (void) fseek(Sort_1, p1->pos, 0); 37230270Sbostic (void) fseek(Sort_2, p2->pos, 0); 37330270Sbostic 37439012Sbostic n1 = FALSE; 37539012Sbostic n2 = FALSE; 37630270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 37739012Sbostic SET_N(n1, c1); 37830270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 37939012Sbostic SET_N(n2, c2); 38030270Sbostic 38139012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 38230270Sbostic if (Iflag) { 38330270Sbostic if (isupper(c1)) 38430270Sbostic c1 = tolower(c1); 38530270Sbostic if (isupper(c2)) 38630270Sbostic c2 = tolower(c2); 38730270Sbostic } 38830270Sbostic if (c1 != c2) 38930270Sbostic return c1 - c2; 39039012Sbostic SET_N(n1, c1); 39139012Sbostic SET_N(n2, c2); 39230270Sbostic c1 = getc(Sort_1); 39330270Sbostic c2 = getc(Sort_2); 39430270Sbostic } 39539012Sbostic if (IS_END(c1, n1)) 39639012Sbostic c1 = 0; 39739012Sbostic if (IS_END(c2, n2)) 39839012Sbostic c2 = 0; 39930270Sbostic return c1 - c2; 40030270Sbostic } 40130270Sbostic 40230270Sbostic /* 40330270Sbostic * randomize: 40430270Sbostic * Randomize the order of the string table. We must be careful 40530270Sbostic * not to randomize across delimiter boundaries. All 40630270Sbostic * randomization is done within each block. 40730270Sbostic */ 40839012Sbostic randomize() 40930270Sbostic { 41039012Sbostic register int cnt, i; 41139012Sbostic register off_t tmp; 41239012Sbostic register off_t *sp; 41339012Sbostic extern time_t time(); 41430270Sbostic 41530270Sbostic Tbl.str_flags |= STR_RANDOM; 41639012Sbostic cnt = Tbl.str_numstr; 41730270Sbostic 41839012Sbostic /* 41939012Sbostic * move things around randomly 42039012Sbostic */ 42130270Sbostic 42239012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 42339021Sbostic i = random() % cnt; 42439012Sbostic tmp = sp[0]; 42539012Sbostic sp[0] = sp[i]; 42639012Sbostic sp[i] = tmp; 42730270Sbostic } 42830270Sbostic } 429