139026Sbostic /* 239026Sbostic * Copyright (c) 1989 The Regents of the University of California. 339026Sbostic * All rights reserved. 439026Sbostic * 539026Sbostic * This code is derived from software contributed to Berkeley by 639026Sbostic * Ken Arnold. 739026Sbostic * 839026Sbostic * Redistribution and use in source and binary forms are permitted 939026Sbostic * provided that the above copyright notice and this paragraph are 1039026Sbostic * duplicated in all such forms and that any documentation, 1139026Sbostic * advertising materials, and other materials related to such 1239026Sbostic * distribution and use acknowledge that the software was developed 1339026Sbostic * by the University of California, Berkeley. The name of the 1439026Sbostic * University may not be used to endorse or promote products derived 1539026Sbostic * from this software without specific prior written permission. 1639026Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1739026Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1839026Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1939026Sbostic */ 2030270Sbostic 2139026Sbostic #ifndef lint 2239026Sbostic char copyright[] = 2339026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 2439026Sbostic All rights reserved.\n"; 2539026Sbostic #endif /* not lint */ 2639026Sbostic 2739026Sbostic #ifndef lint 28*39033Sbostic static char sccsid[] = "@(#)strfile.c 5.5 (Berkeley) 09/05/89"; 2939026Sbostic #endif /* not lint */ 3039026Sbostic 3139021Sbostic # include <sys/param.h> 3239021Sbostic # include <sys/types.h> 3330270Sbostic # include <stdio.h> 3430270Sbostic # include <ctype.h> 3530270Sbostic # include "strfile.h" 3630270Sbostic 3739012Sbostic # ifndef MAXPATHLEN 3839012Sbostic # define MAXPATHLEN 1024 3939012Sbostic # endif /* MAXPATHLEN */ 4039012Sbostic 4130270Sbostic /* 4230270Sbostic * This program takes a file composed of strings seperated by 4330270Sbostic * lines starting with two consecutive delimiting character (default 4430270Sbostic * character is '%') and creates another file which consists of a table 4530270Sbostic * describing the file (structure from "strfile.h"), a table of seek 46*39033Sbostic * pointers to the start of the strings, and the strings, each terminated 4730270Sbostic * by a null byte. Usage: 4830270Sbostic * 49*39033Sbostic * % strfile [-iorsv] [ -cC ] sourcefile [ datafile ] 5030270Sbostic * 5130270Sbostic * c - Change delimiting character from '%' to 'C' 5230270Sbostic * s - Silent. Give no summary of data processed at the end of 5330270Sbostic * the run. 5430270Sbostic * v - Verbose. Give summary of data processed. (Default) 5530270Sbostic * o - order the strings in alphabetic order 5630270Sbostic * i - if ordering, ignore case 5730270Sbostic * r - randomize the order of the strings 5830270Sbostic * 5930270Sbostic * Ken Arnold Sept. 7, 1978 -- 6030270Sbostic * 6130270Sbostic * Added ordering options. 6230270Sbostic */ 6330270Sbostic 6430270Sbostic # define TRUE 1 6530270Sbostic # define FALSE 0 6630270Sbostic 6739012Sbostic # define STORING_PTRS (Oflag || Rflag) 6839012Sbostic # define CHUNKSIZE 512 6930270Sbostic 7039012Sbostic #ifdef lint 7139012Sbostic # define ALWAYS atoi("1") 7239012Sbostic #else 7339012Sbostic # define ALWAYS 1 7439012Sbostic #endif 7539012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 7639012Sbostic if (ptr == NULL) \ 7739012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 7839012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 7939012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 8039012Sbostic if (ptr == NULL) { \ 8139012Sbostic fprintf(stderr, "out of space\n"); \ 8239012Sbostic exit(1); \ 8339012Sbostic } \ 8439012Sbostic } else 8539012Sbostic 8639012Sbostic #ifdef NO_VOID 8739012Sbostic # define void char 8839012Sbostic #endif 8939012Sbostic 9030270Sbostic typedef struct { 9130270Sbostic char first; 9239012Sbostic off_t pos; 9330270Sbostic } STR; 9430270Sbostic 9530270Sbostic char *Infile = NULL, /* input file name */ 9639012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 97*39033Sbostic Delimch = '%'; /* delimiting character */ 9830270Sbostic 9930270Sbostic int Sflag = FALSE; /* silent run flag */ 10030270Sbostic int Oflag = FALSE; /* ordering flag */ 10130270Sbostic int Iflag = FALSE; /* ignore case flag */ 10230270Sbostic int Rflag = FALSE; /* randomize order flag */ 10339012Sbostic int Num_pts = 0; /* number of pointers/strings */ 10430270Sbostic 10539012Sbostic off_t *Seekpts; 10630270Sbostic 10730270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 10830270Sbostic 10930270Sbostic STRFILE Tbl; /* statistics table */ 11030270Sbostic 11130270Sbostic STR *Firstch; /* first chars of each string */ 11230270Sbostic 11339012Sbostic char *fgets(), *strcpy(), *strcat(); 11430270Sbostic 11539012Sbostic void *malloc(), *realloc(); 11630270Sbostic 11739012Sbostic /* 11839012Sbostic * main: 11939012Sbostic * Drive the sucker. There are two main modes -- either we store 12039012Sbostic * the seek pointers, if the table is to be sorted or randomized, 12139012Sbostic * or we write the pointer directly to the file, if we are to stay 12239012Sbostic * in file order. If the former, we allocate and re-allocate in 12339012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 12439012Sbostic * and then seek back to the beginning to write in the table. 12539012Sbostic */ 12630270Sbostic main(ac, av) 12730270Sbostic int ac; 12830270Sbostic char **av; 12930270Sbostic { 13030270Sbostic register char *sp, dc; 13130270Sbostic register FILE *inf, *outf; 13239012Sbostic register off_t last_off, length, pos; 13330270Sbostic register int first; 13430270Sbostic register char *nsp; 13530270Sbostic register STR *fp; 13630270Sbostic static char string[257]; 13730270Sbostic 13830270Sbostic getargs(ac, av); /* evalute arguments */ 13930270Sbostic dc = Delimch; 14030270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 14130270Sbostic perror(Infile); 14230270Sbostic exit(-1); 14330270Sbostic } 14430270Sbostic 14530270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 14630270Sbostic perror(Outfile); 14730270Sbostic exit(-1); 14830270Sbostic } 14939012Sbostic if (!STORING_PTRS) 15039012Sbostic (void) fseek(outf, sizeof Tbl, 0); 15130270Sbostic 15230270Sbostic /* 15339012Sbostic * Write the strings onto the file 15430270Sbostic */ 15530270Sbostic 15630270Sbostic Tbl.str_longlen = 0; 15730270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 15839012Sbostic Tbl.str_delim = dc; 15930270Sbostic first = Oflag; 16039012Sbostic add_offset(outf, ftell(inf)); 16139012Sbostic last_off = 0; 16230270Sbostic do { 16339012Sbostic if (Num_pts > 508) 16439012Sbostic atoi("1"); 16530270Sbostic sp = fgets(string, 256, inf); 166*39033Sbostic if (sp == NULL || sp[0] == dc && sp[1] == '\n') { 16739012Sbostic pos = ftell(inf); 16839012Sbostic length = pos - last_off - strlen(sp); 16939012Sbostic last_off = pos; 170*39033Sbostic if (!length) 171*39033Sbostic continue; 172*39033Sbostic add_offset(outf, pos); 17339012Sbostic if (Tbl.str_longlen < length) 17439012Sbostic Tbl.str_longlen = length; 17539012Sbostic if (Tbl.str_shortlen > length) 17639012Sbostic Tbl.str_shortlen = length; 17730270Sbostic first = Oflag; 17830270Sbostic } 179*39033Sbostic else if (first) { 180*39033Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 181*39033Sbostic continue; 182*39033Sbostic ALLOC(Firstch, Num_pts); 183*39033Sbostic fp = &Firstch[Num_pts - 1]; 184*39033Sbostic if (Iflag && isupper(*nsp)) 185*39033Sbostic fp->first = tolower(*nsp); 186*39033Sbostic else 187*39033Sbostic fp->first = *nsp; 188*39033Sbostic fp->pos = Seekpts[Num_pts - 1]; 189*39033Sbostic first = FALSE; 19030270Sbostic } 19130270Sbostic } while (sp != NULL); 19230270Sbostic 19330270Sbostic /* 19430270Sbostic * write the tables in 19530270Sbostic */ 19630270Sbostic 19730270Sbostic (void) fclose(inf); 19839012Sbostic Tbl.str_numstr = Num_pts - 1; 19930270Sbostic 20030270Sbostic if (Oflag) 20139012Sbostic do_order(); 20230270Sbostic else if (Rflag) 20339012Sbostic randomize(); 20430270Sbostic 20539012Sbostic (void) fseek(outf, (off_t) 0, 0); 20630270Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 20739012Sbostic if (STORING_PTRS) 20839012Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 20930270Sbostic (void) fclose(outf); 21030270Sbostic 21130270Sbostic if (!Sflag) { 21239012Sbostic printf("\"%s\" created\n", Outfile); 21339012Sbostic if (Num_pts == 2) 21430270Sbostic puts("There was 1 string"); 21530270Sbostic else 21639021Sbostic printf("There were %d strings\n", Num_pts - 1); 21739021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 21830270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 21939021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 22030270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 22130270Sbostic } 22230270Sbostic exit(0); 22339012Sbostic /* NOTREACHED */ 22430270Sbostic } 22530270Sbostic 22630270Sbostic /* 22730270Sbostic * This routine evaluates arguments from the command line 22830270Sbostic */ 229*39033Sbostic getargs(argc, argv) 230*39033Sbostic int argc; 231*39033Sbostic char **argv; 23230270Sbostic { 233*39033Sbostic extern char *optarg; 234*39033Sbostic extern int optind; 235*39033Sbostic int ch; 23630270Sbostic 237*39033Sbostic while ((ch = getopt(argc, argv, "c:iors")) != EOF) 238*39033Sbostic switch(ch) { 239*39033Sbostic case 'c': /* new delimiting char */ 240*39033Sbostic Delimch = *optarg; 241*39033Sbostic if (!isascii(Delimch)) { 242*39033Sbostic printf("bad delimiting character: '\\%o\n'", 243*39033Sbostic Delimch); 244*39033Sbostic } 245*39033Sbostic break; 246*39033Sbostic case 'i': /* ignore case in ordering */ 247*39033Sbostic Iflag++; 248*39033Sbostic break; 249*39033Sbostic case 'o': /* order strings */ 250*39033Sbostic Oflag++; 251*39033Sbostic break; 252*39033Sbostic case 'r': /* ignore case in ordering */ 253*39033Sbostic Rflag++; 254*39033Sbostic break; 255*39033Sbostic case 's': /* silent */ 256*39033Sbostic Sflag++; 257*39033Sbostic break; 258*39033Sbostic case '?': 259*39033Sbostic default: 260*39033Sbostic usage(); 26130270Sbostic } 262*39033Sbostic argv += optind; 263*39033Sbostic 264*39033Sbostic if (*argv) { 265*39033Sbostic Infile = *argv; 266*39033Sbostic if (*++argv) 267*39033Sbostic (void) strcpy(Outfile, *argv); 268*39033Sbostic } 26930270Sbostic if (!Infile) { 27030270Sbostic puts("No input file name"); 271*39033Sbostic usage(); 27230270Sbostic } 273*39033Sbostic if (*Outfile == '\0') { 27430270Sbostic (void) strcpy(Outfile, Infile); 27530270Sbostic (void) strcat(Outfile, ".dat"); 27630270Sbostic } 27730270Sbostic } 27830270Sbostic 279*39033Sbostic usage() 280*39033Sbostic { 281*39033Sbostic (void) fprintf(stderr, 282*39033Sbostic "strfile [-iors] [-c char] sourcefile [datafile]\n"); 283*39033Sbostic exit(1); 284*39033Sbostic } 285*39033Sbostic 28630270Sbostic /* 28739012Sbostic * add_offset: 28839012Sbostic * Add an offset to the list, or write it out, as appropriate. 28939012Sbostic */ 29039012Sbostic add_offset(fp, off) 29139012Sbostic FILE *fp; 29239012Sbostic off_t off; 29339012Sbostic { 29439012Sbostic if (!STORING_PTRS) 29539012Sbostic fwrite(&off, 1, sizeof off, fp); 29639012Sbostic else { 29739012Sbostic ALLOC(Seekpts, Num_pts + 1); 29839012Sbostic Seekpts[Num_pts] = off; 29939012Sbostic } 30039012Sbostic Num_pts++; 30139012Sbostic } 30239012Sbostic 30339012Sbostic /* 30430270Sbostic * do_order: 30530270Sbostic * Order the strings alphabetically (possibly ignoring case). 30630270Sbostic */ 30739012Sbostic do_order() 30830270Sbostic { 30930270Sbostic register int i; 31039012Sbostic register off_t *lp; 31130270Sbostic register STR *fp; 31230270Sbostic extern int cmp_str(); 31330270Sbostic 31439012Sbostic Sort_1 = fopen(Infile, "r"); 31539012Sbostic Sort_2 = fopen(Infile, "r"); 31639012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 31730270Sbostic i = Tbl.str_numstr; 31839012Sbostic lp = Seekpts; 31930270Sbostic fp = Firstch; 32030270Sbostic while (i--) 32130270Sbostic *lp++ = fp++->pos; 32230270Sbostic (void) fclose(Sort_1); 32330270Sbostic (void) fclose(Sort_2); 32430270Sbostic Tbl.str_flags |= STR_ORDERED; 32530270Sbostic } 32630270Sbostic 32730270Sbostic /* 32830270Sbostic * cmp_str: 32930270Sbostic * Compare two strings in the file 33030270Sbostic */ 33139012Sbostic char * 33239012Sbostic unctrl(c) 33339012Sbostic char c; 33439012Sbostic { 33539012Sbostic static char buf[3]; 33639012Sbostic 33739012Sbostic if (isprint(c)) { 33839012Sbostic buf[0] = c; 33939012Sbostic buf[1] = '\0'; 34039012Sbostic } 34139012Sbostic else if (c == 0177) { 34239012Sbostic buf[0] = '^'; 34339012Sbostic buf[1] = '?'; 34439012Sbostic } 34539012Sbostic else { 34639012Sbostic buf[0] = '^'; 34739012Sbostic buf[1] = c + 'A' - 1; 34839012Sbostic } 34939012Sbostic return buf; 35039012Sbostic } 35139012Sbostic 35230270Sbostic cmp_str(p1, p2) 35330270Sbostic STR *p1, *p2; 35430270Sbostic { 35530270Sbostic register int c1, c2; 35639012Sbostic register int n1, n2; 35730270Sbostic 35839012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 35939012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 36039012Sbostic 36130270Sbostic c1 = p1->first; 36230270Sbostic c2 = p2->first; 36330270Sbostic if (c1 != c2) 36430270Sbostic return c1 - c2; 36530270Sbostic 36630270Sbostic (void) fseek(Sort_1, p1->pos, 0); 36730270Sbostic (void) fseek(Sort_2, p2->pos, 0); 36830270Sbostic 36939012Sbostic n1 = FALSE; 37039012Sbostic n2 = FALSE; 37130270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 37239012Sbostic SET_N(n1, c1); 37330270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 37439012Sbostic SET_N(n2, c2); 37530270Sbostic 37639012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 37730270Sbostic if (Iflag) { 37830270Sbostic if (isupper(c1)) 37930270Sbostic c1 = tolower(c1); 38030270Sbostic if (isupper(c2)) 38130270Sbostic c2 = tolower(c2); 38230270Sbostic } 38330270Sbostic if (c1 != c2) 38430270Sbostic return c1 - c2; 38539012Sbostic if (c1 == '\n' || c2 == '\n') 38639012Sbostic atoi("1"); 38739012Sbostic SET_N(n1, c1); 38839012Sbostic SET_N(n2, c2); 38930270Sbostic c1 = getc(Sort_1); 39030270Sbostic c2 = getc(Sort_2); 39130270Sbostic } 39239012Sbostic if (IS_END(c1, n1)) 39339012Sbostic c1 = 0; 39439012Sbostic if (IS_END(c2, n2)) 39539012Sbostic c2 = 0; 39630270Sbostic return c1 - c2; 39730270Sbostic } 39830270Sbostic 39930270Sbostic /* 40030270Sbostic * randomize: 40130270Sbostic * Randomize the order of the string table. We must be careful 40230270Sbostic * not to randomize across delimiter boundaries. All 40330270Sbostic * randomization is done within each block. 40430270Sbostic */ 40539012Sbostic randomize() 40630270Sbostic { 40739012Sbostic register int cnt, i; 40839012Sbostic register off_t tmp; 40939012Sbostic register off_t *sp; 41039012Sbostic extern time_t time(); 41130270Sbostic 41230270Sbostic Tbl.str_flags |= STR_RANDOM; 41339012Sbostic cnt = Tbl.str_numstr; 41430270Sbostic 41539012Sbostic /* 41639012Sbostic * move things around randomly 41739012Sbostic */ 41830270Sbostic 41939012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 42039021Sbostic i = random() % cnt; 42139012Sbostic tmp = sp[0]; 42239012Sbostic sp[0] = sp[i]; 42339012Sbostic sp[i] = tmp; 42430270Sbostic } 42530270Sbostic } 426