1*39026Sbostic /* 2*39026Sbostic * Copyright (c) 1989 The Regents of the University of California. 3*39026Sbostic * All rights reserved. 4*39026Sbostic * 5*39026Sbostic * This code is derived from software contributed to Berkeley by 6*39026Sbostic * Ken Arnold. 7*39026Sbostic * 8*39026Sbostic * Redistribution and use in source and binary forms are permitted 9*39026Sbostic * provided that the above copyright notice and this paragraph are 10*39026Sbostic * duplicated in all such forms and that any documentation, 11*39026Sbostic * advertising materials, and other materials related to such 12*39026Sbostic * distribution and use acknowledge that the software was developed 13*39026Sbostic * by the University of California, Berkeley. The name of the 14*39026Sbostic * University may not be used to endorse or promote products derived 15*39026Sbostic * from this software without specific prior written permission. 16*39026Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 17*39026Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 18*39026Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 19*39026Sbostic */ 2030270Sbostic 21*39026Sbostic #ifndef lint 22*39026Sbostic char copyright[] = 23*39026Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 24*39026Sbostic All rights reserved.\n"; 25*39026Sbostic #endif /* not lint */ 26*39026Sbostic 27*39026Sbostic #ifndef lint 28*39026Sbostic static char sccsid[] = "@(#)strfile.c 5.4 (Berkeley) 09/05/89"; 29*39026Sbostic #endif /* not lint */ 30*39026Sbostic 3139021Sbostic # include <sys/param.h> 3239021Sbostic # include <sys/types.h> 3330270Sbostic # include <stdio.h> 3430270Sbostic # include <ctype.h> 3530270Sbostic # include "strfile.h" 3630270Sbostic 3739012Sbostic # ifndef MAXPATHLEN 3839012Sbostic # define MAXPATHLEN 1024 3939012Sbostic # endif /* MAXPATHLEN */ 4039012Sbostic 4130270Sbostic /* 4230270Sbostic * This program takes a file composed of strings seperated by 4330270Sbostic * lines starting with two consecutive delimiting character (default 4430270Sbostic * character is '%') and creates another file which consists of a table 4530270Sbostic * describing the file (structure from "strfile.h"), a table of seek 4630270Sbostic * pointers to the start of the strings, and the strings, each terinated 4730270Sbostic * by a null byte. Usage: 4830270Sbostic * 4930270Sbostic * % strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ] 5030270Sbostic * 5130270Sbostic * - - Give a usage summary useful for jogging the memory 5230270Sbostic * c - Change delimiting character from '%' to 'C' 5330270Sbostic * s - Silent. Give no summary of data processed at the end of 5430270Sbostic * the run. 5530270Sbostic * v - Verbose. Give summary of data processed. (Default) 5630270Sbostic * o - order the strings in alphabetic order 5730270Sbostic * i - if ordering, ignore case 5830270Sbostic * r - randomize the order of the strings 5930270Sbostic * 6030270Sbostic * Ken Arnold Sept. 7, 1978 -- 6130270Sbostic * 6230270Sbostic * Added method to indicate dividers. A "%-" will cause the address 6330270Sbostic * to be added to the structure in one of the pointer elements. 6430270Sbostic * 6530270Sbostic * Ken Arnold Nov., 1984 -- 6630270Sbostic * 6730270Sbostic * Added ordering options. 6830270Sbostic */ 6930270Sbostic 7030270Sbostic # define TRUE 1 7130270Sbostic # define FALSE 0 7230270Sbostic 7339012Sbostic # define STORING_PTRS (Oflag || Rflag) 7439012Sbostic # define CHUNKSIZE 512 7530270Sbostic 7639012Sbostic #ifdef lint 7739012Sbostic # define ALWAYS atoi("1") 7839012Sbostic #else 7939012Sbostic # define ALWAYS 1 8039012Sbostic #endif 8139012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 8239012Sbostic if (ptr == NULL) \ 8339012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 8439012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 8539012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 8639012Sbostic if (ptr == NULL) { \ 8739012Sbostic fprintf(stderr, "out of space\n"); \ 8839012Sbostic exit(1); \ 8939012Sbostic } \ 9039012Sbostic } else 9139012Sbostic 9239012Sbostic #ifdef NO_VOID 9339012Sbostic # define void char 9439012Sbostic #endif 9539012Sbostic 9630270Sbostic typedef struct { 9730270Sbostic char first; 9839012Sbostic off_t pos; 9930270Sbostic } STR; 10030270Sbostic 10130270Sbostic char *Infile = NULL, /* input file name */ 10239012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 10330270Sbostic Delimch = '%', /* delimiting character */ 10430270Sbostic *Usage[] = { /* usage summary */ 10530270Sbostic "usage: strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]", 10630270Sbostic " - - Give this usage summary", 10730270Sbostic " c - Replace delimiting character with 'C'", 10830270Sbostic " s - Silent. Give no summary", 10930270Sbostic " v - Verbose. Give summary. (default)", 11030270Sbostic " o - order strings alphabetically", 11130270Sbostic " i - ignore case in ordering", 11230270Sbostic " r - randomize the order of the strings", 11330270Sbostic " Default \"datafile\" is inputfile.dat", 11430270Sbostic }; 11530270Sbostic 11630270Sbostic int Sflag = FALSE; /* silent run flag */ 11730270Sbostic int Oflag = FALSE; /* ordering flag */ 11830270Sbostic int Iflag = FALSE; /* ignore case flag */ 11930270Sbostic int Rflag = FALSE; /* randomize order flag */ 12039012Sbostic int Num_pts = 0; /* number of pointers/strings */ 12130270Sbostic 12239012Sbostic off_t *Seekpts; 12330270Sbostic 12430270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 12530270Sbostic 12630270Sbostic STRFILE Tbl; /* statistics table */ 12730270Sbostic 12830270Sbostic STR *Firstch; /* first chars of each string */ 12930270Sbostic 13039012Sbostic char *fgets(), *strcpy(), *strcat(); 13130270Sbostic 13239012Sbostic void *malloc(), *realloc(); 13330270Sbostic 13439012Sbostic /* 13539012Sbostic * main: 13639012Sbostic * Drive the sucker. There are two main modes -- either we store 13739012Sbostic * the seek pointers, if the table is to be sorted or randomized, 13839012Sbostic * or we write the pointer directly to the file, if we are to stay 13939012Sbostic * in file order. If the former, we allocate and re-allocate in 14039012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 14139012Sbostic * and then seek back to the beginning to write in the table. 14239012Sbostic */ 14330270Sbostic main(ac, av) 14430270Sbostic int ac; 14530270Sbostic char **av; 14630270Sbostic { 14730270Sbostic register char *sp, dc; 14830270Sbostic register FILE *inf, *outf; 14939012Sbostic register off_t last_off, length, pos; 15030270Sbostic register int first; 15130270Sbostic register char *nsp; 15230270Sbostic register STR *fp; 15330270Sbostic static char string[257]; 15430270Sbostic 15530270Sbostic getargs(ac, av); /* evalute arguments */ 15630270Sbostic dc = Delimch; 15730270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 15830270Sbostic perror(Infile); 15930270Sbostic exit(-1); 16030270Sbostic } 16130270Sbostic 16230270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 16330270Sbostic perror(Outfile); 16430270Sbostic exit(-1); 16530270Sbostic } 16639012Sbostic if (!STORING_PTRS) 16739012Sbostic (void) fseek(outf, sizeof Tbl, 0); 16830270Sbostic 16930270Sbostic /* 17039012Sbostic * Write the strings onto the file 17130270Sbostic */ 17230270Sbostic 17330270Sbostic Tbl.str_longlen = 0; 17430270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 17539012Sbostic Tbl.str_delim = dc; 17630270Sbostic first = Oflag; 17739012Sbostic add_offset(outf, ftell(inf)); 17839012Sbostic last_off = 0; 17930270Sbostic do { 18039012Sbostic if (Num_pts > 508) 18139012Sbostic atoi("1"); 18230270Sbostic sp = fgets(string, 256, inf); 18339012Sbostic if (sp == NULL || (sp[0] == dc && sp[1] == dc)) { 18439012Sbostic pos = ftell(inf); 18539012Sbostic add_offset(outf, pos); 18639012Sbostic length = pos - last_off - strlen(sp); 18739012Sbostic last_off = pos; 18839012Sbostic if (Tbl.str_longlen < length) 18939012Sbostic Tbl.str_longlen = length; 19039012Sbostic if (Tbl.str_shortlen > length) 19139012Sbostic Tbl.str_shortlen = length; 19230270Sbostic first = Oflag; 19330270Sbostic } 19430270Sbostic else { 19530270Sbostic if (first) { 19630270Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 19730270Sbostic continue; 19839012Sbostic ALLOC(Firstch, Num_pts); 19939012Sbostic fp = &Firstch[Num_pts - 1]; 20030270Sbostic if (Iflag && isupper(*nsp)) 20130270Sbostic fp->first = tolower(*nsp); 20230270Sbostic else 20330270Sbostic fp->first = *nsp; 20439012Sbostic fp->pos = Seekpts[Num_pts - 1]; 20530270Sbostic first = FALSE; 20630270Sbostic } 20730270Sbostic } 20830270Sbostic } while (sp != NULL); 20930270Sbostic 21030270Sbostic /* 21130270Sbostic * write the tables in 21230270Sbostic */ 21330270Sbostic 21430270Sbostic (void) fclose(inf); 21539012Sbostic Tbl.str_numstr = Num_pts - 1; 21630270Sbostic 21730270Sbostic if (Oflag) 21839012Sbostic do_order(); 21930270Sbostic else if (Rflag) 22039012Sbostic randomize(); 22130270Sbostic 22239012Sbostic (void) fseek(outf, (off_t) 0, 0); 22330270Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 22439012Sbostic if (STORING_PTRS) 22539012Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 22630270Sbostic (void) fclose(outf); 22730270Sbostic 22830270Sbostic if (!Sflag) { 22939012Sbostic printf("\"%s\" created\n", Outfile); 23039012Sbostic if (Num_pts == 2) 23130270Sbostic puts("There was 1 string"); 23230270Sbostic else 23339021Sbostic printf("There were %d strings\n", Num_pts - 1); 23439021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 23530270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 23639021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 23730270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 23830270Sbostic } 23930270Sbostic exit(0); 24039012Sbostic /* NOTREACHED */ 24130270Sbostic } 24230270Sbostic 24330270Sbostic /* 24430270Sbostic * This routine evaluates arguments from the command line 24530270Sbostic */ 24630270Sbostic getargs(ac, av) 24730270Sbostic register int ac; 24830270Sbostic register char **av; 24930270Sbostic { 25030270Sbostic register char *sp; 25130270Sbostic register int i; 25230270Sbostic register int bad, j; 25330270Sbostic 25430270Sbostic bad = 0; 25530270Sbostic for (i = 1; i < ac; i++) 25630270Sbostic if (*av[i] == '-' && av[i][1]) { 25730270Sbostic for (sp = &av[i][1]; *sp; sp++) 25830270Sbostic switch (*sp) { 25930270Sbostic case 'c': /* new delimiting char */ 26030270Sbostic if ((Delimch = *++sp) == '\0') { 26130270Sbostic --sp; 26230270Sbostic Delimch = *av[++i]; 26330270Sbostic } 26439012Sbostic if (!isascii(Delimch)) { 26530270Sbostic printf("bad delimiting character: '\\%o\n'", 26630270Sbostic Delimch); 26730270Sbostic bad++; 26830270Sbostic } 26930270Sbostic break; 27030270Sbostic case 's': /* silent */ 27130270Sbostic Sflag++; 27230270Sbostic break; 27330270Sbostic case 'v': /* verbose */ 27430270Sbostic Sflag = 0; 27530270Sbostic break; 27630270Sbostic case 'o': /* order strings */ 27730270Sbostic Oflag++; 27830270Sbostic break; 27930270Sbostic case 'i': /* ignore case in ordering */ 28030270Sbostic Iflag++; 28130270Sbostic break; 28230270Sbostic case 'r': /* ignore case in ordering */ 28330270Sbostic Rflag++; 28430270Sbostic break; 28530270Sbostic default: /* unknown flag */ 28630270Sbostic bad++; 28730270Sbostic printf("bad flag: '%c'\n", *sp); 28830270Sbostic break; 28930270Sbostic } 29030270Sbostic } 29130270Sbostic else if (*av[i] == '-') { 29230270Sbostic for (j = 0; Usage[j]; j++) 29330270Sbostic puts(Usage[j]); 29430270Sbostic exit(0); 29530270Sbostic } 29630270Sbostic else if (Infile) 29730270Sbostic (void) strcpy(Outfile, av[i]); 29830270Sbostic else 29930270Sbostic Infile = av[i]; 30030270Sbostic if (!Infile) { 30130270Sbostic bad++; 30230270Sbostic puts("No input file name"); 30330270Sbostic } 30430270Sbostic if (*Outfile == '\0' && !bad) { 30530270Sbostic (void) strcpy(Outfile, Infile); 30630270Sbostic (void) strcat(Outfile, ".dat"); 30730270Sbostic } 30830270Sbostic if (bad) { 30930270Sbostic puts("use \"strfile -\" to get usage"); 31030270Sbostic exit(-1); 31130270Sbostic } 31230270Sbostic } 31330270Sbostic 31430270Sbostic /* 31539012Sbostic * add_offset: 31639012Sbostic * Add an offset to the list, or write it out, as appropriate. 31739012Sbostic */ 31839012Sbostic add_offset(fp, off) 31939012Sbostic FILE *fp; 32039012Sbostic off_t off; 32139012Sbostic { 32239012Sbostic if (!STORING_PTRS) 32339012Sbostic fwrite(&off, 1, sizeof off, fp); 32439012Sbostic else { 32539012Sbostic ALLOC(Seekpts, Num_pts + 1); 32639012Sbostic Seekpts[Num_pts] = off; 32739012Sbostic } 32839012Sbostic Num_pts++; 32939012Sbostic } 33039012Sbostic 33139012Sbostic /* 33230270Sbostic * do_order: 33330270Sbostic * Order the strings alphabetically (possibly ignoring case). 33430270Sbostic */ 33539012Sbostic do_order() 33630270Sbostic { 33730270Sbostic register int i; 33839012Sbostic register off_t *lp; 33930270Sbostic register STR *fp; 34030270Sbostic extern int cmp_str(); 34130270Sbostic 34239012Sbostic Sort_1 = fopen(Infile, "r"); 34339012Sbostic Sort_2 = fopen(Infile, "r"); 34439012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 34530270Sbostic i = Tbl.str_numstr; 34639012Sbostic lp = Seekpts; 34730270Sbostic fp = Firstch; 34830270Sbostic while (i--) 34930270Sbostic *lp++ = fp++->pos; 35030270Sbostic (void) fclose(Sort_1); 35130270Sbostic (void) fclose(Sort_2); 35230270Sbostic Tbl.str_flags |= STR_ORDERED; 35330270Sbostic } 35430270Sbostic 35530270Sbostic /* 35630270Sbostic * cmp_str: 35730270Sbostic * Compare two strings in the file 35830270Sbostic */ 35939012Sbostic char * 36039012Sbostic unctrl(c) 36139012Sbostic char c; 36239012Sbostic { 36339012Sbostic static char buf[3]; 36439012Sbostic 36539012Sbostic if (isprint(c)) { 36639012Sbostic buf[0] = c; 36739012Sbostic buf[1] = '\0'; 36839012Sbostic } 36939012Sbostic else if (c == 0177) { 37039012Sbostic buf[0] = '^'; 37139012Sbostic buf[1] = '?'; 37239012Sbostic } 37339012Sbostic else { 37439012Sbostic buf[0] = '^'; 37539012Sbostic buf[1] = c + 'A' - 1; 37639012Sbostic } 37739012Sbostic return buf; 37839012Sbostic } 37939012Sbostic 38030270Sbostic cmp_str(p1, p2) 38130270Sbostic STR *p1, *p2; 38230270Sbostic { 38330270Sbostic register int c1, c2; 38439012Sbostic register int n1, n2; 38530270Sbostic 38639012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 38739012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 38839012Sbostic 38930270Sbostic c1 = p1->first; 39030270Sbostic c2 = p2->first; 39130270Sbostic if (c1 != c2) 39230270Sbostic return c1 - c2; 39330270Sbostic 39430270Sbostic (void) fseek(Sort_1, p1->pos, 0); 39530270Sbostic (void) fseek(Sort_2, p2->pos, 0); 39630270Sbostic 39739012Sbostic n1 = FALSE; 39839012Sbostic n2 = FALSE; 39930270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 40039012Sbostic SET_N(n1, c1); 40130270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 40239012Sbostic SET_N(n2, c2); 40330270Sbostic 40439012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 40530270Sbostic if (Iflag) { 40630270Sbostic if (isupper(c1)) 40730270Sbostic c1 = tolower(c1); 40830270Sbostic if (isupper(c2)) 40930270Sbostic c2 = tolower(c2); 41030270Sbostic } 41130270Sbostic if (c1 != c2) 41230270Sbostic return c1 - c2; 41339012Sbostic if (c1 == '\n' || c2 == '\n') 41439012Sbostic atoi("1"); 41539012Sbostic SET_N(n1, c1); 41639012Sbostic SET_N(n2, c2); 41730270Sbostic c1 = getc(Sort_1); 41830270Sbostic c2 = getc(Sort_2); 41930270Sbostic } 42039012Sbostic if (IS_END(c1, n1)) 42139012Sbostic c1 = 0; 42239012Sbostic if (IS_END(c2, n2)) 42339012Sbostic c2 = 0; 42430270Sbostic return c1 - c2; 42530270Sbostic } 42630270Sbostic 42730270Sbostic /* 42830270Sbostic * randomize: 42930270Sbostic * Randomize the order of the string table. We must be careful 43030270Sbostic * not to randomize across delimiter boundaries. All 43130270Sbostic * randomization is done within each block. 43230270Sbostic */ 43339012Sbostic randomize() 43430270Sbostic { 43539012Sbostic register int cnt, i; 43639012Sbostic register off_t tmp; 43739012Sbostic register off_t *sp; 43839012Sbostic extern time_t time(); 43930270Sbostic 44030270Sbostic Tbl.str_flags |= STR_RANDOM; 44139012Sbostic cnt = Tbl.str_numstr; 44230270Sbostic 44339012Sbostic /* 44439012Sbostic * move things around randomly 44539012Sbostic */ 44630270Sbostic 44739012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 44839021Sbostic i = random() % cnt; 44939012Sbostic tmp = sp[0]; 45039012Sbostic sp[0] = sp[i]; 45139012Sbostic sp[i] = tmp; 45230270Sbostic } 45330270Sbostic } 454