139012Sbostic /* $Header: strfile.c,v 1.19 89/05/05 16:07:46 arnold Exp $ */ 230270Sbostic 3*39021Sbostic # include <sys/param.h> 4*39021Sbostic # include <sys/types.h> 530270Sbostic # include <stdio.h> 630270Sbostic # include <ctype.h> 730270Sbostic # include "strfile.h" 830270Sbostic 939012Sbostic # ifndef MAXPATHLEN 1039012Sbostic # define MAXPATHLEN 1024 1139012Sbostic # endif /* MAXPATHLEN */ 1239012Sbostic 1330270Sbostic /* 1430270Sbostic * This program takes a file composed of strings seperated by 1530270Sbostic * lines starting with two consecutive delimiting character (default 1630270Sbostic * character is '%') and creates another file which consists of a table 1730270Sbostic * describing the file (structure from "strfile.h"), a table of seek 1830270Sbostic * pointers to the start of the strings, and the strings, each terinated 1930270Sbostic * by a null byte. Usage: 2030270Sbostic * 2130270Sbostic * % strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ] 2230270Sbostic * 2330270Sbostic * - - Give a usage summary useful for jogging the memory 2430270Sbostic * c - Change delimiting character from '%' to 'C' 2530270Sbostic * s - Silent. Give no summary of data processed at the end of 2630270Sbostic * the run. 2730270Sbostic * v - Verbose. Give summary of data processed. (Default) 2830270Sbostic * o - order the strings in alphabetic order 2930270Sbostic * i - if ordering, ignore case 3030270Sbostic * r - randomize the order of the strings 3130270Sbostic * 3230270Sbostic * Ken Arnold Sept. 7, 1978 -- 3330270Sbostic * 3430270Sbostic * Added method to indicate dividers. A "%-" will cause the address 3530270Sbostic * to be added to the structure in one of the pointer elements. 3630270Sbostic * 3730270Sbostic * Ken Arnold Nov., 1984 -- 3830270Sbostic * 3930270Sbostic * Added ordering options. 4030270Sbostic */ 4130270Sbostic 4230270Sbostic # define TRUE 1 4330270Sbostic # define FALSE 0 4430270Sbostic 4539012Sbostic # define STORING_PTRS (Oflag || Rflag) 4639012Sbostic # define CHUNKSIZE 512 4730270Sbostic 4839012Sbostic #ifdef lint 4939012Sbostic # define ALWAYS atoi("1") 5039012Sbostic #else 5139012Sbostic # define ALWAYS 1 5239012Sbostic #endif 5339012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 5439012Sbostic if (ptr == NULL) \ 5539012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 5639012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 5739012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 5839012Sbostic if (ptr == NULL) { \ 5939012Sbostic fprintf(stderr, "out of space\n"); \ 6039012Sbostic exit(1); \ 6139012Sbostic } \ 6239012Sbostic } else 6339012Sbostic 6439012Sbostic #ifdef NO_VOID 6539012Sbostic # define void char 6639012Sbostic #endif 6739012Sbostic 6830270Sbostic typedef struct { 6930270Sbostic char first; 7039012Sbostic off_t pos; 7130270Sbostic } STR; 7230270Sbostic 7330270Sbostic char *Infile = NULL, /* input file name */ 7439012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 7530270Sbostic Delimch = '%', /* delimiting character */ 7630270Sbostic *Usage[] = { /* usage summary */ 7730270Sbostic "usage: strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]", 7830270Sbostic " - - Give this usage summary", 7930270Sbostic " c - Replace delimiting character with 'C'", 8030270Sbostic " s - Silent. Give no summary", 8130270Sbostic " v - Verbose. Give summary. (default)", 8230270Sbostic " o - order strings alphabetically", 8330270Sbostic " i - ignore case in ordering", 8430270Sbostic " r - randomize the order of the strings", 8530270Sbostic " Default \"datafile\" is inputfile.dat", 8630270Sbostic }; 8730270Sbostic 8830270Sbostic int Sflag = FALSE; /* silent run flag */ 8930270Sbostic int Oflag = FALSE; /* ordering flag */ 9030270Sbostic int Iflag = FALSE; /* ignore case flag */ 9130270Sbostic int Rflag = FALSE; /* randomize order flag */ 9239012Sbostic int Num_pts = 0; /* number of pointers/strings */ 9330270Sbostic 9439012Sbostic off_t *Seekpts; 9530270Sbostic 9630270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 9730270Sbostic 9830270Sbostic STRFILE Tbl; /* statistics table */ 9930270Sbostic 10030270Sbostic STR *Firstch; /* first chars of each string */ 10130270Sbostic 10239012Sbostic char *fgets(), *strcpy(), *strcat(); 10330270Sbostic 10439012Sbostic void *malloc(), *realloc(); 10530270Sbostic 10639012Sbostic /* 10739012Sbostic * main: 10839012Sbostic * Drive the sucker. There are two main modes -- either we store 10939012Sbostic * the seek pointers, if the table is to be sorted or randomized, 11039012Sbostic * or we write the pointer directly to the file, if we are to stay 11139012Sbostic * in file order. If the former, we allocate and re-allocate in 11239012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 11339012Sbostic * and then seek back to the beginning to write in the table. 11439012Sbostic */ 11530270Sbostic main(ac, av) 11630270Sbostic int ac; 11730270Sbostic char **av; 11830270Sbostic { 11930270Sbostic register char *sp, dc; 12030270Sbostic register FILE *inf, *outf; 12139012Sbostic register off_t last_off, length, pos; 12230270Sbostic register int first; 12330270Sbostic register char *nsp; 12430270Sbostic register STR *fp; 12530270Sbostic static char string[257]; 12630270Sbostic 12730270Sbostic getargs(ac, av); /* evalute arguments */ 12830270Sbostic dc = Delimch; 12930270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 13030270Sbostic perror(Infile); 13130270Sbostic exit(-1); 13230270Sbostic } 13330270Sbostic 13430270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 13530270Sbostic perror(Outfile); 13630270Sbostic exit(-1); 13730270Sbostic } 13839012Sbostic if (!STORING_PTRS) 13939012Sbostic (void) fseek(outf, sizeof Tbl, 0); 14030270Sbostic 14130270Sbostic /* 14239012Sbostic * Write the strings onto the file 14330270Sbostic */ 14430270Sbostic 14530270Sbostic Tbl.str_longlen = 0; 14630270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 14739012Sbostic Tbl.str_delim = dc; 14830270Sbostic first = Oflag; 14939012Sbostic add_offset(outf, ftell(inf)); 15039012Sbostic last_off = 0; 15130270Sbostic do { 15239012Sbostic if (Num_pts > 508) 15339012Sbostic atoi("1"); 15430270Sbostic sp = fgets(string, 256, inf); 15539012Sbostic if (sp == NULL || (sp[0] == dc && sp[1] == dc)) { 15639012Sbostic pos = ftell(inf); 15739012Sbostic add_offset(outf, pos); 15839012Sbostic length = pos - last_off - strlen(sp); 15939012Sbostic last_off = pos; 16039012Sbostic if (Tbl.str_longlen < length) 16139012Sbostic Tbl.str_longlen = length; 16239012Sbostic if (Tbl.str_shortlen > length) 16339012Sbostic Tbl.str_shortlen = length; 16430270Sbostic first = Oflag; 16530270Sbostic } 16630270Sbostic else { 16730270Sbostic if (first) { 16830270Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 16930270Sbostic continue; 17039012Sbostic ALLOC(Firstch, Num_pts); 17139012Sbostic fp = &Firstch[Num_pts - 1]; 17230270Sbostic if (Iflag && isupper(*nsp)) 17330270Sbostic fp->first = tolower(*nsp); 17430270Sbostic else 17530270Sbostic fp->first = *nsp; 17639012Sbostic fp->pos = Seekpts[Num_pts - 1]; 17730270Sbostic first = FALSE; 17830270Sbostic } 17930270Sbostic } 18030270Sbostic } while (sp != NULL); 18130270Sbostic 18230270Sbostic /* 18330270Sbostic * write the tables in 18430270Sbostic */ 18530270Sbostic 18630270Sbostic (void) fclose(inf); 18739012Sbostic Tbl.str_numstr = Num_pts - 1; 18830270Sbostic 18930270Sbostic if (Oflag) 19039012Sbostic do_order(); 19130270Sbostic else if (Rflag) 19239012Sbostic randomize(); 19330270Sbostic 19439012Sbostic (void) fseek(outf, (off_t) 0, 0); 19530270Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 19639012Sbostic if (STORING_PTRS) 19739012Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 19830270Sbostic (void) fclose(outf); 19930270Sbostic 20030270Sbostic if (!Sflag) { 20139012Sbostic printf("\"%s\" created\n", Outfile); 20239012Sbostic if (Num_pts == 2) 20330270Sbostic puts("There was 1 string"); 20430270Sbostic else 205*39021Sbostic printf("There were %d strings\n", Num_pts - 1); 206*39021Sbostic printf("Longest string: %lu byte%s\n", Tbl.str_longlen, 20730270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 208*39021Sbostic printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen, 20930270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 21030270Sbostic } 21130270Sbostic exit(0); 21239012Sbostic /* NOTREACHED */ 21330270Sbostic } 21430270Sbostic 21530270Sbostic /* 21630270Sbostic * This routine evaluates arguments from the command line 21730270Sbostic */ 21830270Sbostic getargs(ac, av) 21930270Sbostic register int ac; 22030270Sbostic register char **av; 22130270Sbostic { 22230270Sbostic register char *sp; 22330270Sbostic register int i; 22430270Sbostic register int bad, j; 22530270Sbostic 22630270Sbostic bad = 0; 22730270Sbostic for (i = 1; i < ac; i++) 22830270Sbostic if (*av[i] == '-' && av[i][1]) { 22930270Sbostic for (sp = &av[i][1]; *sp; sp++) 23030270Sbostic switch (*sp) { 23130270Sbostic case 'c': /* new delimiting char */ 23230270Sbostic if ((Delimch = *++sp) == '\0') { 23330270Sbostic --sp; 23430270Sbostic Delimch = *av[++i]; 23530270Sbostic } 23639012Sbostic if (!isascii(Delimch)) { 23730270Sbostic printf("bad delimiting character: '\\%o\n'", 23830270Sbostic Delimch); 23930270Sbostic bad++; 24030270Sbostic } 24130270Sbostic break; 24230270Sbostic case 's': /* silent */ 24330270Sbostic Sflag++; 24430270Sbostic break; 24530270Sbostic case 'v': /* verbose */ 24630270Sbostic Sflag = 0; 24730270Sbostic break; 24830270Sbostic case 'o': /* order strings */ 24930270Sbostic Oflag++; 25030270Sbostic break; 25130270Sbostic case 'i': /* ignore case in ordering */ 25230270Sbostic Iflag++; 25330270Sbostic break; 25430270Sbostic case 'r': /* ignore case in ordering */ 25530270Sbostic Rflag++; 25630270Sbostic break; 25730270Sbostic default: /* unknown flag */ 25830270Sbostic bad++; 25930270Sbostic printf("bad flag: '%c'\n", *sp); 26030270Sbostic break; 26130270Sbostic } 26230270Sbostic } 26330270Sbostic else if (*av[i] == '-') { 26430270Sbostic for (j = 0; Usage[j]; j++) 26530270Sbostic puts(Usage[j]); 26630270Sbostic exit(0); 26730270Sbostic } 26830270Sbostic else if (Infile) 26930270Sbostic (void) strcpy(Outfile, av[i]); 27030270Sbostic else 27130270Sbostic Infile = av[i]; 27230270Sbostic if (!Infile) { 27330270Sbostic bad++; 27430270Sbostic puts("No input file name"); 27530270Sbostic } 27630270Sbostic if (*Outfile == '\0' && !bad) { 27730270Sbostic (void) strcpy(Outfile, Infile); 27830270Sbostic (void) strcat(Outfile, ".dat"); 27930270Sbostic } 28030270Sbostic if (bad) { 28130270Sbostic puts("use \"strfile -\" to get usage"); 28230270Sbostic exit(-1); 28330270Sbostic } 28430270Sbostic } 28530270Sbostic 28630270Sbostic /* 28739012Sbostic * add_offset: 28839012Sbostic * Add an offset to the list, or write it out, as appropriate. 28939012Sbostic */ 29039012Sbostic add_offset(fp, off) 29139012Sbostic FILE *fp; 29239012Sbostic off_t off; 29339012Sbostic { 29439012Sbostic if (!STORING_PTRS) 29539012Sbostic fwrite(&off, 1, sizeof off, fp); 29639012Sbostic else { 29739012Sbostic ALLOC(Seekpts, Num_pts + 1); 29839012Sbostic Seekpts[Num_pts] = off; 29939012Sbostic } 30039012Sbostic Num_pts++; 30139012Sbostic } 30239012Sbostic 30339012Sbostic /* 30430270Sbostic * do_order: 30530270Sbostic * Order the strings alphabetically (possibly ignoring case). 30630270Sbostic */ 30739012Sbostic do_order() 30830270Sbostic { 30930270Sbostic register int i; 31039012Sbostic register off_t *lp; 31130270Sbostic register STR *fp; 31230270Sbostic extern int cmp_str(); 31330270Sbostic 31439012Sbostic Sort_1 = fopen(Infile, "r"); 31539012Sbostic Sort_2 = fopen(Infile, "r"); 31639012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 31730270Sbostic i = Tbl.str_numstr; 31839012Sbostic lp = Seekpts; 31930270Sbostic fp = Firstch; 32030270Sbostic while (i--) 32130270Sbostic *lp++ = fp++->pos; 32230270Sbostic (void) fclose(Sort_1); 32330270Sbostic (void) fclose(Sort_2); 32430270Sbostic Tbl.str_flags |= STR_ORDERED; 32530270Sbostic } 32630270Sbostic 32730270Sbostic /* 32830270Sbostic * cmp_str: 32930270Sbostic * Compare two strings in the file 33030270Sbostic */ 33139012Sbostic char * 33239012Sbostic unctrl(c) 33339012Sbostic char c; 33439012Sbostic { 33539012Sbostic static char buf[3]; 33639012Sbostic 33739012Sbostic if (isprint(c)) { 33839012Sbostic buf[0] = c; 33939012Sbostic buf[1] = '\0'; 34039012Sbostic } 34139012Sbostic else if (c == 0177) { 34239012Sbostic buf[0] = '^'; 34339012Sbostic buf[1] = '?'; 34439012Sbostic } 34539012Sbostic else { 34639012Sbostic buf[0] = '^'; 34739012Sbostic buf[1] = c + 'A' - 1; 34839012Sbostic } 34939012Sbostic return buf; 35039012Sbostic } 35139012Sbostic 35230270Sbostic cmp_str(p1, p2) 35330270Sbostic STR *p1, *p2; 35430270Sbostic { 35530270Sbostic register int c1, c2; 35639012Sbostic register int n1, n2; 35730270Sbostic 35839012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 35939012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 36039012Sbostic 36130270Sbostic c1 = p1->first; 36230270Sbostic c2 = p2->first; 36330270Sbostic if (c1 != c2) 36430270Sbostic return c1 - c2; 36530270Sbostic 36630270Sbostic (void) fseek(Sort_1, p1->pos, 0); 36730270Sbostic (void) fseek(Sort_2, p2->pos, 0); 36830270Sbostic 36939012Sbostic n1 = FALSE; 37039012Sbostic n2 = FALSE; 37130270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 37239012Sbostic SET_N(n1, c1); 37330270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 37439012Sbostic SET_N(n2, c2); 37530270Sbostic 37639012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 37730270Sbostic if (Iflag) { 37830270Sbostic if (isupper(c1)) 37930270Sbostic c1 = tolower(c1); 38030270Sbostic if (isupper(c2)) 38130270Sbostic c2 = tolower(c2); 38230270Sbostic } 38330270Sbostic if (c1 != c2) 38430270Sbostic return c1 - c2; 38539012Sbostic if (c1 == '\n' || c2 == '\n') 38639012Sbostic atoi("1"); 38739012Sbostic SET_N(n1, c1); 38839012Sbostic SET_N(n2, c2); 38930270Sbostic c1 = getc(Sort_1); 39030270Sbostic c2 = getc(Sort_2); 39130270Sbostic } 39239012Sbostic if (IS_END(c1, n1)) 39339012Sbostic c1 = 0; 39439012Sbostic if (IS_END(c2, n2)) 39539012Sbostic c2 = 0; 39630270Sbostic return c1 - c2; 39730270Sbostic } 39830270Sbostic 39930270Sbostic /* 40030270Sbostic * randomize: 40130270Sbostic * Randomize the order of the string table. We must be careful 40230270Sbostic * not to randomize across delimiter boundaries. All 40330270Sbostic * randomization is done within each block. 40430270Sbostic */ 40539012Sbostic randomize() 40630270Sbostic { 40739012Sbostic register int cnt, i; 40839012Sbostic register off_t tmp; 40939012Sbostic register off_t *sp; 41039012Sbostic extern time_t time(); 41130270Sbostic 41230270Sbostic Tbl.str_flags |= STR_RANDOM; 41339012Sbostic cnt = Tbl.str_numstr; 41430270Sbostic 41539012Sbostic /* 41639012Sbostic * move things around randomly 41739012Sbostic */ 41830270Sbostic 41939012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 420*39021Sbostic i = random() % cnt; 42139012Sbostic tmp = sp[0]; 42239012Sbostic sp[0] = sp[i]; 42339012Sbostic sp[i] = tmp; 42430270Sbostic } 42530270Sbostic } 426