1*39012Sbostic /* $Header: strfile.c,v 1.19 89/05/05 16:07:46 arnold Exp $ */ 230270Sbostic 330270Sbostic # include <stdio.h> 430270Sbostic # include <ctype.h> 5*39012Sbostic # include <sys/types.h> 6*39012Sbostic # include <sys/param.h> 730270Sbostic # include "strfile.h" 8*39012Sbostic # include "random.h" 930270Sbostic 10*39012Sbostic # ifndef MAXPATHLEN 11*39012Sbostic # define MAXPATHLEN 1024 12*39012Sbostic # endif /* MAXPATHLEN */ 13*39012Sbostic 1430270Sbostic /* 1530270Sbostic * This program takes a file composed of strings seperated by 1630270Sbostic * lines starting with two consecutive delimiting character (default 1730270Sbostic * character is '%') and creates another file which consists of a table 1830270Sbostic * describing the file (structure from "strfile.h"), a table of seek 1930270Sbostic * pointers to the start of the strings, and the strings, each terinated 2030270Sbostic * by a null byte. Usage: 2130270Sbostic * 2230270Sbostic * % strfile [ - ] [ -cC ] [ -sv ] [ -oir ] sourcefile [ datafile ] 2330270Sbostic * 2430270Sbostic * - - Give a usage summary useful for jogging the memory 2530270Sbostic * c - Change delimiting character from '%' to 'C' 2630270Sbostic * s - Silent. Give no summary of data processed at the end of 2730270Sbostic * the run. 2830270Sbostic * v - Verbose. Give summary of data processed. (Default) 2930270Sbostic * o - order the strings in alphabetic order 3030270Sbostic * i - if ordering, ignore case 3130270Sbostic * r - randomize the order of the strings 3230270Sbostic * 3330270Sbostic * Ken Arnold Sept. 7, 1978 -- 3430270Sbostic * 3530270Sbostic * Added method to indicate dividers. A "%-" will cause the address 3630270Sbostic * to be added to the structure in one of the pointer elements. 3730270Sbostic * 3830270Sbostic * Ken Arnold Nov., 1984 -- 3930270Sbostic * 4030270Sbostic * Added ordering options. 4130270Sbostic */ 4230270Sbostic 4330270Sbostic # define TRUE 1 4430270Sbostic # define FALSE 0 4530270Sbostic 46*39012Sbostic # define STORING_PTRS (Oflag || Rflag) 47*39012Sbostic # define CHUNKSIZE 512 4830270Sbostic 49*39012Sbostic #ifdef lint 50*39012Sbostic # define ALWAYS atoi("1") 51*39012Sbostic #else 52*39012Sbostic # define ALWAYS 1 53*39012Sbostic #endif 54*39012Sbostic # define ALLOC(ptr,sz) if (ALWAYS) { \ 55*39012Sbostic if (ptr == NULL) \ 56*39012Sbostic ptr = malloc((unsigned int) (CHUNKSIZE * sizeof *ptr)); \ 57*39012Sbostic else if (((sz) + 1) % CHUNKSIZE == 0) \ 58*39012Sbostic ptr = realloc((void *) ptr, ((unsigned int) ((sz) + CHUNKSIZE) * sizeof *ptr)); \ 59*39012Sbostic if (ptr == NULL) { \ 60*39012Sbostic fprintf(stderr, "out of space\n"); \ 61*39012Sbostic exit(1); \ 62*39012Sbostic } \ 63*39012Sbostic } else 64*39012Sbostic 65*39012Sbostic #ifdef NO_VOID 66*39012Sbostic # define void char 67*39012Sbostic #endif 68*39012Sbostic 6930270Sbostic typedef struct { 7030270Sbostic char first; 71*39012Sbostic off_t pos; 7230270Sbostic } STR; 7330270Sbostic 7430270Sbostic char *Infile = NULL, /* input file name */ 75*39012Sbostic Outfile[MAXPATHLEN] = "", /* output file name */ 7630270Sbostic Delimch = '%', /* delimiting character */ 7730270Sbostic *Usage[] = { /* usage summary */ 7830270Sbostic "usage: strfile [ - ] [ -cC ] [ -sv ] [ -oir ] inputfile [ datafile ]", 7930270Sbostic " - - Give this usage summary", 8030270Sbostic " c - Replace delimiting character with 'C'", 8130270Sbostic " s - Silent. Give no summary", 8230270Sbostic " v - Verbose. Give summary. (default)", 8330270Sbostic " o - order strings alphabetically", 8430270Sbostic " i - ignore case in ordering", 8530270Sbostic " r - randomize the order of the strings", 8630270Sbostic " Default \"datafile\" is inputfile.dat", 8730270Sbostic }; 8830270Sbostic 8930270Sbostic int Sflag = FALSE; /* silent run flag */ 9030270Sbostic int Oflag = FALSE; /* ordering flag */ 9130270Sbostic int Iflag = FALSE; /* ignore case flag */ 9230270Sbostic int Rflag = FALSE; /* randomize order flag */ 93*39012Sbostic int Num_pts = 0; /* number of pointers/strings */ 9430270Sbostic 95*39012Sbostic off_t *Seekpts; 9630270Sbostic 9730270Sbostic FILE *Sort_1, *Sort_2; /* pointers for sorting */ 9830270Sbostic 9930270Sbostic STRFILE Tbl; /* statistics table */ 10030270Sbostic 10130270Sbostic STR *Firstch; /* first chars of each string */ 10230270Sbostic 103*39012Sbostic char *fgets(), *strcpy(), *strcat(); 10430270Sbostic 105*39012Sbostic void *malloc(), *realloc(); 10630270Sbostic 107*39012Sbostic /* 108*39012Sbostic * main: 109*39012Sbostic * Drive the sucker. There are two main modes -- either we store 110*39012Sbostic * the seek pointers, if the table is to be sorted or randomized, 111*39012Sbostic * or we write the pointer directly to the file, if we are to stay 112*39012Sbostic * in file order. If the former, we allocate and re-allocate in 113*39012Sbostic * CHUNKSIZE blocks; if the latter, we just write each pointer, 114*39012Sbostic * and then seek back to the beginning to write in the table. 115*39012Sbostic */ 11630270Sbostic main(ac, av) 11730270Sbostic int ac; 11830270Sbostic char **av; 11930270Sbostic { 12030270Sbostic register char *sp, dc; 12130270Sbostic register FILE *inf, *outf; 122*39012Sbostic register off_t last_off, length, pos; 12330270Sbostic register int first; 12430270Sbostic register char *nsp; 12530270Sbostic register STR *fp; 12630270Sbostic static char string[257]; 12730270Sbostic 12830270Sbostic getargs(ac, av); /* evalute arguments */ 12930270Sbostic dc = Delimch; 13030270Sbostic if ((inf = fopen(Infile, "r")) == NULL) { 13130270Sbostic perror(Infile); 13230270Sbostic exit(-1); 13330270Sbostic } 13430270Sbostic 13530270Sbostic if ((outf = fopen(Outfile, "w")) == NULL) { 13630270Sbostic perror(Outfile); 13730270Sbostic exit(-1); 13830270Sbostic } 139*39012Sbostic if (!STORING_PTRS) 140*39012Sbostic (void) fseek(outf, sizeof Tbl, 0); 14130270Sbostic 14230270Sbostic /* 143*39012Sbostic * Write the strings onto the file 14430270Sbostic */ 14530270Sbostic 14630270Sbostic Tbl.str_longlen = 0; 14730270Sbostic Tbl.str_shortlen = (unsigned int) 0xffffffff; 148*39012Sbostic Tbl.str_delim = dc; 14930270Sbostic first = Oflag; 150*39012Sbostic add_offset(outf, ftell(inf)); 151*39012Sbostic last_off = 0; 15230270Sbostic do { 153*39012Sbostic if (Num_pts > 508) 154*39012Sbostic atoi("1"); 15530270Sbostic sp = fgets(string, 256, inf); 156*39012Sbostic if (sp == NULL || (sp[0] == dc && sp[1] == dc)) { 157*39012Sbostic pos = ftell(inf); 158*39012Sbostic add_offset(outf, pos); 159*39012Sbostic length = pos - last_off - strlen(sp); 160*39012Sbostic last_off = pos; 161*39012Sbostic if (Tbl.str_longlen < length) 162*39012Sbostic Tbl.str_longlen = length; 163*39012Sbostic if (Tbl.str_shortlen > length) 164*39012Sbostic Tbl.str_shortlen = length; 16530270Sbostic first = Oflag; 16630270Sbostic } 16730270Sbostic else { 16830270Sbostic if (first) { 16930270Sbostic for (nsp = sp; !isalnum(*nsp); nsp++) 17030270Sbostic continue; 171*39012Sbostic ALLOC(Firstch, Num_pts); 172*39012Sbostic fp = &Firstch[Num_pts - 1]; 17330270Sbostic if (Iflag && isupper(*nsp)) 17430270Sbostic fp->first = tolower(*nsp); 17530270Sbostic else 17630270Sbostic fp->first = *nsp; 177*39012Sbostic fp->pos = Seekpts[Num_pts - 1]; 17830270Sbostic first = FALSE; 17930270Sbostic } 18030270Sbostic } 18130270Sbostic } while (sp != NULL); 18230270Sbostic 18330270Sbostic /* 18430270Sbostic * write the tables in 18530270Sbostic */ 18630270Sbostic 18730270Sbostic (void) fclose(inf); 188*39012Sbostic Tbl.str_numstr = Num_pts - 1; 18930270Sbostic 19030270Sbostic if (Oflag) 191*39012Sbostic do_order(); 19230270Sbostic else if (Rflag) 193*39012Sbostic randomize(); 19430270Sbostic 195*39012Sbostic (void) fseek(outf, (off_t) 0, 0); 19630270Sbostic (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf); 197*39012Sbostic if (STORING_PTRS) 198*39012Sbostic (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf); 19930270Sbostic (void) fclose(outf); 20030270Sbostic 20130270Sbostic if (!Sflag) { 202*39012Sbostic printf("\"%s\" created\n", Outfile); 203*39012Sbostic if (Num_pts == 2) 20430270Sbostic puts("There was 1 string"); 20530270Sbostic else 206*39012Sbostic printf("There were %u strings\n", Num_pts - 1); 20730270Sbostic printf("Longest string: %u byte%s\n", Tbl.str_longlen, 20830270Sbostic Tbl.str_longlen == 1 ? "" : "s"); 20930270Sbostic printf("Shortest string: %u byte%s\n", Tbl.str_shortlen, 21030270Sbostic Tbl.str_shortlen == 1 ? "" : "s"); 21130270Sbostic } 21230270Sbostic exit(0); 213*39012Sbostic /* NOTREACHED */ 21430270Sbostic } 21530270Sbostic 21630270Sbostic /* 21730270Sbostic * This routine evaluates arguments from the command line 21830270Sbostic */ 21930270Sbostic getargs(ac, av) 22030270Sbostic register int ac; 22130270Sbostic register char **av; 22230270Sbostic { 22330270Sbostic register char *sp; 22430270Sbostic register int i; 22530270Sbostic register int bad, j; 22630270Sbostic 22730270Sbostic bad = 0; 22830270Sbostic for (i = 1; i < ac; i++) 22930270Sbostic if (*av[i] == '-' && av[i][1]) { 23030270Sbostic for (sp = &av[i][1]; *sp; sp++) 23130270Sbostic switch (*sp) { 23230270Sbostic case 'c': /* new delimiting char */ 23330270Sbostic if ((Delimch = *++sp) == '\0') { 23430270Sbostic --sp; 23530270Sbostic Delimch = *av[++i]; 23630270Sbostic } 237*39012Sbostic if (!isascii(Delimch)) { 23830270Sbostic printf("bad delimiting character: '\\%o\n'", 23930270Sbostic Delimch); 24030270Sbostic bad++; 24130270Sbostic } 24230270Sbostic break; 24330270Sbostic case 's': /* silent */ 24430270Sbostic Sflag++; 24530270Sbostic break; 24630270Sbostic case 'v': /* verbose */ 24730270Sbostic Sflag = 0; 24830270Sbostic break; 24930270Sbostic case 'o': /* order strings */ 25030270Sbostic Oflag++; 25130270Sbostic break; 25230270Sbostic case 'i': /* ignore case in ordering */ 25330270Sbostic Iflag++; 25430270Sbostic break; 25530270Sbostic case 'r': /* ignore case in ordering */ 25630270Sbostic Rflag++; 25730270Sbostic break; 25830270Sbostic default: /* unknown flag */ 25930270Sbostic bad++; 26030270Sbostic printf("bad flag: '%c'\n", *sp); 26130270Sbostic break; 26230270Sbostic } 26330270Sbostic } 26430270Sbostic else if (*av[i] == '-') { 26530270Sbostic for (j = 0; Usage[j]; j++) 26630270Sbostic puts(Usage[j]); 26730270Sbostic exit(0); 26830270Sbostic } 26930270Sbostic else if (Infile) 27030270Sbostic (void) strcpy(Outfile, av[i]); 27130270Sbostic else 27230270Sbostic Infile = av[i]; 27330270Sbostic if (!Infile) { 27430270Sbostic bad++; 27530270Sbostic puts("No input file name"); 27630270Sbostic } 27730270Sbostic if (*Outfile == '\0' && !bad) { 27830270Sbostic (void) strcpy(Outfile, Infile); 27930270Sbostic (void) strcat(Outfile, ".dat"); 28030270Sbostic } 28130270Sbostic if (bad) { 28230270Sbostic puts("use \"strfile -\" to get usage"); 28330270Sbostic exit(-1); 28430270Sbostic } 28530270Sbostic } 28630270Sbostic 28730270Sbostic /* 288*39012Sbostic * add_offset: 289*39012Sbostic * Add an offset to the list, or write it out, as appropriate. 290*39012Sbostic */ 291*39012Sbostic add_offset(fp, off) 292*39012Sbostic FILE *fp; 293*39012Sbostic off_t off; 294*39012Sbostic { 295*39012Sbostic if (!STORING_PTRS) 296*39012Sbostic fwrite(&off, 1, sizeof off, fp); 297*39012Sbostic else { 298*39012Sbostic ALLOC(Seekpts, Num_pts + 1); 299*39012Sbostic Seekpts[Num_pts] = off; 300*39012Sbostic } 301*39012Sbostic Num_pts++; 302*39012Sbostic } 303*39012Sbostic 304*39012Sbostic /* 30530270Sbostic * do_order: 30630270Sbostic * Order the strings alphabetically (possibly ignoring case). 30730270Sbostic */ 308*39012Sbostic do_order() 30930270Sbostic { 31030270Sbostic register int i; 311*39012Sbostic register off_t *lp; 31230270Sbostic register STR *fp; 31330270Sbostic extern int cmp_str(); 31430270Sbostic 315*39012Sbostic Sort_1 = fopen(Infile, "r"); 316*39012Sbostic Sort_2 = fopen(Infile, "r"); 317*39012Sbostic qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str); 31830270Sbostic i = Tbl.str_numstr; 319*39012Sbostic lp = Seekpts; 32030270Sbostic fp = Firstch; 32130270Sbostic while (i--) 32230270Sbostic *lp++ = fp++->pos; 32330270Sbostic (void) fclose(Sort_1); 32430270Sbostic (void) fclose(Sort_2); 32530270Sbostic Tbl.str_flags |= STR_ORDERED; 32630270Sbostic } 32730270Sbostic 32830270Sbostic /* 32930270Sbostic * cmp_str: 33030270Sbostic * Compare two strings in the file 33130270Sbostic */ 332*39012Sbostic char * 333*39012Sbostic unctrl(c) 334*39012Sbostic char c; 335*39012Sbostic { 336*39012Sbostic static char buf[3]; 337*39012Sbostic 338*39012Sbostic if (isprint(c)) { 339*39012Sbostic buf[0] = c; 340*39012Sbostic buf[1] = '\0'; 341*39012Sbostic } 342*39012Sbostic else if (c == 0177) { 343*39012Sbostic buf[0] = '^'; 344*39012Sbostic buf[1] = '?'; 345*39012Sbostic } 346*39012Sbostic else { 347*39012Sbostic buf[0] = '^'; 348*39012Sbostic buf[1] = c + 'A' - 1; 349*39012Sbostic } 350*39012Sbostic return buf; 351*39012Sbostic } 352*39012Sbostic 35330270Sbostic cmp_str(p1, p2) 35430270Sbostic STR *p1, *p2; 35530270Sbostic { 35630270Sbostic register int c1, c2; 357*39012Sbostic register int n1, n2; 35830270Sbostic 359*39012Sbostic # define SET_N(nf,ch) (nf = (ch == '\n')) 360*39012Sbostic # define IS_END(ch,nf) (ch == Delimch && nf) 361*39012Sbostic 36230270Sbostic c1 = p1->first; 36330270Sbostic c2 = p2->first; 36430270Sbostic if (c1 != c2) 36530270Sbostic return c1 - c2; 36630270Sbostic 36730270Sbostic (void) fseek(Sort_1, p1->pos, 0); 36830270Sbostic (void) fseek(Sort_2, p2->pos, 0); 36930270Sbostic 370*39012Sbostic n1 = FALSE; 371*39012Sbostic n2 = FALSE; 37230270Sbostic while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0') 373*39012Sbostic SET_N(n1, c1); 37430270Sbostic while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0') 375*39012Sbostic SET_N(n2, c2); 37630270Sbostic 377*39012Sbostic while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 37830270Sbostic if (Iflag) { 37930270Sbostic if (isupper(c1)) 38030270Sbostic c1 = tolower(c1); 38130270Sbostic if (isupper(c2)) 38230270Sbostic c2 = tolower(c2); 38330270Sbostic } 38430270Sbostic if (c1 != c2) 38530270Sbostic return c1 - c2; 386*39012Sbostic if (c1 == '\n' || c2 == '\n') 387*39012Sbostic atoi("1"); 388*39012Sbostic SET_N(n1, c1); 389*39012Sbostic SET_N(n2, c2); 39030270Sbostic c1 = getc(Sort_1); 39130270Sbostic c2 = getc(Sort_2); 39230270Sbostic } 393*39012Sbostic if (IS_END(c1, n1)) 394*39012Sbostic c1 = 0; 395*39012Sbostic if (IS_END(c2, n2)) 396*39012Sbostic c2 = 0; 39730270Sbostic return c1 - c2; 39830270Sbostic } 39930270Sbostic 40030270Sbostic /* 40130270Sbostic * randomize: 40230270Sbostic * Randomize the order of the string table. We must be careful 40330270Sbostic * not to randomize across delimiter boundaries. All 40430270Sbostic * randomization is done within each block. 40530270Sbostic */ 406*39012Sbostic randomize() 40730270Sbostic { 408*39012Sbostic register int cnt, i; 409*39012Sbostic register off_t tmp; 410*39012Sbostic register off_t *sp; 411*39012Sbostic extern time_t time(); 41230270Sbostic 41330270Sbostic Tbl.str_flags |= STR_RANDOM; 414*39012Sbostic cnt = Tbl.str_numstr; 41530270Sbostic 416*39012Sbostic /* 417*39012Sbostic * move things around randomly 418*39012Sbostic */ 41930270Sbostic 420*39012Sbostic for (sp = Seekpts; cnt > 0; cnt--, sp++) { 421*39012Sbostic i = rnd((long) cnt); 422*39012Sbostic tmp = sp[0]; 423*39012Sbostic sp[0] = sp[i]; 424*39012Sbostic sp[i] = tmp; 42530270Sbostic } 42630270Sbostic } 427