1*4887Schin /*********************************************************************** 2*4887Schin * * 3*4887Schin * This software is part of the ast package * 4*4887Schin * Copyright (c) 1992-2007 AT&T Knowledge Ventures * 5*4887Schin * and is licensed under the * 6*4887Schin * Common Public License, Version 1.0 * 7*4887Schin * by AT&T Knowledge Ventures * 8*4887Schin * * 9*4887Schin * A copy of the License is available at * 10*4887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 11*4887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12*4887Schin * * 13*4887Schin * Information and Software Systems Research * 14*4887Schin * AT&T Research * 15*4887Schin * Florham Park NJ * 16*4887Schin * * 17*4887Schin * Glenn Fowler <gsf@research.att.com> * 18*4887Schin * David Korn <dgk@research.att.com> * 19*4887Schin * * 20*4887Schin ***********************************************************************/ 21*4887Schin #pragma prototyped 22*4887Schin /* 23*4887Schin * David Korn 24*4887Schin * AT&T Bell Laboratories 25*4887Schin * 26*4887Schin * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ... 27*4887Schin * 28*4887Schin * cut fields or columns from fields from a file 29*4887Schin */ 30*4887Schin 31*4887Schin static const char usage[] = 32*4887Schin "[-?\n@(#)$Id: cut (AT&T Research) 2007-01-23 $\n]" 33*4887Schin USAGE_LICENSE 34*4887Schin "[+NAME?cut - cut out selected columns or fields of each line of a file]" 35*4887Schin "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 36*4887Schin "from one or more files, contatenating them on standard output.]" 37*4887Schin "[+?The option argument \alist\a is a comma-separated or blank-separated " 38*4887Schin "list of positive numbers and ranges. Ranges can be of three " 39*4887Schin "forms. The first is two positive integers separated by a hyphen " 40*4887Schin "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 41*4887Schin "\ahigh\a. The second is a positive number preceded by a hyphen " 42*4887Schin "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 43*4887Schin "\ahigh\a. The last is a positive number followed by a hyphen " 44*4887Schin "(\alow\a\b-\b), which represents all fields from \alow\a to the " 45*4887Schin "last field, inclusive. Elements in the \alist\a can be repeated, " 46*4887Schin "can overlap, and can appear in any order. The order of the " 47*4887Schin "output is that of the input.]" 48*4887Schin "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 49*4887Schin "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 50*4887Schin "cuts from standard input. The start of the file is defined " 51*4887Schin "as the current offset.]" 52*4887Schin "[b:bytes]:[list?\bcut\b based on a list of bytes.]" 53*4887Schin "[c:characters]:[list?\bcut\b based on a list of characters.]" 54*4887Schin "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 55*4887Schin "to \adelim\a. The default is the \btab\b character.]" 56*4887Schin "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 57*4887Schin "character specified with the \b-d\b optiion.]" 58*4887Schin "[n:nosplit?Do not split characters. Currently ignored.]" 59*4887Schin "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 60*4887Schin "records of length \areclen\a when used with the \b-b\b or \b-c\b " 61*4887Schin "option.]" 62*4887Schin "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 63*4887Schin "when used with the \b-f\b option. By default, lines with no " 64*4887Schin "delimiters will be passsed in untouched.]" 65*4887Schin "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 66*4887Schin "the \b-f\b option is set to \aldelim\a. The default is the " 67*4887Schin "\bnewline\b character.]" 68*4887Schin "[N:nonewline?Do not output new-lines at end of each record when used " 69*4887Schin "with the \b-b\b or \b-c\b option.]" 70*4887Schin "\n" 71*4887Schin "\n[file ...]\n" 72*4887Schin "\n" 73*4887Schin "[+EXIT STATUS?]{" 74*4887Schin "[+0?All files processed successfully.]" 75*4887Schin "[+>0?One or more files failed to open or could not be read.]" 76*4887Schin "}" 77*4887Schin "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 78*4887Schin ; 79*4887Schin 80*4887Schin #include <cmd.h> 81*4887Schin #include <ctype.h> 82*4887Schin 83*4887Schin typedef struct Last_s 84*4887Schin { 85*4887Schin int seqno; 86*4887Schin int seq; 87*4887Schin int wdelim; 88*4887Schin int ldelim; 89*4887Schin } Last_t; 90*4887Schin 91*4887Schin typedef struct Cut_s 92*4887Schin { 93*4887Schin int cflag; 94*4887Schin int sflag; 95*4887Schin int nlflag; 96*4887Schin int wdelim; 97*4887Schin int ldelim; 98*4887Schin int seqno; 99*4887Schin int reclen; 100*4887Schin signed char space[UCHAR_MAX]; 101*4887Schin Last_t last; 102*4887Schin int list[2]; /* NOTE: must be last member */ 103*4887Schin } Cut_t; 104*4887Schin 105*4887Schin #define HUGE (1<<14) 106*4887Schin #define BLOCK 8*1024 107*4887Schin #define C_BYTES 1 108*4887Schin #define C_CHARS 2 109*4887Schin #define C_FIELDS 4 110*4887Schin #define C_SUPRESS 8 111*4887Schin #define C_NOCHOP 16 112*4887Schin #define C_NONEWLINE 32 113*4887Schin 114*4887Schin /* 115*4887Schin * compare the first of an array of integers 116*4887Schin */ 117*4887Schin 118*4887Schin static int mycomp(register const void *a,register const void *b) 119*4887Schin { 120*4887Schin return(*((int*)a) - *((int*)b)); 121*4887Schin } 122*4887Schin 123*4887Schin static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) 124*4887Schin { 125*4887Schin register int *lp, c, n=0; 126*4887Schin register int range = 0; 127*4887Schin register char *cp = str; 128*4887Schin Cut_t *cuthdr; 129*4887Schin if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int)))) 130*4887Schin error(ERROR_exit(1), "out of space"); 131*4887Schin memset(cuthdr->space, 0, sizeof(cuthdr->space)); 132*4887Schin cuthdr->last.seqno = 0; 133*4887Schin cuthdr->last.seq = 0; 134*4887Schin cuthdr->last.wdelim = 0; 135*4887Schin cuthdr->last.ldelim = '\n'; 136*4887Schin cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide()); 137*4887Schin cuthdr->sflag = ((mode&C_SUPRESS)!=0); 138*4887Schin cuthdr->nlflag = ((mode&C_NONEWLINE)!=0); 139*4887Schin cuthdr->wdelim = wdelim; 140*4887Schin cuthdr->ldelim = ldelim; 141*4887Schin cuthdr->reclen = reclen; 142*4887Schin cuthdr->seqno = ++cuthdr->last.seqno; 143*4887Schin lp = cuthdr->list; 144*4887Schin while(1) switch(c= *cp++) 145*4887Schin { 146*4887Schin case ' ': 147*4887Schin case '\t': 148*4887Schin while(*cp==' ' || *cp=='\t') 149*4887Schin cp++; 150*4887Schin case 0: 151*4887Schin case ',': 152*4887Schin if(range) 153*4887Schin { 154*4887Schin --range; 155*4887Schin if((n = (n==0?HUGE:n-range)) < 0) 156*4887Schin error(ERROR_exit(1),"invalid range for c/f option"); 157*4887Schin *lp++ = range; 158*4887Schin *lp++ = n; 159*4887Schin } 160*4887Schin else 161*4887Schin { 162*4887Schin *lp++ = --n; 163*4887Schin *lp++ = 1; 164*4887Schin } 165*4887Schin if(c==0) 166*4887Schin { 167*4887Schin register int *dp; 168*4887Schin *lp = HUGE; 169*4887Schin n = 1 + (lp-cuthdr->list)/2; 170*4887Schin qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp); 171*4887Schin /* eliminate overlapping regions */ 172*4887Schin for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 173*4887Schin { 174*4887Schin if(lp[0] <= range) 175*4887Schin { 176*4887Schin if(lp[1]==HUGE) 177*4887Schin { 178*4887Schin dp[-1] = HUGE; 179*4887Schin break; 180*4887Schin } 181*4887Schin if((c = lp[0]+lp[1]-range)>0) 182*4887Schin { 183*4887Schin range += c; 184*4887Schin dp[-1] += c; 185*4887Schin } 186*4887Schin } 187*4887Schin else 188*4887Schin { 189*4887Schin range = *dp++ = lp[0]; 190*4887Schin if(lp[1]==HUGE) 191*4887Schin { 192*4887Schin *dp++ = HUGE; 193*4887Schin break; 194*4887Schin } 195*4887Schin range += (*dp++ = lp[1]); 196*4887Schin } 197*4887Schin } 198*4887Schin *dp = HUGE; 199*4887Schin lp = cuthdr->list; 200*4887Schin /* convert ranges into gaps */ 201*4887Schin for(n=0; *lp!=HUGE; lp+=2) 202*4887Schin { 203*4887Schin c = *lp; 204*4887Schin *lp -= n; 205*4887Schin n = c+lp[1]; 206*4887Schin } 207*4887Schin return(cuthdr); 208*4887Schin } 209*4887Schin n = range = 0; 210*4887Schin break; 211*4887Schin 212*4887Schin case '-': 213*4887Schin if(range) 214*4887Schin error(ERROR_exit(1),"bad list for c/f option"); 215*4887Schin range = n?n:1; 216*4887Schin n = 0; 217*4887Schin break; 218*4887Schin 219*4887Schin default: 220*4887Schin if(!isdigit(c)) 221*4887Schin error(ERROR_exit(1),"bad list for c/f option"); 222*4887Schin n = 10*n + (c-'0'); 223*4887Schin } 224*4887Schin /* NOTREACHED */ 225*4887Schin } 226*4887Schin 227*4887Schin /* 228*4887Schin * advance <cp> by <n> multi-byte characters 229*4887Schin */ 230*4887Schin static int advance(const char *str, register int n, register int inlen) 231*4887Schin { 232*4887Schin register int size, len=inlen; 233*4887Schin register const char *cp=str; 234*4887Schin while(len>0 && n-->0) 235*4887Schin { 236*4887Schin size = mblen(cp, len); 237*4887Schin if(size<0) 238*4887Schin size = 1; 239*4887Schin cp += size; 240*4887Schin len -= size; 241*4887Schin 242*4887Schin } 243*4887Schin if(n>0) 244*4887Schin return(inlen+1); 245*4887Schin return(cp-str); 246*4887Schin } 247*4887Schin 248*4887Schin /* 249*4887Schin * cut each line of file <fdin> and put results to <fdout> using list <list> 250*4887Schin */ 251*4887Schin 252*4887Schin static int cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 253*4887Schin { 254*4887Schin register int c, ncol=0,len; 255*4887Schin register const int *lp = cuthdr->list; 256*4887Schin register char *inp; 257*4887Schin register int skip; /* non-zero for don't copy */ 258*4887Schin while(1) 259*4887Schin { 260*4887Schin if(len = cuthdr->reclen) 261*4887Schin inp = sfreserve(fdin, len, -1); 262*4887Schin else 263*4887Schin inp = sfgetr(fdin, '\n', 0); 264*4887Schin if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR))) 265*4887Schin break; 266*4887Schin len = sfvalue(fdin); 267*4887Schin if((ncol = skip = *(lp = cuthdr->list)) == 0) 268*4887Schin ncol = *++lp; 269*4887Schin while(1) 270*4887Schin { 271*4887Schin if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len) 272*4887Schin c = len; 273*4887Schin else if(c==len && !skip) 274*4887Schin ncol++; 275*4887Schin ncol -= c; 276*4887Schin if(!skip && sfwrite(fdout,(char*)inp,c)<0) 277*4887Schin return(-1); 278*4887Schin inp += c; 279*4887Schin if(ncol) 280*4887Schin break; 281*4887Schin len -= c; 282*4887Schin ncol = *++lp; 283*4887Schin skip = !skip; 284*4887Schin } 285*4887Schin if(!cuthdr->nlflag && (skip || cuthdr->reclen)) 286*4887Schin sfputc(fdout,cuthdr->ldelim); 287*4887Schin } 288*4887Schin return(c); 289*4887Schin } 290*4887Schin 291*4887Schin /* 292*4887Schin * cut each line of file <fdin> and put results to <fdout> using list <list> 293*4887Schin * stream <fdin> must be line buffered 294*4887Schin */ 295*4887Schin 296*4887Schin #define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) 297*4887Schin 298*4887Schin static int cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 299*4887Schin { 300*4887Schin register unsigned char *cp; 301*4887Schin register int c, nfields; 302*4887Schin register const int *lp = cuthdr->list; 303*4887Schin register unsigned char *copy; 304*4887Schin register int nodelim, empty, inword=0; 305*4887Schin register unsigned char *endbuff; 306*4887Schin unsigned char *inbuff, *first; 307*4887Schin int lastchar; 308*4887Schin Sfio_t *fdtmp = 0; 309*4887Schin long offset = 0; 310*4887Schin if(cuthdr->seqno != cuthdr->last.seq) 311*4887Schin { 312*4887Schin cuthdr->space[cuthdr->last.ldelim] = 0; 313*4887Schin cuthdr->space[cuthdr->last.wdelim] = 0; 314*4887Schin cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1; 315*4887Schin cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1; 316*4887Schin cuthdr->last.seq = cuthdr->seqno; 317*4887Schin } 318*4887Schin /* process each buffer */ 319*4887Schin while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0) 320*4887Schin { 321*4887Schin cp = inbuff; 322*4887Schin endbuff = cp + --c; 323*4887Schin if((lastchar = cp[c]) != cuthdr->ldelim) 324*4887Schin *endbuff = cuthdr->ldelim; 325*4887Schin /* process each line in the buffer */ 326*4887Schin while(cp <= endbuff) 327*4887Schin { 328*4887Schin first = cp; 329*4887Schin if(!inword) 330*4887Schin { 331*4887Schin nodelim = empty = 1; 332*4887Schin copy = cp; 333*4887Schin if(nfields = *(lp = cuthdr->list)) 334*4887Schin copy = 0; 335*4887Schin else 336*4887Schin nfields = *++lp; 337*4887Schin } 338*4887Schin else if(copy) 339*4887Schin copy = cp; 340*4887Schin inword = 0; 341*4887Schin while(!inword) 342*4887Schin { 343*4887Schin /* skip over non-delimiter characters */ 344*4887Schin while(!(c=cuthdr->space[*cp++])); 345*4887Schin /* check for end-of-line */ 346*4887Schin if(endline(c)) 347*4887Schin { 348*4887Schin if(cp<=endbuff) 349*4887Schin break; 350*4887Schin if((c=cuthdr->space[lastchar]),endline(c)) 351*4887Schin break; 352*4887Schin /* restore cuthdr->last. character */ 353*4887Schin if(lastchar != cuthdr->ldelim) 354*4887Schin *endbuff = lastchar; 355*4887Schin inword++; 356*4887Schin if(!c) 357*4887Schin break; 358*4887Schin } 359*4887Schin nodelim = 0; 360*4887Schin if(--nfields >0) 361*4887Schin continue; 362*4887Schin nfields = *++lp; 363*4887Schin if(copy) 364*4887Schin { 365*4887Schin empty = 0; 366*4887Schin if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0) 367*4887Schin goto failed; 368*4887Schin copy = 0; 369*4887Schin } 370*4887Schin else 371*4887Schin /* set to delimiter unless the first field */ 372*4887Schin copy = cp -!empty; 373*4887Schin } 374*4887Schin if(!inword) 375*4887Schin { 376*4887Schin if(!copy) 377*4887Schin { 378*4887Schin if(nodelim) 379*4887Schin { 380*4887Schin if(!cuthdr->sflag) 381*4887Schin { 382*4887Schin if(offset) 383*4887Schin { 384*4887Schin sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 385*4887Schin sfmove(fdtmp,fdout,offset,-1); 386*4887Schin } 387*4887Schin copy = first; 388*4887Schin } 389*4887Schin } 390*4887Schin else 391*4887Schin sfputc(fdout,'\n'); 392*4887Schin } 393*4887Schin if(offset) 394*4887Schin sfseek(fdtmp,offset=0,SEEK_SET); 395*4887Schin } 396*4887Schin if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 397*4887Schin goto failed; 398*4887Schin } 399*4887Schin /* see whether to save in tmp file */ 400*4887Schin if(nodelim && inword && !cuthdr->sflag && (c=cp-first)>0) 401*4887Schin { 402*4887Schin /* copy line to tmpfile in case no fields */ 403*4887Schin if(!fdtmp) 404*4887Schin fdtmp = sftmp(BLOCK); 405*4887Schin sfwrite(fdtmp,(char*)first,c); 406*4887Schin offset +=c; 407*4887Schin } 408*4887Schin } 409*4887Schin failed: 410*4887Schin if(fdtmp) 411*4887Schin sfclose(fdtmp); 412*4887Schin return(0); 413*4887Schin } 414*4887Schin 415*4887Schin int 416*4887Schin b_cut(int argc,char *argv[], void* context) 417*4887Schin { 418*4887Schin register char *cp = 0; 419*4887Schin register Sfio_t *fp; 420*4887Schin int n; 421*4887Schin Cut_t *cuthdr; 422*4887Schin int mode = 0; 423*4887Schin int wdelim = '\t'; 424*4887Schin int ldelim = '\n'; 425*4887Schin size_t reclen = 0; 426*4887Schin 427*4887Schin cmdinit(argc, argv, context, ERROR_CATALOG, 0); 428*4887Schin while (n = optget(argv, usage)) switch (n) 429*4887Schin { 430*4887Schin case 'b': 431*4887Schin case 'c': 432*4887Schin if(mode&C_FIELDS) 433*4887Schin { 434*4887Schin error(2, "f option already specified"); 435*4887Schin break; 436*4887Schin } 437*4887Schin cp = opt_info.arg; 438*4887Schin if(n=='b') 439*4887Schin mode |= C_BYTES; 440*4887Schin else 441*4887Schin mode |= C_CHARS; 442*4887Schin break; 443*4887Schin case 'D': 444*4887Schin ldelim = *(unsigned char*)opt_info.arg; 445*4887Schin break; 446*4887Schin case 'd': 447*4887Schin wdelim = *(unsigned char*)opt_info.arg; 448*4887Schin break; 449*4887Schin case 'f': 450*4887Schin if(mode&(C_CHARS|C_BYTES)) 451*4887Schin { 452*4887Schin error(2, "c option already specified"); 453*4887Schin break; 454*4887Schin } 455*4887Schin cp = opt_info.arg; 456*4887Schin mode |= C_FIELDS; 457*4887Schin break; 458*4887Schin case 'n': 459*4887Schin mode |= C_NOCHOP; 460*4887Schin break; 461*4887Schin case 'N': 462*4887Schin mode |= C_NONEWLINE; 463*4887Schin break; 464*4887Schin case 'R': 465*4887Schin case 'r': 466*4887Schin if(opt_info.num>0) 467*4887Schin reclen = opt_info.num; 468*4887Schin break; 469*4887Schin case 's': 470*4887Schin mode |= C_SUPRESS; 471*4887Schin break; 472*4887Schin case ':': 473*4887Schin error(2, "%s", opt_info.arg); 474*4887Schin break; 475*4887Schin case '?': 476*4887Schin error(ERROR_usage(2), "%s", opt_info.arg); 477*4887Schin break; 478*4887Schin } 479*4887Schin argv += opt_info.index; 480*4887Schin if (error_info.errors) 481*4887Schin error(ERROR_usage(2), "%s",optusage(NiL)); 482*4887Schin if(!cp) 483*4887Schin { 484*4887Schin error(2, "b, c or f option must be specified"); 485*4887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 486*4887Schin } 487*4887Schin if(!*cp) 488*4887Schin error(3, "non-empty b, c or f option must be specified"); 489*4887Schin if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 490*4887Schin error(3, "s option requires f option"); 491*4887Schin cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen); 492*4887Schin if(cp = *argv) 493*4887Schin argv++; 494*4887Schin do 495*4887Schin { 496*4887Schin if(!cp || streq(cp,"-")) 497*4887Schin fp = sfstdin; 498*4887Schin else if(!(fp = sfopen(NiL,cp,"r"))) 499*4887Schin { 500*4887Schin error(ERROR_system(0),"%s: cannot open",cp); 501*4887Schin continue; 502*4887Schin } 503*4887Schin if(mode&C_FIELDS) 504*4887Schin cutfields(cuthdr,fp,sfstdout); 505*4887Schin else 506*4887Schin cutcols(cuthdr,fp,sfstdout); 507*4887Schin if(fp!=sfstdin) 508*4887Schin sfclose(fp); 509*4887Schin } 510*4887Schin while(cp= *argv++); 511*4887Schin return(error_info.errors?1:0); 512*4887Schin } 513