14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*8462SApril.Chin@Sun.COM * Copyright (c) 1992-2008 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 7*8462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * * 204887Schin ***********************************************************************/ 214887Schin #pragma prototyped 224887Schin /* 234887Schin * David Korn 244887Schin * AT&T Bell Laboratories 254887Schin * 264887Schin * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ... 274887Schin * 284887Schin * cut fields or columns from fields from a file 294887Schin */ 304887Schin 314887Schin static const char usage[] = 32*8462SApril.Chin@Sun.COM "[-?\n@(#)$Id: cut (AT&T Research) 2008-04-01 $\n]" 334887Schin USAGE_LICENSE 344887Schin "[+NAME?cut - cut out selected columns or fields of each line of a file]" 354887Schin "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 364887Schin "from one or more files, contatenating them on standard output.]" 374887Schin "[+?The option argument \alist\a is a comma-separated or blank-separated " 384887Schin "list of positive numbers and ranges. Ranges can be of three " 394887Schin "forms. The first is two positive integers separated by a hyphen " 404887Schin "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 414887Schin "\ahigh\a. The second is a positive number preceded by a hyphen " 424887Schin "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 434887Schin "\ahigh\a. The last is a positive number followed by a hyphen " 444887Schin "(\alow\a\b-\b), which represents all fields from \alow\a to the " 454887Schin "last field, inclusive. Elements in the \alist\a can be repeated, " 464887Schin "can overlap, and can appear in any order. The order of the " 474887Schin "output is that of the input.]" 484887Schin "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 494887Schin "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 504887Schin "cuts from standard input. The start of the file is defined " 514887Schin "as the current offset.]" 524887Schin "[b:bytes]:[list?\bcut\b based on a list of bytes.]" 534887Schin "[c:characters]:[list?\bcut\b based on a list of characters.]" 544887Schin "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 554887Schin "to \adelim\a. The default is the \btab\b character.]" 564887Schin "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 574887Schin "character specified with the \b-d\b optiion.]" 584887Schin "[n:nosplit?Do not split characters. Currently ignored.]" 594887Schin "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 604887Schin "records of length \areclen\a when used with the \b-b\b or \b-c\b " 614887Schin "option.]" 624887Schin "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 634887Schin "when used with the \b-f\b option. By default, lines with no " 644887Schin "delimiters will be passsed in untouched.]" 654887Schin "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 664887Schin "the \b-f\b option is set to \aldelim\a. The default is the " 674887Schin "\bnewline\b character.]" 684887Schin "[N:nonewline?Do not output new-lines at end of each record when used " 694887Schin "with the \b-b\b or \b-c\b option.]" 704887Schin "\n" 714887Schin "\n[file ...]\n" 724887Schin "\n" 734887Schin "[+EXIT STATUS?]{" 744887Schin "[+0?All files processed successfully.]" 754887Schin "[+>0?One or more files failed to open or could not be read.]" 764887Schin "}" 774887Schin "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 784887Schin ; 794887Schin 804887Schin #include <cmd.h> 814887Schin #include <ctype.h> 824887Schin 834887Schin typedef struct Last_s 844887Schin { 854887Schin int seqno; 864887Schin int seq; 874887Schin int wdelim; 884887Schin int ldelim; 894887Schin } Last_t; 904887Schin 914887Schin typedef struct Cut_s 924887Schin { 934887Schin int cflag; 944887Schin int sflag; 954887Schin int nlflag; 964887Schin int wdelim; 974887Schin int ldelim; 984887Schin int seqno; 994887Schin int reclen; 1004887Schin signed char space[UCHAR_MAX]; 1014887Schin Last_t last; 1024887Schin int list[2]; /* NOTE: must be last member */ 1034887Schin } Cut_t; 1044887Schin 1054887Schin #define HUGE (1<<14) 1064887Schin #define BLOCK 8*1024 1074887Schin #define C_BYTES 1 1084887Schin #define C_CHARS 2 1094887Schin #define C_FIELDS 4 1104887Schin #define C_SUPRESS 8 1114887Schin #define C_NOCHOP 16 1124887Schin #define C_NONEWLINE 32 1134887Schin 1144887Schin /* 1154887Schin * compare the first of an array of integers 1164887Schin */ 1174887Schin 1184887Schin static int mycomp(register const void *a,register const void *b) 1194887Schin { 1204887Schin return(*((int*)a) - *((int*)b)); 1214887Schin } 1224887Schin 1234887Schin static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) 1244887Schin { 1254887Schin register int *lp, c, n=0; 1264887Schin register int range = 0; 1274887Schin register char *cp = str; 1284887Schin Cut_t *cuthdr; 1294887Schin if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int)))) 1304887Schin error(ERROR_exit(1), "out of space"); 1314887Schin memset(cuthdr->space, 0, sizeof(cuthdr->space)); 1324887Schin cuthdr->last.seqno = 0; 1334887Schin cuthdr->last.seq = 0; 1344887Schin cuthdr->last.wdelim = 0; 1354887Schin cuthdr->last.ldelim = '\n'; 1364887Schin cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide()); 1374887Schin cuthdr->sflag = ((mode&C_SUPRESS)!=0); 1384887Schin cuthdr->nlflag = ((mode&C_NONEWLINE)!=0); 1394887Schin cuthdr->wdelim = wdelim; 1404887Schin cuthdr->ldelim = ldelim; 1414887Schin cuthdr->reclen = reclen; 1424887Schin cuthdr->seqno = ++cuthdr->last.seqno; 1434887Schin lp = cuthdr->list; 1444887Schin while(1) switch(c= *cp++) 1454887Schin { 1464887Schin case ' ': 1474887Schin case '\t': 1484887Schin while(*cp==' ' || *cp=='\t') 1494887Schin cp++; 1504887Schin case 0: 1514887Schin case ',': 1524887Schin if(range) 1534887Schin { 1544887Schin --range; 1554887Schin if((n = (n==0?HUGE:n-range)) < 0) 1564887Schin error(ERROR_exit(1),"invalid range for c/f option"); 1574887Schin *lp++ = range; 1584887Schin *lp++ = n; 1594887Schin } 1604887Schin else 1614887Schin { 1624887Schin *lp++ = --n; 1634887Schin *lp++ = 1; 1644887Schin } 1654887Schin if(c==0) 1664887Schin { 1674887Schin register int *dp; 1684887Schin *lp = HUGE; 1694887Schin n = 1 + (lp-cuthdr->list)/2; 1704887Schin qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp); 1714887Schin /* eliminate overlapping regions */ 1724887Schin for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 1734887Schin { 1744887Schin if(lp[0] <= range) 1754887Schin { 1764887Schin if(lp[1]==HUGE) 1774887Schin { 1784887Schin dp[-1] = HUGE; 1794887Schin break; 1804887Schin } 1814887Schin if((c = lp[0]+lp[1]-range)>0) 1824887Schin { 1834887Schin range += c; 1844887Schin dp[-1] += c; 1854887Schin } 1864887Schin } 1874887Schin else 1884887Schin { 1894887Schin range = *dp++ = lp[0]; 1904887Schin if(lp[1]==HUGE) 1914887Schin { 1924887Schin *dp++ = HUGE; 1934887Schin break; 1944887Schin } 1954887Schin range += (*dp++ = lp[1]); 1964887Schin } 1974887Schin } 1984887Schin *dp = HUGE; 1994887Schin lp = cuthdr->list; 2004887Schin /* convert ranges into gaps */ 2014887Schin for(n=0; *lp!=HUGE; lp+=2) 2024887Schin { 2034887Schin c = *lp; 2044887Schin *lp -= n; 2054887Schin n = c+lp[1]; 2064887Schin } 2074887Schin return(cuthdr); 2084887Schin } 2094887Schin n = range = 0; 2104887Schin break; 2114887Schin 2124887Schin case '-': 2134887Schin if(range) 2144887Schin error(ERROR_exit(1),"bad list for c/f option"); 2154887Schin range = n?n:1; 2164887Schin n = 0; 2174887Schin break; 2184887Schin 2194887Schin default: 2204887Schin if(!isdigit(c)) 2214887Schin error(ERROR_exit(1),"bad list for c/f option"); 2224887Schin n = 10*n + (c-'0'); 2234887Schin } 2244887Schin /* NOTREACHED */ 2254887Schin } 2264887Schin 2274887Schin /* 2284887Schin * advance <cp> by <n> multi-byte characters 2294887Schin */ 2304887Schin static int advance(const char *str, register int n, register int inlen) 2314887Schin { 2324887Schin register int size, len=inlen; 2334887Schin register const char *cp=str; 2344887Schin while(len>0 && n-->0) 2354887Schin { 2364887Schin size = mblen(cp, len); 2374887Schin if(size<0) 2384887Schin size = 1; 2394887Schin cp += size; 2404887Schin len -= size; 2414887Schin 2424887Schin } 2434887Schin if(n>0) 2444887Schin return(inlen+1); 2454887Schin return(cp-str); 2464887Schin } 2474887Schin 2484887Schin /* 2494887Schin * cut each line of file <fdin> and put results to <fdout> using list <list> 2504887Schin */ 2514887Schin 252*8462SApril.Chin@Sun.COM static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 2534887Schin { 2544887Schin register int c, ncol=0,len; 2554887Schin register const int *lp = cuthdr->list; 2564887Schin register char *inp; 2574887Schin register int skip; /* non-zero for don't copy */ 2584887Schin while(1) 2594887Schin { 2604887Schin if(len = cuthdr->reclen) 2614887Schin inp = sfreserve(fdin, len, -1); 2624887Schin else 2634887Schin inp = sfgetr(fdin, '\n', 0); 2644887Schin if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR))) 2654887Schin break; 2664887Schin len = sfvalue(fdin); 2674887Schin if((ncol = skip = *(lp = cuthdr->list)) == 0) 2684887Schin ncol = *++lp; 2694887Schin while(1) 2704887Schin { 2714887Schin if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len) 2724887Schin c = len; 2734887Schin else if(c==len && !skip) 2744887Schin ncol++; 2754887Schin ncol -= c; 2764887Schin if(!skip && sfwrite(fdout,(char*)inp,c)<0) 277*8462SApril.Chin@Sun.COM return; 2784887Schin inp += c; 2794887Schin if(ncol) 2804887Schin break; 2814887Schin len -= c; 2824887Schin ncol = *++lp; 2834887Schin skip = !skip; 2844887Schin } 2854887Schin if(!cuthdr->nlflag && (skip || cuthdr->reclen)) 2864887Schin sfputc(fdout,cuthdr->ldelim); 2874887Schin } 2884887Schin } 2894887Schin 2904887Schin /* 2914887Schin * cut each line of file <fdin> and put results to <fdout> using list <list> 2924887Schin * stream <fdin> must be line buffered 2934887Schin */ 2944887Schin 2954887Schin #define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) 2964887Schin 297*8462SApril.Chin@Sun.COM static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 2984887Schin { 2994887Schin register unsigned char *cp; 3004887Schin register int c, nfields; 3014887Schin register const int *lp = cuthdr->list; 3024887Schin register unsigned char *copy; 3034887Schin register int nodelim, empty, inword=0; 3044887Schin register unsigned char *endbuff; 3054887Schin unsigned char *inbuff, *first; 3064887Schin int lastchar; 3074887Schin Sfio_t *fdtmp = 0; 3084887Schin long offset = 0; 3094887Schin if(cuthdr->seqno != cuthdr->last.seq) 3104887Schin { 3114887Schin cuthdr->space[cuthdr->last.ldelim] = 0; 3124887Schin cuthdr->space[cuthdr->last.wdelim] = 0; 3134887Schin cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1; 3144887Schin cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1; 3154887Schin cuthdr->last.seq = cuthdr->seqno; 3164887Schin } 3174887Schin /* process each buffer */ 3184887Schin while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0) 3194887Schin { 3204887Schin cp = inbuff; 3214887Schin endbuff = cp + --c; 3224887Schin if((lastchar = cp[c]) != cuthdr->ldelim) 3234887Schin *endbuff = cuthdr->ldelim; 3244887Schin /* process each line in the buffer */ 3254887Schin while(cp <= endbuff) 3264887Schin { 3274887Schin first = cp; 3284887Schin if(!inword) 3294887Schin { 3304887Schin nodelim = empty = 1; 3314887Schin copy = cp; 3324887Schin if(nfields = *(lp = cuthdr->list)) 3334887Schin copy = 0; 3344887Schin else 3354887Schin nfields = *++lp; 3364887Schin } 3374887Schin else if(copy) 3384887Schin copy = cp; 3394887Schin inword = 0; 3404887Schin while(!inword) 3414887Schin { 3424887Schin /* skip over non-delimiter characters */ 3434887Schin while(!(c=cuthdr->space[*cp++])); 3444887Schin /* check for end-of-line */ 3454887Schin if(endline(c)) 3464887Schin { 3474887Schin if(cp<=endbuff) 3484887Schin break; 3494887Schin if((c=cuthdr->space[lastchar]),endline(c)) 3504887Schin break; 3514887Schin /* restore cuthdr->last. character */ 3524887Schin if(lastchar != cuthdr->ldelim) 3534887Schin *endbuff = lastchar; 3544887Schin inword++; 3554887Schin if(!c) 3564887Schin break; 3574887Schin } 3584887Schin nodelim = 0; 3594887Schin if(--nfields >0) 3604887Schin continue; 3614887Schin nfields = *++lp; 3624887Schin if(copy) 3634887Schin { 3644887Schin empty = 0; 3654887Schin if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0) 3664887Schin goto failed; 3674887Schin copy = 0; 3684887Schin } 3694887Schin else 3704887Schin /* set to delimiter unless the first field */ 3714887Schin copy = cp -!empty; 3724887Schin } 3734887Schin if(!inword) 3744887Schin { 3754887Schin if(!copy) 3764887Schin { 3774887Schin if(nodelim) 3784887Schin { 3794887Schin if(!cuthdr->sflag) 3804887Schin { 3814887Schin if(offset) 3824887Schin { 3834887Schin sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 3844887Schin sfmove(fdtmp,fdout,offset,-1); 3854887Schin } 3864887Schin copy = first; 3874887Schin } 3884887Schin } 3894887Schin else 3904887Schin sfputc(fdout,'\n'); 3914887Schin } 3924887Schin if(offset) 3934887Schin sfseek(fdtmp,offset=0,SEEK_SET); 3944887Schin } 3954887Schin if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 3964887Schin goto failed; 3974887Schin } 3984887Schin /* see whether to save in tmp file */ 399*8462SApril.Chin@Sun.COM if(inword && nodelim && !cuthdr->sflag && (c=cp-first)>0) 4004887Schin { 4014887Schin /* copy line to tmpfile in case no fields */ 4024887Schin if(!fdtmp) 4034887Schin fdtmp = sftmp(BLOCK); 4044887Schin sfwrite(fdtmp,(char*)first,c); 4054887Schin offset +=c; 4064887Schin } 4074887Schin } 4084887Schin failed: 4094887Schin if(fdtmp) 4104887Schin sfclose(fdtmp); 4114887Schin } 4124887Schin 4134887Schin int 4144887Schin b_cut(int argc,char *argv[], void* context) 4154887Schin { 4164887Schin register char *cp = 0; 4174887Schin register Sfio_t *fp; 4184887Schin int n; 4194887Schin Cut_t *cuthdr; 4204887Schin int mode = 0; 4214887Schin int wdelim = '\t'; 4224887Schin int ldelim = '\n'; 4234887Schin size_t reclen = 0; 4244887Schin 4254887Schin cmdinit(argc, argv, context, ERROR_CATALOG, 0); 4264887Schin while (n = optget(argv, usage)) switch (n) 4274887Schin { 4284887Schin case 'b': 4294887Schin case 'c': 4304887Schin if(mode&C_FIELDS) 4314887Schin { 4324887Schin error(2, "f option already specified"); 4334887Schin break; 4344887Schin } 4354887Schin cp = opt_info.arg; 4364887Schin if(n=='b') 4374887Schin mode |= C_BYTES; 4384887Schin else 4394887Schin mode |= C_CHARS; 4404887Schin break; 4414887Schin case 'D': 4424887Schin ldelim = *(unsigned char*)opt_info.arg; 4434887Schin break; 4444887Schin case 'd': 4454887Schin wdelim = *(unsigned char*)opt_info.arg; 4464887Schin break; 4474887Schin case 'f': 4484887Schin if(mode&(C_CHARS|C_BYTES)) 4494887Schin { 4504887Schin error(2, "c option already specified"); 4514887Schin break; 4524887Schin } 4534887Schin cp = opt_info.arg; 4544887Schin mode |= C_FIELDS; 4554887Schin break; 4564887Schin case 'n': 4574887Schin mode |= C_NOCHOP; 4584887Schin break; 4594887Schin case 'N': 4604887Schin mode |= C_NONEWLINE; 4614887Schin break; 4624887Schin case 'R': 4634887Schin case 'r': 4644887Schin if(opt_info.num>0) 4654887Schin reclen = opt_info.num; 4664887Schin break; 4674887Schin case 's': 4684887Schin mode |= C_SUPRESS; 4694887Schin break; 4704887Schin case ':': 4714887Schin error(2, "%s", opt_info.arg); 4724887Schin break; 4734887Schin case '?': 4744887Schin error(ERROR_usage(2), "%s", opt_info.arg); 4754887Schin break; 4764887Schin } 4774887Schin argv += opt_info.index; 4784887Schin if (error_info.errors) 4794887Schin error(ERROR_usage(2), "%s",optusage(NiL)); 4804887Schin if(!cp) 4814887Schin { 4824887Schin error(2, "b, c or f option must be specified"); 4834887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 4844887Schin } 4854887Schin if(!*cp) 4864887Schin error(3, "non-empty b, c or f option must be specified"); 4874887Schin if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 4884887Schin error(3, "s option requires f option"); 4894887Schin cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen); 4904887Schin if(cp = *argv) 4914887Schin argv++; 4924887Schin do 4934887Schin { 4944887Schin if(!cp || streq(cp,"-")) 4954887Schin fp = sfstdin; 4964887Schin else if(!(fp = sfopen(NiL,cp,"r"))) 4974887Schin { 4984887Schin error(ERROR_system(0),"%s: cannot open",cp); 4994887Schin continue; 5004887Schin } 5014887Schin if(mode&C_FIELDS) 5024887Schin cutfields(cuthdr,fp,sfstdout); 5034887Schin else 5044887Schin cutcols(cuthdr,fp,sfstdout); 5054887Schin if(fp!=sfstdin) 5064887Schin sfclose(fp); 507*8462SApril.Chin@Sun.COM } while(cp = *argv++); 508*8462SApril.Chin@Sun.COM if (sfsync(sfstdout)) 509*8462SApril.Chin@Sun.COM error(ERROR_system(0), "write error"); 5104887Schin return(error_info.errors?1:0); 5114887Schin } 512