xref: /onnv-gate/usr/src/lib/libcmd/common/cut.c (revision 8462)
14887Schin /***********************************************************************
24887Schin *                                                                      *
34887Schin *               This software is part of the ast package               *
4*8462SApril.Chin@Sun.COM *          Copyright (c) 1992-2008 AT&T Intellectual Property          *
54887Schin *                      and is licensed under the                       *
64887Schin *                  Common Public License, Version 1.0                  *
7*8462SApril.Chin@Sun.COM *                    by AT&T Intellectual Property                     *
84887Schin *                                                                      *
94887Schin *                A copy of the License is available at                 *
104887Schin *            http://www.opensource.org/licenses/cpl1.0.txt             *
114887Schin *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
124887Schin *                                                                      *
134887Schin *              Information and Software Systems Research               *
144887Schin *                            AT&T Research                             *
154887Schin *                           Florham Park NJ                            *
164887Schin *                                                                      *
174887Schin *                 Glenn Fowler <gsf@research.att.com>                  *
184887Schin *                  David Korn <dgk@research.att.com>                   *
194887Schin *                                                                      *
204887Schin ***********************************************************************/
214887Schin #pragma prototyped
/*
 * David Korn
 * AT&T Bell Laboratories
 *
 * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ...
 *
 * cut selected fields or columns from each line of a file
 */
304887Schin 
314887Schin static const char usage[] =
32*8462SApril.Chin@Sun.COM "[-?\n@(#)$Id: cut (AT&T Research) 2008-04-01 $\n]"
334887Schin USAGE_LICENSE
344887Schin "[+NAME?cut - cut out selected columns or fields of each line of a file]"
354887Schin "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
364887Schin 	"from one or more files, contatenating them on standard output.]"
374887Schin "[+?The option argument \alist\a is a comma-separated or blank-separated "
384887Schin 	"list of positive numbers and ranges.  Ranges can be of three "
394887Schin 	"forms.  The first is two positive integers separated by a hyphen "
404887Schin 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
414887Schin 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
424887Schin 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
434887Schin 	"\ahigh\a.  The last is a positive number followed by a hyphen "
444887Schin 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
454887Schin 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
464887Schin 	"can overlap, and can appear in any order.  The order of the "
474887Schin 	"output is that of the input.]"
484887Schin "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
494887Schin "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
504887Schin         "cuts from standard input.   The start of the file is defined "
514887Schin         "as the current offset.]"
524887Schin "[b:bytes]:[list?\bcut\b based on a list of bytes.]"
534887Schin "[c:characters]:[list?\bcut\b based on a list of characters.]"
544887Schin "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
554887Schin 	"to \adelim\a.  The default is the \btab\b character.]"
564887Schin "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
574887Schin 	"character specified with the \b-d\b optiion.]"
584887Schin "[n:nosplit?Do not split characters.  Currently ignored.]"
594887Schin "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
604887Schin 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
614887Schin 	"option.]"
624887Schin "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
634887Schin 	"when used with the \b-f\b option.  By default, lines with no "
644887Schin 	"delimiters will be passsed in untouched.]"
654887Schin "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
664887Schin 	"the \b-f\b option is set to \aldelim\a.  The default is the "
674887Schin 	"\bnewline\b character.]"
684887Schin "[N:nonewline?Do not output new-lines at end of each record when used "
694887Schin 	"with the \b-b\b or \b-c\b option.]"
704887Schin "\n"
714887Schin "\n[file ...]\n"
724887Schin "\n"
734887Schin "[+EXIT STATUS?]{"
744887Schin 	"[+0?All files processed successfully.]"
754887Schin 	"[+>0?One or more files failed to open or could not be read.]"
764887Schin "}"
774887Schin "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
784887Schin ;
794887Schin 
804887Schin #include <cmd.h>
814887Schin #include <ctype.h>
824887Schin 
/*
 * Record of the delimiters currently installed in Cut_t.space[], so that
 * cutfields() only rebuilds the table when the descriptor changes.
 */
typedef struct Last_s
{
	int		seqno;		/* counter used to number descriptors */
	int		seq;		/* seqno for which space[] was last set up */
	int		wdelim;		/* field delimiter currently marked in space[] */
	int		ldelim;		/* line delimiter currently marked in space[] */
} Last_t;

/*
 * Cut descriptor built by cutinit() and consumed by cutcols()/cutfields().
 */
typedef struct Cut_s
{
	int		cflag;		/* non-zero: -c with a multibyte locale */
	int		sflag;		/* non-zero: suppress lines without delimiters */
	int		nlflag;		/* non-zero: do not append record delimiters */
	int		wdelim;		/* field delimiter character */
	int		ldelim;		/* line delimiter character */
	int		seqno;		/* sequence number of this descriptor */
	int		reclen;		/* fixed record length, 0 for line records */
	/*
	 * Character class table indexed by an unsigned char value:
	 * 0 ordinary, 1 field delimiter, -1 line delimiter.  It needs one
	 * entry per possible byte value, i.e. UCHAR_MAX+1 entries; with only
	 * UCHAR_MAX entries the byte 0xFF indexed past the end of the array.
	 */
	signed char	space[UCHAR_MAX+1];
	Last_t		last;
	int		list[2];	/* NOTE: must be last member */
} Cut_t;
1044887Schin 
/*
 * HUGE is used both as the "through end of line" count and as the
 * terminator of the skip/copy list built by cutinit().
 * BLOCK is the size passed to sftmp() for the temporary stream; it is
 * parenthesized so that it expands safely inside larger expressions.
 * The C_* values are the mode bits collected by b_cut().
 */
#define HUGE		(1<<14)
#define BLOCK		(8*1024)
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOCHOP	16
#define C_NONEWLINE	32
1134887Schin 
/*
 * qsort() comparison function: order two integer arrays by their first
 * element only.  Returns a negative, zero or positive value.
 *
 * A three-way comparison is used instead of a subtraction so that the
 * result is correct for every pair of int values (a difference can
 * overflow), and the const qualifier of the arguments is preserved.
 */

static int mycomp(const void *a, const void *b)
{
	const int left = *(const int*)a;
	const int right = *(const int*)b;

	return((left > right) - (left < right));
}
1224887Schin 
1234887Schin static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen)
1244887Schin {
1254887Schin 	register int *lp, c, n=0;
1264887Schin 	register int range = 0;
1274887Schin 	register char *cp = str;
1284887Schin 	Cut_t *cuthdr;
1294887Schin 	if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int))))
1304887Schin 		error(ERROR_exit(1), "out of space");
1314887Schin 	memset(cuthdr->space, 0, sizeof(cuthdr->space));
1324887Schin 	cuthdr->last.seqno = 0;
1334887Schin 	cuthdr->last.seq = 0;
1344887Schin 	cuthdr->last.wdelim = 0;
1354887Schin 	cuthdr->last.ldelim = '\n';
1364887Schin 	cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide());
1374887Schin 	cuthdr->sflag = ((mode&C_SUPRESS)!=0);
1384887Schin 	cuthdr->nlflag = ((mode&C_NONEWLINE)!=0);
1394887Schin 	cuthdr->wdelim = wdelim;
1404887Schin 	cuthdr->ldelim = ldelim;
1414887Schin 	cuthdr->reclen = reclen;
1424887Schin 	cuthdr->seqno = ++cuthdr->last.seqno;
1434887Schin 	lp = cuthdr->list;
1444887Schin 	while(1) switch(c= *cp++)
1454887Schin 	{
1464887Schin 		case ' ':
1474887Schin 		case '\t':
1484887Schin 			while(*cp==' ' || *cp=='\t')
1494887Schin 				cp++;
1504887Schin 		case 0:
1514887Schin 		case ',':
1524887Schin 			if(range)
1534887Schin 			{
1544887Schin 				--range;
1554887Schin 				if((n = (n==0?HUGE:n-range)) < 0)
1564887Schin 					error(ERROR_exit(1),"invalid range for c/f option");
1574887Schin 				*lp++ = range;
1584887Schin 				*lp++ = n;
1594887Schin 			}
1604887Schin 			else
1614887Schin 			{
1624887Schin 				*lp++ = --n;
1634887Schin 				*lp++ = 1;
1644887Schin 			}
1654887Schin 			if(c==0)
1664887Schin 			{
1674887Schin 				register int *dp;
1684887Schin 				*lp = HUGE;
1694887Schin 				n = 1 + (lp-cuthdr->list)/2;
1704887Schin 				qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp);
1714887Schin 				/* eliminate overlapping regions */
1724887Schin 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
1734887Schin 				{
1744887Schin 					if(lp[0] <= range)
1754887Schin 					{
1764887Schin 						if(lp[1]==HUGE)
1774887Schin 						{
1784887Schin 							dp[-1] = HUGE;
1794887Schin 							break;
1804887Schin 						}
1814887Schin 						if((c = lp[0]+lp[1]-range)>0)
1824887Schin 						{
1834887Schin 							range += c;
1844887Schin 							dp[-1] += c;
1854887Schin 						}
1864887Schin 					}
1874887Schin 					else
1884887Schin 					{
1894887Schin 						range = *dp++ = lp[0];
1904887Schin 						if(lp[1]==HUGE)
1914887Schin 						{
1924887Schin 							*dp++ = HUGE;
1934887Schin 							break;
1944887Schin 						}
1954887Schin 						range += (*dp++ = lp[1]);
1964887Schin 					}
1974887Schin 				}
1984887Schin 				*dp = HUGE;
1994887Schin 				lp = cuthdr->list;
2004887Schin 				/* convert ranges into gaps */
2014887Schin 				for(n=0; *lp!=HUGE; lp+=2)
2024887Schin 				{
2034887Schin 					c = *lp;
2044887Schin 					*lp -= n;
2054887Schin 					n = c+lp[1];
2064887Schin 				}
2074887Schin 				return(cuthdr);
2084887Schin 			}
2094887Schin 			n = range = 0;
2104887Schin 			break;
2114887Schin 
2124887Schin 		case '-':
2134887Schin 			if(range)
2144887Schin 				error(ERROR_exit(1),"bad list for c/f option");
2154887Schin 			range = n?n:1;
2164887Schin 			n = 0;
2174887Schin 			break;
2184887Schin 
2194887Schin 		default:
2204887Schin 			if(!isdigit(c))
2214887Schin 				error(ERROR_exit(1),"bad list for c/f option");
2224887Schin 			n = 10*n + (c-'0');
2234887Schin 	}
2244887Schin 	/* NOTREACHED */
2254887Schin }
2264887Schin 
/*
 * Return the number of bytes occupied by the first <n> multibyte
 * characters of <str>, where <str> holds <inlen> bytes.
 *
 * If <str> holds fewer than <n> characters, <inlen>+1 is returned.
 * A byte sequence that mblen() rejects counts as one single-byte
 * character.  A NUL byte, for which mblen() returns 0, also counts as one
 * single-byte character; otherwise the scan would stay on the NUL and
 * use up the remaining count without advancing.
 */
static int advance(const char *str, register int n, register int inlen)
{
	register const char *cp = str;
	register int len = inlen;
	register int size;

	while(len>0 && n-->0)
	{
		size = mblen(cp, len);
		if(size<1)
			size = 1;
		cp += size;
		len -= size;
	}
	if(n>0)
		return(inlen+1);
	return(cp-str);
}
2474887Schin 
2484887Schin /*
2494887Schin  * cut each line of file <fdin> and put results to <fdout> using list <list>
2504887Schin  */
2514887Schin 
252*8462SApril.Chin@Sun.COM static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout)
2534887Schin {
2544887Schin 	register int		c, ncol=0,len;
2554887Schin 	register const int	*lp = cuthdr->list;
2564887Schin 	register char		*inp;
2574887Schin 	register int		skip; /* non-zero for don't copy */
2584887Schin 	while(1)
2594887Schin 	{
2604887Schin 		if(len = cuthdr->reclen)
2614887Schin 			inp = sfreserve(fdin, len, -1);
2624887Schin 		else
2634887Schin 			inp = sfgetr(fdin, '\n', 0);
2644887Schin 		if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR)))
2654887Schin 			break;
2664887Schin 		len = sfvalue(fdin);
2674887Schin 		if((ncol = skip  = *(lp = cuthdr->list)) == 0)
2684887Schin 			ncol = *++lp;
2694887Schin 		while(1)
2704887Schin 		{
2714887Schin 			if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len)
2724887Schin 				c = len;
2734887Schin 			else if(c==len && !skip)
2744887Schin 				ncol++;
2754887Schin 			ncol -= c;
2764887Schin 			if(!skip && sfwrite(fdout,(char*)inp,c)<0)
277*8462SApril.Chin@Sun.COM 				return;
2784887Schin 			inp += c;
2794887Schin 			if(ncol)
2804887Schin 				break;
2814887Schin 			len -= c;
2824887Schin 			ncol = *++lp;
2834887Schin 			skip = !skip;
2844887Schin 		}
2854887Schin 		if(!cuthdr->nlflag && (skip || cuthdr->reclen))
2864887Schin 			sfputc(fdout,cuthdr->ldelim);
2874887Schin 	}
2884887Schin }
2894887Schin 
2904887Schin /*
2914887Schin  * cut each line of file <fdin> and put results to <fdout> using list <list>
2924887Schin  * stream <fdin> must be line buffered
2934887Schin  */
2944887Schin 
/*
 * endline(c) is true when <c>, a value taken from Cut_t.space[], marks a
 * line delimiter (stored there as -1).  Because (signed char)-1 is always
 * negative the test reduces to (c)<0; the second arm is never selected.
 */
2954887Schin #define endline(c)	(((signed char)-1)<0?(c)<0:(c)==((char)-1))
2964887Schin 
/*
 * Cut delimiter separated fields from <fdin> and write them to <fdout>.
 *
 * Input is taken one sfreserve() buffer at a time.  Within a buffer, lines
 * end at cuthdr->ldelim and fields are separated by cuthdr->wdelim, both
 * looked up through cuthdr->space[].  cuthdr->list is read as alternating
 * skip and copy counts.  A line may continue into the next buffer; that
 * state is carried in <inword>, <copy>, <nfields>, <nodelim> and <empty>.
 * While such a line has shown no delimiter yet and -s is not in effect,
 * its text is saved in a temporary stream so it can be written whole if
 * it turns out to contain no delimiter at all.
 */
297*8462SApril.Chin@Sun.COM static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout)
2984887Schin {
2994887Schin 	register unsigned char *cp;
3004887Schin 	register int c, nfields;
3014887Schin 	register const int *lp = cuthdr->list;
3024887Schin 	register unsigned char *copy;
3034887Schin 	register int nodelim, empty, inword=0;
3044887Schin 	register unsigned char *endbuff;
3054887Schin 	unsigned char *inbuff, *first;
3064887Schin 	int lastchar;
3074887Schin 	Sfio_t *fdtmp = 0;
3084887Schin 	long offset = 0;
	/*
	 * (re)build the character class table when it was set up for another
	 * descriptor state: clear the old delimiters, then mark the field
	 * delimiter with 1 and the line delimiter with -1
	 */
3094887Schin 	if(cuthdr->seqno != cuthdr->last.seq)
3104887Schin 	{
3114887Schin 		cuthdr->space[cuthdr->last.ldelim] = 0;
3124887Schin 		cuthdr->space[cuthdr->last.wdelim] = 0;
3134887Schin 		cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1;
3144887Schin 		cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1;
3154887Schin 		cuthdr->last.seq = cuthdr->seqno;
3164887Schin 	}
3174887Schin 	/* process each buffer */
3184887Schin 	while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0)
3194887Schin 	{
3204887Schin 		cp = inbuff;
		/*
		 * endbuff addresses the last byte of the buffer; that byte is
		 * remembered in lastchar and overwritten with the line delimiter
		 * so the scan below always stops inside the buffer
		 */
3214887Schin 		endbuff = cp + --c;
3224887Schin 		if((lastchar = cp[c]) != cuthdr->ldelim)
3234887Schin 			*endbuff = cuthdr->ldelim;
3244887Schin 		/* process each line in the buffer */
3254887Schin 		while(cp <= endbuff)
3264887Schin 		{
3274887Schin 			first = cp;
3284887Schin 			if(!inword)
3294887Schin 			{
				/*
				 * start of a new line: reset the per-line state and load
				 * the first list entry; a non-zero entry is a skip count,
				 * a zero entry means copying starts with the first field
				 */
3304887Schin 				nodelim = empty = 1;
3314887Schin 				copy = cp;
3324887Schin 				if(nfields = *(lp = cuthdr->list))
3334887Schin 					copy = 0;
3344887Schin 				else
3354887Schin 					nfields = *++lp;
3364887Schin 			}
			/* line continued from the previous buffer while copying */
3374887Schin 			else if(copy)
3384887Schin 				copy = cp;
3394887Schin 			inword = 0;
3404887Schin 			while(!inword)
3414887Schin 			{
3424887Schin 				/* skip over non-delimiter characters */
3434887Schin 				while(!(c=cuthdr->space[*cp++]));
3444887Schin 				/* check for end-of-line */
3454887Schin 				if(endline(c))
3464887Schin 				{
					/* a line delimiter before the last byte is a real end of line */
3474887Schin 					if(cp<=endbuff)
3484887Schin 						break;
					/* stopped on the last byte: classify the byte that was really there */
3494887Schin 					if((c=cuthdr->space[lastchar]),endline(c))
3504887Schin 						break;
3514887Schin 					/* restore the last character of the buffer */
3524887Schin 					if(lastchar != cuthdr->ldelim)
3534887Schin 						*endbuff = lastchar;
					/* the line continues in the next buffer */
3544887Schin 					inword++;
					/* an ordinary last byte ends this pass; a field delimiter is handled below */
3554887Schin 					if(!c)
3564887Schin 						break;
3574887Schin 				}
				/* a field delimiter was found */
3584887Schin 				nodelim = 0;
3594887Schin 				if(--nfields >0)
3604887Schin 					continue;
				/* current entry exhausted: move on and toggle between copying and skipping */
3614887Schin 				nfields = *++lp;
3624887Schin 				if(copy)
3634887Schin 				{
3644887Schin 					empty = 0;
					/* write the copied fields, without the delimiter just seen */
3654887Schin 					if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0)
3664887Schin 						goto failed;
3674887Schin 					copy = 0;
3684887Schin 				}
3694887Schin 				else
3704887Schin 					/* set to delimiter unless the first field */
3714887Schin 					copy = cp -!empty;
3724887Schin 			}
			/* a complete line has been scanned */
3734887Schin 			if(!inword)
3744887Schin 			{
3754887Schin 				if(!copy)
3764887Schin 				{
3774887Schin 					if(nodelim)
3784887Schin 					{
						/* no delimiter in the line: pass it through unless -s */
3794887Schin 						if(!cuthdr->sflag)
3804887Schin 						{
							/* first write the part saved from earlier buffers */
3814887Schin 							if(offset)
3824887Schin 							{
3834887Schin 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
3844887Schin 								sfmove(fdtmp,fdout,offset,-1);
3854887Schin 							}
3864887Schin 							copy = first;
3874887Schin 						}
3884887Schin 					}
3894887Schin 					else
						/* line ended while skipping: terminate the output line (always '\n') */
3904887Schin 						sfputc(fdout,'\n');
3914887Schin 				}
				/* discard whatever was saved for this line */
3924887Schin 				if(offset)
3934887Schin 					sfseek(fdtmp,offset=0,SEEK_SET);
3944887Schin 			}
			/* write the pending copy region up to the current position */
3954887Schin 			if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
3964887Schin 				goto failed;
3974887Schin 		}
3984887Schin 		/* see whether to save in tmp file */
399*8462SApril.Chin@Sun.COM 		if(inword && nodelim && !cuthdr->sflag && (c=cp-first)>0)
4004887Schin 		{
4014887Schin 			/* copy line to tmpfile in case no fields */
4024887Schin 			if(!fdtmp)
4034887Schin 				fdtmp = sftmp(BLOCK);
4044887Schin 			sfwrite(fdtmp,(char*)first,c);
4054887Schin 			offset +=c;
4064887Schin 		}
4074887Schin 	}
	/* reached on a write failure as well as at end of input */
4084887Schin failed:
4094887Schin 	if(fdtmp)
4104887Schin 		sfclose(fdtmp);
4114887Schin }
4124887Schin 
4134887Schin int
4144887Schin b_cut(int argc,char *argv[], void* context)
4154887Schin {
4164887Schin 	register char *cp = 0;
4174887Schin 	register Sfio_t *fp;
4184887Schin 	int	n;
4194887Schin 	Cut_t	*cuthdr;
4204887Schin 	int	mode = 0;
4214887Schin 	int	wdelim = '\t';
4224887Schin 	int	ldelim = '\n';
4234887Schin 	size_t	reclen = 0;
4244887Schin 
4254887Schin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
4264887Schin 	while (n = optget(argv, usage)) switch (n)
4274887Schin 	{
4284887Schin 	  case 'b':
4294887Schin 	  case 'c':
4304887Schin 		if(mode&C_FIELDS)
4314887Schin 		{
4324887Schin 			error(2, "f option already specified");
4334887Schin 			break;
4344887Schin 		}
4354887Schin 		cp = opt_info.arg;
4364887Schin 		if(n=='b')
4374887Schin 			mode |= C_BYTES;
4384887Schin 		else
4394887Schin 			mode |= C_CHARS;
4404887Schin 		break;
4414887Schin 	  case 'D':
4424887Schin 		ldelim = *(unsigned char*)opt_info.arg;
4434887Schin 		break;
4444887Schin 	  case 'd':
4454887Schin 		wdelim = *(unsigned char*)opt_info.arg;
4464887Schin 		break;
4474887Schin 	  case 'f':
4484887Schin 		if(mode&(C_CHARS|C_BYTES))
4494887Schin 		{
4504887Schin 			error(2, "c option already specified");
4514887Schin 			break;
4524887Schin 		}
4534887Schin 		cp = opt_info.arg;
4544887Schin 		mode |= C_FIELDS;
4554887Schin 		break;
4564887Schin 	  case 'n':
4574887Schin 		mode |= C_NOCHOP;
4584887Schin 		break;
4594887Schin 	  case 'N':
4604887Schin 		mode |= C_NONEWLINE;
4614887Schin 		break;
4624887Schin 	  case 'R':
4634887Schin 	  case 'r':
4644887Schin 		if(opt_info.num>0)
4654887Schin 			reclen = opt_info.num;
4664887Schin 		break;
4674887Schin 	  case 's':
4684887Schin 		mode |= C_SUPRESS;
4694887Schin 		break;
4704887Schin 	  case ':':
4714887Schin 		error(2, "%s", opt_info.arg);
4724887Schin 		break;
4734887Schin 	  case '?':
4744887Schin 		error(ERROR_usage(2), "%s", opt_info.arg);
4754887Schin 		break;
4764887Schin 	}
4774887Schin 	argv += opt_info.index;
4784887Schin 	if (error_info.errors)
4794887Schin 		error(ERROR_usage(2), "%s",optusage(NiL));
4804887Schin 	if(!cp)
4814887Schin 	{
4824887Schin 		error(2, "b, c or f option must be specified");
4834887Schin 		error(ERROR_usage(2), "%s", optusage(NiL));
4844887Schin 	}
4854887Schin 	if(!*cp)
4864887Schin 		error(3, "non-empty b, c or f option must be specified");
4874887Schin 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
4884887Schin 		error(3, "s option requires f option");
4894887Schin 	cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen);
4904887Schin 	if(cp = *argv)
4914887Schin 		argv++;
4924887Schin 	do
4934887Schin 	{
4944887Schin 		if(!cp || streq(cp,"-"))
4954887Schin 			fp = sfstdin;
4964887Schin 		else if(!(fp = sfopen(NiL,cp,"r")))
4974887Schin 		{
4984887Schin 			error(ERROR_system(0),"%s: cannot open",cp);
4994887Schin 			continue;
5004887Schin 		}
5014887Schin 		if(mode&C_FIELDS)
5024887Schin 			cutfields(cuthdr,fp,sfstdout);
5034887Schin 		else
5044887Schin 			cutcols(cuthdr,fp,sfstdout);
5054887Schin 		if(fp!=sfstdin)
5064887Schin 			sfclose(fp);
507*8462SApril.Chin@Sun.COM 	} while(cp = *argv++);
508*8462SApril.Chin@Sun.COM 	if (sfsync(sfstdout))
509*8462SApril.Chin@Sun.COM 		error(ERROR_system(0), "write error");
5104887Schin 	return(error_info.errors?1:0);
5114887Schin }
512