xref: /onnv-gate/usr/src/lib/libcmd/common/uniq.c (revision 10898)
/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1992-2009 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                                                                      *
***********************************************************************/
214887Schin #pragma prototyped
/*
 * uniq
 *
 * Written by David Korn
 */
274887Schin 
284887Schin static const char usage[] =
29*10898Sroland.mainz@nrubsig.org "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-08-10 $\n]"
304887Schin USAGE_LICENSE
314887Schin "[+NAME?uniq - Report or filter out repeated lines in a file]"
32*10898Sroland.mainz@nrubsig.org "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
33*10898Sroland.mainz@nrubsig.org 	"writes one copy of each input line on the output.  The second "
344887Schin 	"and succeeding copies of the repeated adjacent lines are not "
354887Schin 	"written.]"
364887Schin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
374887Schin 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
38*10898Sroland.mainz@nrubsig.org 	"is \b-\b, \buniq\b reads from standard input with the start of "
39*10898Sroland.mainz@nrubsig.org 	"the file defined as the current offset.]"
404887Schin "[c:count?Output the number of times each line occurred  along with "
414887Schin 	"the line.]"
424887Schin "[d:repeated|duplicates?Output the first of each duplicate line.]"
434887Schin "[D:all-repeated?Output all duplicate lines as a group with an empty "
444887Schin     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
454887Schin     "{"
464887Schin         "[n:none?Do not delimit duplicate groups.]"
474887Schin         "[p:prepend?Prepend an empty line before each group.]"
484887Schin         "[s:separate?Separate each group with an empty line.]"
494887Schin     "}"
504887Schin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
514887Schin     "before checking for uniqueness. A field is the minimal string matching "
52*10898Sroland.mainz@nrubsig.org     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
53*10898Sroland.mainz@nrubsig.org     "\b--skip-fields\b=\anumber\a.]"
544887Schin "[i:ignore-case?Ignore case in comparisons.]"
554887Schin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
564887Schin 	"before checking for uniqueness.  If specified along with \b-f\b, "
574887Schin 	"the first \achars\a after the first \afields\a are ignored.  If "
584887Schin 	"the \achars\a specifies more characters than are on the line, "
59*10898Sroland.mainz@nrubsig.org 	"an empty string will be used for comparison. +\anumber\a is "
60*10898Sroland.mainz@nrubsig.org 	"equivalent to \b--skip-chars\b=\anumber\a.]"
614887Schin "[u:unique?Output unique lines.]"
624887Schin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
634887Schin 	"after skipping any specified fields and characters.]"
644887Schin "\n"
654887Schin "\n[infile [outfile]]\n"
664887Schin "\n"
674887Schin "[+EXIT STATUS?]{"
684887Schin 	"[+0?The input file was successfully processed.]"
694887Schin 	"[+>0?An error occurred.]"
704887Schin "}"
714887Schin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
724887Schin ;
734887Schin 
744887Schin #include <cmd.h>
754887Schin 
/* output mode bits, or'ed together into the mode argument of uniq() */
#define C_FLAG	(1<<0)	/* set by -c */
#define D_FLAG	(1<<1)	/* set by -d and -D */
#define U_FLAG	(1<<2)	/* set by -u */

/* number of digit columns in the fixed-width count prefix */
#define CWIDTH	4
/* counts from MAXCNT up are printed with sfprintf instead of in place */
#define MAXCNT	9999

/* memcmp-style comparison: returns 0 when the first n bytes match */
typedef int (*Compare_f)(const char*, const char*, size_t);
844887Schin 
854887Schin static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
864887Schin {
874887Schin 	register int n, f, outsize=0;
884887Schin 	register char *cp, *ep, *bufp, *outp;
894887Schin 	char *orecp, *sbufp=0, *outbuff;
904887Schin 	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;
914887Schin 	if(mode&C_FLAG)
924887Schin 		cwidth = CWIDTH+1;
934887Schin 	while(1)
944887Schin 	{
954887Schin 		if(bufp = sfgetr(fdin,'\n',0))
964887Schin 			n = sfvalue(fdin);
974887Schin 		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
984887Schin 		{
994887Schin 			n = sfvalue(fdin);
1004887Schin 			bufp = memcpy(fmtbuf(n + 1), bufp, n);
1014887Schin 			bufp[n++] = '\n';
1024887Schin 		}
1034887Schin 		else
1044887Schin 			n = 0;
1054887Schin 		if(n)
1064887Schin 		{
1074887Schin 			cp = bufp;
1084887Schin 			ep = cp + n;
1094887Schin 			if(f=fields)
1104887Schin 				while(f-->0 && cp<ep) /* skip over fields */
1114887Schin 				{
1124887Schin 					while(cp<ep && *cp==' ' || *cp=='\t')
1134887Schin 						cp++;
1144887Schin 					while(cp<ep && *cp!=' ' && *cp!='\t')
1154887Schin 						cp++;
1164887Schin 				}
1174887Schin 			if(chars)
1184887Schin 				cp += chars;
1194887Schin 			if((reclen = n - (cp-bufp)) <=0)
1204887Schin 			{
1214887Schin 				reclen = 1;
1224887Schin 				cp = bufp + sfvalue(fdin)-1;
1234887Schin 			}
1244887Schin 			else if(width >= 0 && width < reclen)
1254887Schin 				reclen = width;
1264887Schin 		}
1274887Schin 		else
1284887Schin 			reclen=-2;
1294887Schin 		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
1304887Schin 		{
1314887Schin 			count++;
1324887Schin 			if (!all)
1334887Schin 				continue;
1344887Schin 			next = count;
1354887Schin 		}
1364887Schin 		else
1374887Schin 		{
1384887Schin 			next = 0;
1394887Schin 			if(outsize>0)
1404887Schin 			{
1414887Schin 				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
1424887Schin 				{
1434887Schin 					if(outp!=sbufp)
1444887Schin 						sfwrite(fdout,outp,0);
1454887Schin 				}
1464887Schin 				else
1474887Schin 				{
1484887Schin 					if(cwidth)
1494887Schin 					{
150*10898Sroland.mainz@nrubsig.org 						if(count<9)
151*10898Sroland.mainz@nrubsig.org 						{
152*10898Sroland.mainz@nrubsig.org 							f = 0;
153*10898Sroland.mainz@nrubsig.org 							while(f < CWIDTH-1)
154*10898Sroland.mainz@nrubsig.org 								outp[f++] = ' ';
155*10898Sroland.mainz@nrubsig.org 							outp[f++] = '0' + count + 1;
156*10898Sroland.mainz@nrubsig.org 							outp[f] = ' ';
157*10898Sroland.mainz@nrubsig.org 						}
158*10898Sroland.mainz@nrubsig.org 						else if(count<MAXCNT)
1594887Schin 						{
160*10898Sroland.mainz@nrubsig.org 							count++;
161*10898Sroland.mainz@nrubsig.org 							f = CWIDTH;
162*10898Sroland.mainz@nrubsig.org 							outp[f--] = ' ';
163*10898Sroland.mainz@nrubsig.org 							do
164*10898Sroland.mainz@nrubsig.org 							{
165*10898Sroland.mainz@nrubsig.org 								outp[f--] = '0' + (count % 10);
166*10898Sroland.mainz@nrubsig.org 							} while (count /= 10);
167*10898Sroland.mainz@nrubsig.org 							while (f >= 0)
168*10898Sroland.mainz@nrubsig.org 								outp[f--] = ' ';
1694887Schin 						}
1704887Schin 						else
1714887Schin 						{
1724887Schin 							outsize -= (CWIDTH+1);
1734887Schin 							if(outp!=sbufp)
1744887Schin 							{
1754887Schin 								if(!(sbufp=fmtbuf(outsize)))
1764887Schin 									return(1);
1774887Schin 								memcpy(sbufp,outp+CWIDTH+1,outsize);
1784887Schin 								sfwrite(fdout,outp,0);
1794887Schin 								outp = sbufp;
1804887Schin 							}
1814887Schin 							else
1824887Schin 								outp += CWIDTH+1;
1834887Schin 							sfprintf(fdout,"%4d ",count+1);
1844887Schin 						}
1854887Schin 					}
1864887Schin 					if(sfwrite(fdout,outp,outsize) != outsize)
1874887Schin 						return(1);
1884887Schin 				}
1894887Schin 			}
1904887Schin 		}
1914887Schin 		if(n==0)
1924887Schin 			break;
1934887Schin 		if(count = next)
1944887Schin 		{
1954887Schin 			if(sfwrite(fdout,outp,outsize) != outsize)
1964887Schin 				return(1);
1974887Schin 			if(*all >= 0)
1984887Schin 				*all = 1;
1994887Schin 			sep = 0;
2004887Schin 		}
2014887Schin 		else
2024887Schin 			sep = all && *all > 0;
2034887Schin 		/* save current record */
2044887Schin 		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
2054887Schin 			return(1);
2064887Schin 		outp = outbuff;
2074887Schin 		if(outsize < n+cwidth+sep)
2084887Schin 		{
2094887Schin 			/* no room in outp, clear lock and use side buffer */
2104887Schin 			sfwrite(fdout,outp,0);
2114887Schin 			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
2124887Schin 				return(1);
2134887Schin 		}
2144887Schin 		else
2154887Schin 			outsize = n+cwidth+sep;
2164887Schin 		memcpy(outp+cwidth+sep,bufp,n);
2174887Schin 		if(sep)
2184887Schin 			outp[cwidth] = '\n';
2194887Schin 		oreclen = reclen;
2204887Schin 		orecp = outp+cwidth+sep + (cp-bufp);
2214887Schin 	}
2224887Schin 	return(0);
2234887Schin }
2244887Schin 
2254887Schin int
2264887Schin b_uniq(int argc, char** argv, void* context)
2274887Schin {
2284887Schin 	register int n, mode=0;
2294887Schin 	register char *cp;
2304887Schin 	int fields=0, chars=0, width=-1;
2314887Schin 	Sfio_t *fpin, *fpout;
2324887Schin 	int* all = 0;
2334887Schin 	int sep;
2344887Schin 	Compare_f compare = (Compare_f)memcmp;
2354887Schin 
2364887Schin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
2374887Schin 	while (n = optget(argv, usage)) switch (n)
2384887Schin 	{
2394887Schin 	    case 'c':
2404887Schin 		mode |= C_FLAG;
2414887Schin 		break;
2424887Schin 	    case 'd':
2434887Schin 		mode |= D_FLAG;
2444887Schin 		break;
2454887Schin 	    case 'D':
2464887Schin 		mode |= D_FLAG;
2474887Schin 		switch ((int)opt_info.num)
2484887Schin 		{
2494887Schin 		case 'p':
2504887Schin 			sep = 1;
2514887Schin 			break;
2524887Schin 		case 's':
2534887Schin 			sep = 0;
2544887Schin 			break;
2554887Schin 		default:
2564887Schin 			sep = -1;
2574887Schin 			break;
2584887Schin 		}
2594887Schin 		all = &sep;
2604887Schin 		break;
2614887Schin 	    case 'i':
2624887Schin 		compare = (Compare_f)strncasecmp;
2634887Schin 		break;
2644887Schin 	    case 'u':
2654887Schin 		mode |= U_FLAG;
2664887Schin 		break;
2674887Schin 	    case 'f':
2684887Schin 		if(*opt_info.option=='-')
2694887Schin 			fields = opt_info.num;
2704887Schin 		else
2714887Schin 			chars = opt_info.num;
2724887Schin 		break;
2734887Schin 	    case 's':
2744887Schin 		chars = opt_info.num;
2754887Schin 		break;
2764887Schin 	    case 'w':
2774887Schin 		width = opt_info.num;
2784887Schin 		break;
2794887Schin 	    case ':':
2804887Schin 		error(2, "%s", opt_info.arg);
2814887Schin 		break;
2824887Schin 	    case '?':
2834887Schin 		error(ERROR_usage(2), "%s", opt_info.arg);
2844887Schin 		break;
2854887Schin 	}
2864887Schin 	argv += opt_info.index;
2874887Schin 	if(all && (mode&C_FLAG))
2884887Schin 		error(2, "-c and -D are mutually exclusive");
2894887Schin 	if(error_info.errors)
2904887Schin 		error(ERROR_usage(2), "%s", optusage(NiL));
2914887Schin 	if((cp = *argv) && (argv++,!streq(cp,"-")))
2924887Schin 	{
2934887Schin 		if(!(fpin = sfopen(NiL,cp,"r")))
2944887Schin 			error(ERROR_system(1),"%s: cannot open",cp);
2954887Schin 	}
2964887Schin 	else
2974887Schin 		fpin = sfstdin;
2984887Schin 	if(cp = *argv)
2994887Schin 	{
3004887Schin 		argv++;
3014887Schin 		if(!(fpout = sfopen(NiL,cp,"w")))
3024887Schin 			error(ERROR_system(1),"%s: cannot create",cp);
3034887Schin 	}
3044887Schin 	else
3054887Schin 		fpout = sfstdout;
3064887Schin 	if(*argv)
3074887Schin 	{
3084887Schin 		error(2, "too many arguments");
3094887Schin 		error(ERROR_usage(2), "%s", optusage(NiL));
3104887Schin 	}
3114887Schin 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
3124887Schin 	if(fpin!=sfstdin)
3134887Schin 		sfclose(fpin);
3144887Schin 	if(fpout!=sfstdout)
3154887Schin 		sfclose(fpout);
3164887Schin 	return(error_info.errors);
3174887Schin }
3184887Schin 
319