xref: /onnv-gate/usr/src/lib/libcmd/common/uniq.c (revision 12068:08a39a083754)
14887Schin /***********************************************************************
24887Schin *                                                                      *
34887Schin *               This software is part of the ast package               *
4*12068SRoger.Faulkner@Oracle.COM *          Copyright (c) 1992-2010 AT&T Intellectual Property          *
54887Schin *                      and is licensed under the                       *
64887Schin *                  Common Public License, Version 1.0                  *
78462SApril.Chin@Sun.COM *                    by AT&T Intellectual Property                     *
84887Schin *                                                                      *
94887Schin *                A copy of the License is available at                 *
104887Schin *            http://www.opensource.org/licenses/cpl1.0.txt             *
114887Schin *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
124887Schin *                                                                      *
134887Schin *              Information and Software Systems Research               *
144887Schin *                            AT&T Research                             *
154887Schin *                           Florham Park NJ                            *
164887Schin *                                                                      *
174887Schin *                 Glenn Fowler <gsf@research.att.com>                  *
184887Schin *                  David Korn <dgk@research.att.com>                   *
194887Schin *                                                                      *
204887Schin ***********************************************************************/
214887Schin #pragma prototyped
224887Schin /*
234887Schin  * uniq
244887Schin  *
254887Schin  * Written by David Korn
264887Schin  */
274887Schin 
284887Schin static const char usage[] =
29*12068SRoger.Faulkner@Oracle.COM "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
304887Schin USAGE_LICENSE
314887Schin "[+NAME?uniq - Report or filter out repeated lines in a file]"
3210898Sroland.mainz@nrubsig.org "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
3310898Sroland.mainz@nrubsig.org 	"writes one copy of each input line on the output.  The second "
344887Schin 	"and succeeding copies of the repeated adjacent lines are not "
354887Schin 	"written.]"
364887Schin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
374887Schin 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
3810898Sroland.mainz@nrubsig.org 	"is \b-\b, \buniq\b reads from standard input with the start of "
3910898Sroland.mainz@nrubsig.org 	"the file defined as the current offset.]"
404887Schin "[c:count?Output the number of times each line occurred  along with "
414887Schin 	"the line.]"
424887Schin "[d:repeated|duplicates?Output the first of each duplicate line.]"
434887Schin "[D:all-repeated?Output all duplicate lines as a group with an empty "
444887Schin     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
454887Schin     "{"
464887Schin         "[n:none?Do not delimit duplicate groups.]"
474887Schin         "[p:prepend?Prepend an empty line before each group.]"
484887Schin         "[s:separate?Separate each group with an empty line.]"
494887Schin     "}"
504887Schin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
514887Schin     "before checking for uniqueness. A field is the minimal string matching "
5210898Sroland.mainz@nrubsig.org     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
5310898Sroland.mainz@nrubsig.org     "\b--skip-fields\b=\anumber\a.]"
544887Schin "[i:ignore-case?Ignore case in comparisons.]"
554887Schin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
564887Schin 	"before checking for uniqueness.  If specified along with \b-f\b, "
574887Schin 	"the first \achars\a after the first \afields\a are ignored.  If "
584887Schin 	"the \achars\a specifies more characters than are on the line, "
5910898Sroland.mainz@nrubsig.org 	"an empty string will be used for comparison. +\anumber\a is "
6010898Sroland.mainz@nrubsig.org 	"equivalent to \b--skip-chars\b=\anumber\a.]"
614887Schin "[u:unique?Output unique lines.]"
624887Schin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
634887Schin 	"after skipping any specified fields and characters.]"
644887Schin "\n"
654887Schin "\n[infile [outfile]]\n"
664887Schin "\n"
674887Schin "[+EXIT STATUS?]{"
684887Schin 	"[+0?The input file was successfully processed.]"
694887Schin 	"[+>0?An error occurred.]"
704887Schin "}"
714887Schin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
724887Schin ;
734887Schin 
744887Schin #include <cmd.h>
754887Schin 
/* bits of the mode mask handed to uniq() */
#define C_FLAG	(1<<0)	/* set by -c */
#define D_FLAG	(1<<1)	/* set by -d and -D */
#define U_FLAG	(1<<2)	/* set by -u */

/* layout of the repeat count written in front of a record under C_FLAG */
#define CWIDTH	4	/* columns used for the count itself */
#define MAXCNT	9999	/* largest count formatted in place by uniq() */

/* key comparison: returns 0 when the first n bytes of a and b match */
typedef int (*Compare_f)(const char* a, const char* b, size_t n);
844887Schin 
/*
 * Copy lines from fdin to fdout, treating adjacent lines whose comparison
 * keys match as repeats of one record.
 *
 * The comparison key of a line is what remains after skipping `fields'
 * blank (space or tab) delimited fields and then `chars' characters,
 * limited to `width' characters when width >= 0.  If nothing remains the
 * key is the last byte of the line.  Two keys match when they have the
 * same byte length and (*compare)() returns 0 for them.  Non-positive
 * `fields' and `chars' skip nothing.
 *
 * mode is a mask of
 *	C_FLAG	prefix each record written with its occurrence count
 *	D_FLAG	drop records that were not repeated
 *	U_FLAG	drop records that were repeated
 *
 * all, when not null, makes every line of a repeated group be written
 * rather than a single representative.  A record that starts a new run is
 * stored with a leading empty line while *all > 0, and *all is set to 1
 * once a repeat has been written unless it is negative.
 *
 * Returns 0 on success, 1 if output space cannot be reserved, a side
 * buffer cannot be obtained, or a write comes up short.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	register int n, f, outsize=0, mb = mbwide();
	register char *cp, *ep, *mp, *bufp, *outp=0;
	char *orecp=0, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;

	/* with C_FLAG each saved record is fronted by room for the count */
	if(mode&C_FLAG)
		cwidth = CWIDTH+1;
	while(1)
	{
		/* fetch the next line; n==0 means end of input */
		if((bufp = sfgetr(fdin,'\n',0)))
			n = sfvalue(fdin);
		else if((bufp = sfgetr(fdin,'\n',SF_LASTR)))
		{
			/* unterminated last line: copy it and append a newline */
			n = sfvalue(fdin);
			if(!(mp = fmtbuf(n + 1)))
				return(1);
			memcpy(mp, bufp, n);
			bufp = mp;
			bufp[n++] = '\n';
		}
		else
			n = 0;
		if (n)
		{
			/* locate the comparison key [cp, cp+reclen) in the line */
			cp = bufp;
			ep = cp + n;
			f = fields;
			while (f-->0 && cp<ep) /* skip over fields */
			{
				/* the bounds check must guard both blank tests */
				while (cp<ep && (*cp==' ' || *cp=='\t'))
					cp++;
				while (cp<ep && *cp!=' ' && *cp!='\t')
					cp++;
			}
			if (chars > 0)
			{
				/*
				 * skip characters but never step outside the
				 * line; running out of line is handled below
				 */
				if (mb)
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* nothing left: compare on the final byte only */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* turn a character limit into a byte length */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* never equals a saved key length */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* repeat of the saved record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* new record or end of input: dispose of the saved one */
			next = 0;
			if(outsize>0)
			{
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/* dropped: a zero length write gives back reserved space */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						/* count holds repeats, so occurrences are count+1 */
						if(count<9)
						{
							/* single digit */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* right justify in CWIDTH columns */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/*
							 * too wide for the reserved columns:
							 * print the count separately and
							 * write the record without its prefix
							 */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if((count = next))
		{
			/* all-repeated mode: write the saved line before replacing it */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		/* remember where the key sits inside the saved copy */
		oreclen = reclen;
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
2444887Schin 
2454887Schin int
b_uniq(int argc,char ** argv,void * context)2464887Schin b_uniq(int argc, char** argv, void* context)
2474887Schin {
2484887Schin 	register int n, mode=0;
2494887Schin 	register char *cp;
2504887Schin 	int fields=0, chars=0, width=-1;
2514887Schin 	Sfio_t *fpin, *fpout;
2524887Schin 	int* all = 0;
2534887Schin 	int sep;
2544887Schin 	Compare_f compare = (Compare_f)memcmp;
2554887Schin 
2564887Schin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
2574887Schin 	while (n = optget(argv, usage)) switch (n)
2584887Schin 	{
2594887Schin 	    case 'c':
2604887Schin 		mode |= C_FLAG;
2614887Schin 		break;
2624887Schin 	    case 'd':
2634887Schin 		mode |= D_FLAG;
2644887Schin 		break;
2654887Schin 	    case 'D':
2664887Schin 		mode |= D_FLAG;
2674887Schin 		switch ((int)opt_info.num)
2684887Schin 		{
2694887Schin 		case 'p':
2704887Schin 			sep = 1;
2714887Schin 			break;
2724887Schin 		case 's':
2734887Schin 			sep = 0;
2744887Schin 			break;
2754887Schin 		default:
2764887Schin 			sep = -1;
2774887Schin 			break;
2784887Schin 		}
2794887Schin 		all = &sep;
2804887Schin 		break;
2814887Schin 	    case 'i':
2824887Schin 		compare = (Compare_f)strncasecmp;
2834887Schin 		break;
2844887Schin 	    case 'u':
2854887Schin 		mode |= U_FLAG;
2864887Schin 		break;
2874887Schin 	    case 'f':
2884887Schin 		if(*opt_info.option=='-')
2894887Schin 			fields = opt_info.num;
2904887Schin 		else
2914887Schin 			chars = opt_info.num;
2924887Schin 		break;
2934887Schin 	    case 's':
2944887Schin 		chars = opt_info.num;
2954887Schin 		break;
2964887Schin 	    case 'w':
2974887Schin 		width = opt_info.num;
2984887Schin 		break;
2994887Schin 	    case ':':
3004887Schin 		error(2, "%s", opt_info.arg);
3014887Schin 		break;
3024887Schin 	    case '?':
3034887Schin 		error(ERROR_usage(2), "%s", opt_info.arg);
3044887Schin 		break;
3054887Schin 	}
3064887Schin 	argv += opt_info.index;
3074887Schin 	if(all && (mode&C_FLAG))
3084887Schin 		error(2, "-c and -D are mutually exclusive");
3094887Schin 	if(error_info.errors)
3104887Schin 		error(ERROR_usage(2), "%s", optusage(NiL));
3114887Schin 	if((cp = *argv) && (argv++,!streq(cp,"-")))
3124887Schin 	{
3134887Schin 		if(!(fpin = sfopen(NiL,cp,"r")))
3144887Schin 			error(ERROR_system(1),"%s: cannot open",cp);
3154887Schin 	}
3164887Schin 	else
3174887Schin 		fpin = sfstdin;
3184887Schin 	if(cp = *argv)
3194887Schin 	{
3204887Schin 		argv++;
3214887Schin 		if(!(fpout = sfopen(NiL,cp,"w")))
3224887Schin 			error(ERROR_system(1),"%s: cannot create",cp);
3234887Schin 	}
3244887Schin 	else
3254887Schin 		fpout = sfstdout;
3264887Schin 	if(*argv)
3274887Schin 	{
3284887Schin 		error(2, "too many arguments");
3294887Schin 		error(ERROR_usage(2), "%s", optusage(NiL));
3304887Schin 	}
3314887Schin 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
3324887Schin 	if(fpin!=sfstdin)
3334887Schin 		sfclose(fpin);
3344887Schin 	if(fpout!=sfstdout)
3354887Schin 		sfclose(fpout);
3364887Schin 	return(error_info.errors);
3374887Schin }
3384887Schin 
339