/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1992-2010 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                                                                      *
***********************************************************************/
214887Schin #pragma prototyped
224887Schin /*
234887Schin * uniq
244887Schin *
254887Schin * Written by David Korn
264887Schin */
274887Schin
284887Schin static const char usage[] =
29*12068SRoger.Faulkner@Oracle.COM "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
304887Schin USAGE_LICENSE
314887Schin "[+NAME?uniq - Report or filter out repeated lines in a file]"
3210898Sroland.mainz@nrubsig.org "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
3310898Sroland.mainz@nrubsig.org "writes one copy of each input line on the output. The second "
344887Schin "and succeeding copies of the repeated adjacent lines are not "
354887Schin "written.]"
364887Schin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
374887Schin "to standard output. If no \ainfile\a is given, or if the \ainfile\a "
3810898Sroland.mainz@nrubsig.org "is \b-\b, \buniq\b reads from standard input with the start of "
3910898Sroland.mainz@nrubsig.org "the file defined as the current offset.]"
404887Schin "[c:count?Output the number of times each line occurred along with "
414887Schin "the line.]"
424887Schin "[d:repeated|duplicates?Output the first of each duplicate line.]"
434887Schin "[D:all-repeated?Output all duplicate lines as a group with an empty "
444887Schin "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
454887Schin "{"
464887Schin "[n:none?Do not delimit duplicate groups.]"
474887Schin "[p:prepend?Prepend an empty line before each group.]"
484887Schin "[s:separate?Separate each group with an empty line.]"
494887Schin "}"
504887Schin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
514887Schin "before checking for uniqueness. A field is the minimal string matching "
5210898Sroland.mainz@nrubsig.org "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
5310898Sroland.mainz@nrubsig.org "\b--skip-fields\b=\anumber\a.]"
544887Schin "[i:ignore-case?Ignore case in comparisons.]"
554887Schin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
564887Schin "before checking for uniqueness. If specified along with \b-f\b, "
574887Schin "the first \achars\a after the first \afields\a are ignored. If "
584887Schin "the \achars\a specifies more characters than are on the line, "
5910898Sroland.mainz@nrubsig.org "an empty string will be used for comparison. +\anumber\a is "
6010898Sroland.mainz@nrubsig.org "equivalent to \b--skip-chars\b=\anumber\a.]"
614887Schin "[u:unique?Output unique lines.]"
624887Schin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
634887Schin "after skipping any specified fields and characters.]"
644887Schin "\n"
654887Schin "\n[infile [outfile]]\n"
664887Schin "\n"
674887Schin "[+EXIT STATUS?]{"
684887Schin "[+0?The input file was successfully processed.]"
694887Schin "[+>0?An error occurred.]"
704887Schin "}"
714887Schin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
724887Schin ;
734887Schin
#include <cmd.h>
754887Schin
/* Output selection bits OR-ed together into the mode argument of uniq(). */
#define C_FLAG	1	/* -c: prefix each output line with its count */
#define D_FLAG	2	/* -d, -D: output repeated lines only */
#define U_FLAG	4	/* -u: output unrepeated lines only */

#define CWIDTH	4	/* digits reserved in front of a line for the count */
#define MAXCNT	9999	/* largest count formatted in place within CWIDTH digits */

/* Key comparison: returns 0 when the first n bytes of the two keys match. */
typedef int (*Compare_f)(const char*, const char*, size_t);
844887Schin
/*
 * Copy fdin to fdout, collapsing runs of adjacent lines whose
 * comparison keys are equal.
 *
 *	fields	number of leading blank/tab separated fields left out of the key
 *	chars	number of characters left out of the key after the fields;
 *		values <= 0 skip nothing
 *	width	maximum number of characters in the key, no limit if negative
 *	mode	OR of C_FLAG (prefix each line with its count), D_FLAG
 *		(output repeated lines only), U_FLAG (output unrepeated
 *		lines only)
 *	all	0, or a pointer to the group delimiter state used when every
 *		line of a repeated group is output: <0 never delimit, 0 do not
 *		delimit until a group has been written, >0 put an empty line
 *		in front of each new group; a value of 0 is changed to 1 once
 *		a repeated line has been written
 *	compare	returns 0 when two keys of the given byte length are equal
 *
 * Returns 0 on success, 1 if a write or a buffer request fails.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	register int n, f, outsize=0, mb = mbwide();
	register char *cp, *ep, *mp, *bufp, *outp = 0;
	char *orecp = 0, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;

	if(mode&C_FLAG)
		cwidth = CWIDTH+1;	/* count digits plus one blank */
	while(1)
	{
		/* next line; a final line without a newline gets one appended */
		if((bufp = sfgetr(fdin,'\n',0)))
			n = sfvalue(fdin);
		else if((bufp = sfgetr(fdin,'\n',SF_LASTR)))
		{
			n = sfvalue(fdin);
			bufp = memcpy(fmtbuf(n + 1), bufp, n);
			bufp[n++] = '\n';
		}
		else
			n = 0;	/* end of input */
		if (n)
		{
			/* locate the comparison key cp[0..reclen-1] inside the line */
			cp = bufp;
			ep = cp + n;
			if ((f = fields))
				while (f-->0 && cp<ep) /* skip over fields */
				{
					/* both character tests must stay inside the line */
					while (cp<ep && (*cp==' ' || *cp=='\t'))
						cp++;
					while (cp<ep && *cp!=' ' && *cp!='\t')
						cp++;
				}
			if (chars > 0)
			{
				/* skip characters, never stepping beyond the end of the line */
				if (mb)
				{
					for (f = chars; f>0 && cp<ep; f--)
						mbchar(cp);
				}
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* nothing left after skipping: the key is the newline alone */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* convert a width in characters into a length in bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* matches no saved key, so the last record is flushed */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* same key as the saved record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* key changed or end of input: dispose of the saved record */
			next = 0;
			if(outsize>0)
			{
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/* record is suppressed; a zero length write clears the lock */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						/* count holds the number of repeats, so occurrences are count+1 */
						if(count<9)
						{
							/* single digit, right justified */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* several digits, filled in from right to left */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while ((count /= 10));
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/*
							 * the count does not fit the reserved field:
							 * drop the field and print the count separately
							 */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								/*
								 * the record is not in the side buffer: copy it
								 * aside and clear the lock before printing to fdout
								 */
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if((count = next))
		{
			/* all-repeated mode: the previous member of the group goes out now */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;	/* empty line in front of a new group */
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		/* remember where the key of the saved record lives */
		oreclen = reclen;
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
2444887Schin
2454887Schin int
b_uniq(int argc,char ** argv,void * context)2464887Schin b_uniq(int argc, char** argv, void* context)
2474887Schin {
2484887Schin register int n, mode=0;
2494887Schin register char *cp;
2504887Schin int fields=0, chars=0, width=-1;
2514887Schin Sfio_t *fpin, *fpout;
2524887Schin int* all = 0;
2534887Schin int sep;
2544887Schin Compare_f compare = (Compare_f)memcmp;
2554887Schin
2564887Schin cmdinit(argc, argv, context, ERROR_CATALOG, 0);
2574887Schin while (n = optget(argv, usage)) switch (n)
2584887Schin {
2594887Schin case 'c':
2604887Schin mode |= C_FLAG;
2614887Schin break;
2624887Schin case 'd':
2634887Schin mode |= D_FLAG;
2644887Schin break;
2654887Schin case 'D':
2664887Schin mode |= D_FLAG;
2674887Schin switch ((int)opt_info.num)
2684887Schin {
2694887Schin case 'p':
2704887Schin sep = 1;
2714887Schin break;
2724887Schin case 's':
2734887Schin sep = 0;
2744887Schin break;
2754887Schin default:
2764887Schin sep = -1;
2774887Schin break;
2784887Schin }
2794887Schin all = &sep;
2804887Schin break;
2814887Schin case 'i':
2824887Schin compare = (Compare_f)strncasecmp;
2834887Schin break;
2844887Schin case 'u':
2854887Schin mode |= U_FLAG;
2864887Schin break;
2874887Schin case 'f':
2884887Schin if(*opt_info.option=='-')
2894887Schin fields = opt_info.num;
2904887Schin else
2914887Schin chars = opt_info.num;
2924887Schin break;
2934887Schin case 's':
2944887Schin chars = opt_info.num;
2954887Schin break;
2964887Schin case 'w':
2974887Schin width = opt_info.num;
2984887Schin break;
2994887Schin case ':':
3004887Schin error(2, "%s", opt_info.arg);
3014887Schin break;
3024887Schin case '?':
3034887Schin error(ERROR_usage(2), "%s", opt_info.arg);
3044887Schin break;
3054887Schin }
3064887Schin argv += opt_info.index;
3074887Schin if(all && (mode&C_FLAG))
3084887Schin error(2, "-c and -D are mutually exclusive");
3094887Schin if(error_info.errors)
3104887Schin error(ERROR_usage(2), "%s", optusage(NiL));
3114887Schin if((cp = *argv) && (argv++,!streq(cp,"-")))
3124887Schin {
3134887Schin if(!(fpin = sfopen(NiL,cp,"r")))
3144887Schin error(ERROR_system(1),"%s: cannot open",cp);
3154887Schin }
3164887Schin else
3174887Schin fpin = sfstdin;
3184887Schin if(cp = *argv)
3194887Schin {
3204887Schin argv++;
3214887Schin if(!(fpout = sfopen(NiL,cp,"w")))
3224887Schin error(ERROR_system(1),"%s: cannot create",cp);
3234887Schin }
3244887Schin else
3254887Schin fpout = sfstdout;
3264887Schin if(*argv)
3274887Schin {
3284887Schin error(2, "too many arguments");
3294887Schin error(ERROR_usage(2), "%s", optusage(NiL));
3304887Schin }
3314887Schin error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
3324887Schin if(fpin!=sfstdin)
3334887Schin sfclose(fpin);
3344887Schin if(fpout!=sfstdout)
3354887Schin sfclose(fpout);
3364887Schin return(error_info.errors);
3374887Schin }
3384887Schin
339