14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*12068SRoger.Faulkner@Oracle.COM * Copyright (c) 1992-2010 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 78462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * * 204887Schin ***********************************************************************/ 214887Schin #pragma prototyped 224887Schin /* 234887Schin * uniq 244887Schin * 254887Schin * Written by David Korn 264887Schin */ 274887Schin 284887Schin static const char usage[] = 29*12068SRoger.Faulkner@Oracle.COM "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]" 304887Schin USAGE_LICENSE 314887Schin "[+NAME?uniq - Report or filter out repeated lines in a file]" 3210898Sroland.mainz@nrubsig.org "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and " 3310898Sroland.mainz@nrubsig.org "writes one copy of each input line on the output. The second " 344887Schin "and succeeding copies of the repeated adjacent lines are not " 354887Schin "written.]" 364887Schin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes " 374887Schin "to standard output. If no \ainfile\a is given, or if the \ainfile\a " 3810898Sroland.mainz@nrubsig.org "is \b-\b, \buniq\b reads from standard input with the start of " 3910898Sroland.mainz@nrubsig.org "the file defined as the current offset.]" 404887Schin "[c:count?Output the number of times each line occurred along with " 414887Schin "the line.]" 424887Schin "[d:repeated|duplicates?Output the first of each duplicate line.]" 434887Schin "[D:all-repeated?Output all duplicate lines as a group with an empty " 444887Schin "line delimiter specified by \adelimit\a:]:?[delimit:=none]" 454887Schin "{" 464887Schin "[n:none?Do not delimit duplicate groups.]" 474887Schin "[p:prepend?Prepend an empty line before each group.]" 484887Schin "[s:separate?Separate each group with an empty line.]" 494887Schin "}" 504887Schin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over " 514887Schin "before checking for uniqueness. A field is the minimal string matching " 5210898Sroland.mainz@nrubsig.org "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to " 5310898Sroland.mainz@nrubsig.org "\b--skip-fields\b=\anumber\a.]" 544887Schin "[i:ignore-case?Ignore case in comparisons.]" 554887Schin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over " 564887Schin "before checking for uniqueness. If specified along with \b-f\b, " 574887Schin "the first \achars\a after the first \afields\a are ignored. If " 584887Schin "the \achars\a specifies more characters than are on the line, " 5910898Sroland.mainz@nrubsig.org "an empty string will be used for comparison. +\anumber\a is " 6010898Sroland.mainz@nrubsig.org "equivalent to \b--skip-chars\b=\anumber\a.]" 614887Schin "[u:unique?Output unique lines.]" 624887Schin "[w:check-chars]#[chars?\achars\a is the number of characters to compare " 634887Schin "after skipping any specified fields and characters.]" 644887Schin "\n" 654887Schin "\n[infile [outfile]]\n" 664887Schin "\n" 674887Schin "[+EXIT STATUS?]{" 684887Schin "[+0?The input file was successfully processed.]" 694887Schin "[+>0?An error occurred.]" 704887Schin "}" 714887Schin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]" 724887Schin ; 734887Schin 744887Schin #include <cmd.h> 754887Schin 764887Schin #define C_FLAG 1 774887Schin #define D_FLAG 2 784887Schin #define U_FLAG 4 794887Schin 804887Schin #define CWIDTH 4 814887Schin #define MAXCNT 9999 824887Schin 834887Schin typedef int (*Compare_f)(const char*, const char*, size_t); 844887Schin 854887Schin static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare) 864887Schin { 87*12068SRoger.Faulkner@Oracle.COM register int n, f, outsize=0, mb = mbwide(); 88*12068SRoger.Faulkner@Oracle.COM register char *cp, *ep, *mp, *bufp, *outp; 894887Schin char *orecp, *sbufp=0, *outbuff; 904887Schin int reclen,oreclen= -1,count=0,cwidth=0,sep,next; 914887Schin if(mode&C_FLAG) 924887Schin cwidth = CWIDTH+1; 934887Schin while(1) 944887Schin { 954887Schin if(bufp = sfgetr(fdin,'\n',0)) 964887Schin n = sfvalue(fdin); 974887Schin else if(bufp = sfgetr(fdin,'\n',SF_LASTR)) 984887Schin { 994887Schin n = sfvalue(fdin); 1004887Schin bufp = memcpy(fmtbuf(n + 1), bufp, n); 1014887Schin bufp[n++] = '\n'; 1024887Schin } 1034887Schin else 1044887Schin n = 0; 105*12068SRoger.Faulkner@Oracle.COM if (n) 1064887Schin { 1074887Schin cp = bufp; 1084887Schin ep = cp + n; 109*12068SRoger.Faulkner@Oracle.COM if (f = fields) 110*12068SRoger.Faulkner@Oracle.COM while (f-->0 && cp<ep) /* skip over fields */ 1114887Schin { 112*12068SRoger.Faulkner@Oracle.COM while (cp<ep && *cp==' ' || *cp=='\t') 1134887Schin cp++; 114*12068SRoger.Faulkner@Oracle.COM while (cp<ep && *cp!=' ' && *cp!='\t') 1154887Schin cp++; 1164887Schin } 117*12068SRoger.Faulkner@Oracle.COM if (chars) 118*12068SRoger.Faulkner@Oracle.COM { 119*12068SRoger.Faulkner@Oracle.COM if (mb) 120*12068SRoger.Faulkner@Oracle.COM for (f = chars; f; f--) 121*12068SRoger.Faulkner@Oracle.COM mbchar(cp); 122*12068SRoger.Faulkner@Oracle.COM else 123*12068SRoger.Faulkner@Oracle.COM cp += chars; 124*12068SRoger.Faulkner@Oracle.COM } 125*12068SRoger.Faulkner@Oracle.COM if ((reclen = n - (cp - bufp)) <= 0) 1264887Schin { 1274887Schin reclen = 1; 128*12068SRoger.Faulkner@Oracle.COM cp = bufp + n - 1; 1294887Schin } 130*12068SRoger.Faulkner@Oracle.COM else if (width >= 0 && width < reclen) 131*12068SRoger.Faulkner@Oracle.COM { 132*12068SRoger.Faulkner@Oracle.COM if (mb) 133*12068SRoger.Faulkner@Oracle.COM { 134*12068SRoger.Faulkner@Oracle.COM reclen = 0; 135*12068SRoger.Faulkner@Oracle.COM mp = cp; 136*12068SRoger.Faulkner@Oracle.COM while (reclen < width && mp < ep) 137*12068SRoger.Faulkner@Oracle.COM { 138*12068SRoger.Faulkner@Oracle.COM reclen++; 139*12068SRoger.Faulkner@Oracle.COM mbchar(mp); 140*12068SRoger.Faulkner@Oracle.COM } 141*12068SRoger.Faulkner@Oracle.COM reclen = mp - cp; 142*12068SRoger.Faulkner@Oracle.COM } 143*12068SRoger.Faulkner@Oracle.COM else 144*12068SRoger.Faulkner@Oracle.COM reclen = width; 145*12068SRoger.Faulkner@Oracle.COM } 1464887Schin } 1474887Schin else 148*12068SRoger.Faulkner@Oracle.COM reclen = -2; 1494887Schin if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen))) 1504887Schin { 1514887Schin count++; 1524887Schin if (!all) 1534887Schin continue; 1544887Schin next = count; 1554887Schin } 1564887Schin else 1574887Schin { 1584887Schin next = 0; 1594887Schin if(outsize>0) 1604887Schin { 1614887Schin if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count)) 1624887Schin { 1634887Schin if(outp!=sbufp) 1644887Schin sfwrite(fdout,outp,0); 1654887Schin } 1664887Schin else 1674887Schin { 1684887Schin if(cwidth) 1694887Schin { 17010898Sroland.mainz@nrubsig.org if(count<9) 17110898Sroland.mainz@nrubsig.org { 17210898Sroland.mainz@nrubsig.org f = 0; 17310898Sroland.mainz@nrubsig.org while(f < CWIDTH-1) 17410898Sroland.mainz@nrubsig.org outp[f++] = ' '; 17510898Sroland.mainz@nrubsig.org outp[f++] = '0' + count + 1; 17610898Sroland.mainz@nrubsig.org outp[f] = ' '; 17710898Sroland.mainz@nrubsig.org } 17810898Sroland.mainz@nrubsig.org else if(count<MAXCNT) 1794887Schin { 18010898Sroland.mainz@nrubsig.org count++; 18110898Sroland.mainz@nrubsig.org f = CWIDTH; 18210898Sroland.mainz@nrubsig.org outp[f--] = ' '; 18310898Sroland.mainz@nrubsig.org do 18410898Sroland.mainz@nrubsig.org { 18510898Sroland.mainz@nrubsig.org outp[f--] = '0' + (count % 10); 18610898Sroland.mainz@nrubsig.org } while (count /= 10); 18710898Sroland.mainz@nrubsig.org while (f >= 0) 18810898Sroland.mainz@nrubsig.org outp[f--] = ' '; 1894887Schin } 1904887Schin else 1914887Schin { 1924887Schin outsize -= (CWIDTH+1); 1934887Schin if(outp!=sbufp) 1944887Schin { 1954887Schin if(!(sbufp=fmtbuf(outsize))) 1964887Schin return(1); 1974887Schin memcpy(sbufp,outp+CWIDTH+1,outsize); 1984887Schin sfwrite(fdout,outp,0); 1994887Schin outp = sbufp; 2004887Schin } 2014887Schin else 2024887Schin outp += CWIDTH+1; 2034887Schin sfprintf(fdout,"%4d ",count+1); 2044887Schin } 2054887Schin } 2064887Schin if(sfwrite(fdout,outp,outsize) != outsize) 2074887Schin return(1); 2084887Schin } 2094887Schin } 2104887Schin } 2114887Schin if(n==0) 2124887Schin break; 2134887Schin if(count = next) 2144887Schin { 2154887Schin if(sfwrite(fdout,outp,outsize) != outsize) 2164887Schin return(1); 2174887Schin if(*all >= 0) 2184887Schin *all = 1; 2194887Schin sep = 0; 2204887Schin } 2214887Schin else 2224887Schin sep = all && *all > 0; 2234887Schin /* save current record */ 2244887Schin if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0) 2254887Schin return(1); 2264887Schin outp = outbuff; 2274887Schin if(outsize < n+cwidth+sep) 2284887Schin { 2294887Schin /* no room in outp, clear lock and use side buffer */ 2304887Schin sfwrite(fdout,outp,0); 2314887Schin if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep))) 2324887Schin return(1); 2334887Schin } 2344887Schin else 2354887Schin outsize = n+cwidth+sep; 2364887Schin memcpy(outp+cwidth+sep,bufp,n); 2374887Schin if(sep) 2384887Schin outp[cwidth] = '\n'; 2394887Schin oreclen = reclen; 2404887Schin orecp = outp+cwidth+sep + (cp-bufp); 2414887Schin } 2424887Schin return(0); 2434887Schin } 2444887Schin 2454887Schin int 2464887Schin b_uniq(int argc, char** argv, void* context) 2474887Schin { 2484887Schin register int n, mode=0; 2494887Schin register char *cp; 2504887Schin int fields=0, chars=0, width=-1; 2514887Schin Sfio_t *fpin, *fpout; 2524887Schin int* all = 0; 2534887Schin int sep; 2544887Schin Compare_f compare = (Compare_f)memcmp; 2554887Schin 2564887Schin cmdinit(argc, argv, context, ERROR_CATALOG, 0); 2574887Schin while (n = optget(argv, usage)) switch (n) 2584887Schin { 2594887Schin case 'c': 2604887Schin mode |= C_FLAG; 2614887Schin break; 2624887Schin case 'd': 2634887Schin mode |= D_FLAG; 2644887Schin break; 2654887Schin case 'D': 2664887Schin mode |= D_FLAG; 2674887Schin switch ((int)opt_info.num) 2684887Schin { 2694887Schin case 'p': 2704887Schin sep = 1; 2714887Schin break; 2724887Schin case 's': 2734887Schin sep = 0; 2744887Schin break; 2754887Schin default: 2764887Schin sep = -1; 2774887Schin break; 2784887Schin } 2794887Schin all = &sep; 2804887Schin break; 2814887Schin case 'i': 2824887Schin compare = (Compare_f)strncasecmp; 2834887Schin break; 2844887Schin case 'u': 2854887Schin mode |= U_FLAG; 2864887Schin break; 2874887Schin case 'f': 2884887Schin if(*opt_info.option=='-') 2894887Schin fields = opt_info.num; 2904887Schin else 2914887Schin chars = opt_info.num; 2924887Schin break; 2934887Schin case 's': 2944887Schin chars = opt_info.num; 2954887Schin break; 2964887Schin case 'w': 2974887Schin width = opt_info.num; 2984887Schin break; 2994887Schin case ':': 3004887Schin error(2, "%s", opt_info.arg); 3014887Schin break; 3024887Schin case '?': 3034887Schin error(ERROR_usage(2), "%s", opt_info.arg); 3044887Schin break; 3054887Schin } 3064887Schin argv += opt_info.index; 3074887Schin if(all && (mode&C_FLAG)) 3084887Schin error(2, "-c and -D are mutually exclusive"); 3094887Schin if(error_info.errors) 3104887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 3114887Schin if((cp = *argv) && (argv++,!streq(cp,"-"))) 3124887Schin { 3134887Schin if(!(fpin = sfopen(NiL,cp,"r"))) 3144887Schin error(ERROR_system(1),"%s: cannot open",cp); 3154887Schin } 3164887Schin else 3174887Schin fpin = sfstdin; 3184887Schin if(cp = *argv) 3194887Schin { 3204887Schin argv++; 3214887Schin if(!(fpout = sfopen(NiL,cp,"w"))) 3224887Schin error(ERROR_system(1),"%s: cannot create",cp); 3234887Schin } 3244887Schin else 3254887Schin fpout = sfstdout; 3264887Schin if(*argv) 3274887Schin { 3284887Schin error(2, "too many arguments"); 3294887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 3304887Schin } 3314887Schin error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare); 3324887Schin if(fpin!=sfstdin) 3334887Schin sfclose(fpin); 3344887Schin if(fpout!=sfstdout) 3354887Schin sfclose(fpout); 3364887Schin return(error_info.errors); 3374887Schin } 3384887Schin 339