14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*10898Sroland.mainz@nrubsig.org * Copyright (c) 1992-2009 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 78462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * * 204887Schin ***********************************************************************/ 214887Schin #pragma prototyped 224887Schin /* 234887Schin * uniq 244887Schin * 254887Schin * Written by David Korn 264887Schin */ 274887Schin 284887Schin static const char usage[] = 29*10898Sroland.mainz@nrubsig.org "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-08-10 $\n]" 304887Schin USAGE_LICENSE 314887Schin "[+NAME?uniq - Report or filter out repeated lines in a file]" 32*10898Sroland.mainz@nrubsig.org "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and " 33*10898Sroland.mainz@nrubsig.org "writes one copy of each input line on the output. The second " 344887Schin "and succeeding copies of the repeated adjacent lines are not " 354887Schin "written.]" 364887Schin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes " 374887Schin "to standard output. If no \ainfile\a is given, or if the \ainfile\a " 38*10898Sroland.mainz@nrubsig.org "is \b-\b, \buniq\b reads from standard input with the start of " 39*10898Sroland.mainz@nrubsig.org "the file defined as the current offset.]" 404887Schin "[c:count?Output the number of times each line occurred along with " 414887Schin "the line.]" 424887Schin "[d:repeated|duplicates?Output the first of each duplicate line.]" 434887Schin "[D:all-repeated?Output all duplicate lines as a group with an empty " 444887Schin "line delimiter specified by \adelimit\a:]:?[delimit:=none]" 454887Schin "{" 464887Schin "[n:none?Do not delimit duplicate groups.]" 474887Schin "[p:prepend?Prepend an empty line before each group.]" 484887Schin "[s:separate?Separate each group with an empty line.]" 494887Schin "}" 504887Schin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over " 514887Schin "before checking for uniqueness. A field is the minimal string matching " 52*10898Sroland.mainz@nrubsig.org "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to " 53*10898Sroland.mainz@nrubsig.org "\b--skip-fields\b=\anumber\a.]" 544887Schin "[i:ignore-case?Ignore case in comparisons.]" 554887Schin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over " 564887Schin "before checking for uniqueness. If specified along with \b-f\b, " 574887Schin "the first \achars\a after the first \afields\a are ignored. If " 584887Schin "the \achars\a specifies more characters than are on the line, " 59*10898Sroland.mainz@nrubsig.org "an empty string will be used for comparison. +\anumber\a is " 60*10898Sroland.mainz@nrubsig.org "equivalent to \b--skip-chars\b=\anumber\a.]" 614887Schin "[u:unique?Output unique lines.]" 624887Schin "[w:check-chars]#[chars?\achars\a is the number of characters to compare " 634887Schin "after skipping any specified fields and characters.]" 644887Schin "\n" 654887Schin "\n[infile [outfile]]\n" 664887Schin "\n" 674887Schin "[+EXIT STATUS?]{" 684887Schin "[+0?The input file was successfully processed.]" 694887Schin "[+>0?An error occurred.]" 704887Schin "}" 714887Schin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]" 724887Schin ; 734887Schin 744887Schin #include <cmd.h> 754887Schin 764887Schin #define C_FLAG 1 774887Schin #define D_FLAG 2 784887Schin #define U_FLAG 4 794887Schin 804887Schin #define CWIDTH 4 814887Schin #define MAXCNT 9999 824887Schin 834887Schin typedef int (*Compare_f)(const char*, const char*, size_t); 844887Schin 854887Schin static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare) 864887Schin { 874887Schin register int n, f, outsize=0; 884887Schin register char *cp, *ep, *bufp, *outp; 894887Schin char *orecp, *sbufp=0, *outbuff; 904887Schin int reclen,oreclen= -1,count=0,cwidth=0,sep,next; 914887Schin if(mode&C_FLAG) 924887Schin cwidth = CWIDTH+1; 934887Schin while(1) 944887Schin { 954887Schin if(bufp = sfgetr(fdin,'\n',0)) 964887Schin n = sfvalue(fdin); 974887Schin else if(bufp = sfgetr(fdin,'\n',SF_LASTR)) 984887Schin { 994887Schin n = sfvalue(fdin); 1004887Schin bufp = memcpy(fmtbuf(n + 1), bufp, n); 1014887Schin bufp[n++] = '\n'; 1024887Schin } 1034887Schin else 1044887Schin n = 0; 1054887Schin if(n) 1064887Schin { 1074887Schin cp = bufp; 1084887Schin ep = cp + n; 1094887Schin if(f=fields) 1104887Schin while(f-->0 && cp<ep) /* skip over fields */ 1114887Schin { 1124887Schin while(cp<ep && *cp==' ' || *cp=='\t') 1134887Schin cp++; 1144887Schin while(cp<ep && *cp!=' ' && *cp!='\t') 1154887Schin cp++; 1164887Schin } 1174887Schin if(chars) 1184887Schin cp += chars; 1194887Schin if((reclen = n - (cp-bufp)) <=0) 1204887Schin { 1214887Schin reclen = 1; 1224887Schin cp = bufp + sfvalue(fdin)-1; 1234887Schin } 1244887Schin else if(width >= 0 && width < reclen) 1254887Schin reclen = width; 1264887Schin } 1274887Schin else 1284887Schin reclen=-2; 1294887Schin if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen))) 1304887Schin { 1314887Schin count++; 1324887Schin if (!all) 1334887Schin continue; 1344887Schin next = count; 1354887Schin } 1364887Schin else 1374887Schin { 1384887Schin next = 0; 1394887Schin if(outsize>0) 1404887Schin { 1414887Schin if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count)) 1424887Schin { 1434887Schin if(outp!=sbufp) 1444887Schin sfwrite(fdout,outp,0); 1454887Schin } 1464887Schin else 1474887Schin { 1484887Schin if(cwidth) 1494887Schin { 150*10898Sroland.mainz@nrubsig.org if(count<9) 151*10898Sroland.mainz@nrubsig.org { 152*10898Sroland.mainz@nrubsig.org f = 0; 153*10898Sroland.mainz@nrubsig.org while(f < CWIDTH-1) 154*10898Sroland.mainz@nrubsig.org outp[f++] = ' '; 155*10898Sroland.mainz@nrubsig.org outp[f++] = '0' + count + 1; 156*10898Sroland.mainz@nrubsig.org outp[f] = ' '; 157*10898Sroland.mainz@nrubsig.org } 158*10898Sroland.mainz@nrubsig.org else if(count<MAXCNT) 1594887Schin { 160*10898Sroland.mainz@nrubsig.org count++; 161*10898Sroland.mainz@nrubsig.org f = CWIDTH; 162*10898Sroland.mainz@nrubsig.org outp[f--] = ' '; 163*10898Sroland.mainz@nrubsig.org do 164*10898Sroland.mainz@nrubsig.org { 165*10898Sroland.mainz@nrubsig.org outp[f--] = '0' + (count % 10); 166*10898Sroland.mainz@nrubsig.org } while (count /= 10); 167*10898Sroland.mainz@nrubsig.org while (f >= 0) 168*10898Sroland.mainz@nrubsig.org outp[f--] = ' '; 1694887Schin } 1704887Schin else 1714887Schin { 1724887Schin outsize -= (CWIDTH+1); 1734887Schin if(outp!=sbufp) 1744887Schin { 1754887Schin if(!(sbufp=fmtbuf(outsize))) 1764887Schin return(1); 1774887Schin memcpy(sbufp,outp+CWIDTH+1,outsize); 1784887Schin sfwrite(fdout,outp,0); 1794887Schin outp = sbufp; 1804887Schin } 1814887Schin else 1824887Schin outp += CWIDTH+1; 1834887Schin sfprintf(fdout,"%4d ",count+1); 1844887Schin } 1854887Schin } 1864887Schin if(sfwrite(fdout,outp,outsize) != outsize) 1874887Schin return(1); 1884887Schin } 1894887Schin } 1904887Schin } 1914887Schin if(n==0) 1924887Schin break; 1934887Schin if(count = next) 1944887Schin { 1954887Schin if(sfwrite(fdout,outp,outsize) != outsize) 1964887Schin return(1); 1974887Schin if(*all >= 0) 1984887Schin *all = 1; 1994887Schin sep = 0; 2004887Schin } 2014887Schin else 2024887Schin sep = all && *all > 0; 2034887Schin /* save current record */ 2044887Schin if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0) 2054887Schin return(1); 2064887Schin outp = outbuff; 2074887Schin if(outsize < n+cwidth+sep) 2084887Schin { 2094887Schin /* no room in outp, clear lock and use side buffer */ 2104887Schin sfwrite(fdout,outp,0); 2114887Schin if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep))) 2124887Schin return(1); 2134887Schin } 2144887Schin else 2154887Schin outsize = n+cwidth+sep; 2164887Schin memcpy(outp+cwidth+sep,bufp,n); 2174887Schin if(sep) 2184887Schin outp[cwidth] = '\n'; 2194887Schin oreclen = reclen; 2204887Schin orecp = outp+cwidth+sep + (cp-bufp); 2214887Schin } 2224887Schin return(0); 2234887Schin } 2244887Schin 2254887Schin int 2264887Schin b_uniq(int argc, char** argv, void* context) 2274887Schin { 2284887Schin register int n, mode=0; 2294887Schin register char *cp; 2304887Schin int fields=0, chars=0, width=-1; 2314887Schin Sfio_t *fpin, *fpout; 2324887Schin int* all = 0; 2334887Schin int sep; 2344887Schin Compare_f compare = (Compare_f)memcmp; 2354887Schin 2364887Schin cmdinit(argc, argv, context, ERROR_CATALOG, 0); 2374887Schin while (n = optget(argv, usage)) switch (n) 2384887Schin { 2394887Schin case 'c': 2404887Schin mode |= C_FLAG; 2414887Schin break; 2424887Schin case 'd': 2434887Schin mode |= D_FLAG; 2444887Schin break; 2454887Schin case 'D': 2464887Schin mode |= D_FLAG; 2474887Schin switch ((int)opt_info.num) 2484887Schin { 2494887Schin case 'p': 2504887Schin sep = 1; 2514887Schin break; 2524887Schin case 's': 2534887Schin sep = 0; 2544887Schin break; 2554887Schin default: 2564887Schin sep = -1; 2574887Schin break; 2584887Schin } 2594887Schin all = &sep; 2604887Schin break; 2614887Schin case 'i': 2624887Schin compare = (Compare_f)strncasecmp; 2634887Schin break; 2644887Schin case 'u': 2654887Schin mode |= U_FLAG; 2664887Schin break; 2674887Schin case 'f': 2684887Schin if(*opt_info.option=='-') 2694887Schin fields = opt_info.num; 2704887Schin else 2714887Schin chars = opt_info.num; 2724887Schin break; 2734887Schin case 's': 2744887Schin chars = opt_info.num; 2754887Schin break; 2764887Schin case 'w': 2774887Schin width = opt_info.num; 2784887Schin break; 2794887Schin case ':': 2804887Schin error(2, "%s", opt_info.arg); 2814887Schin break; 2824887Schin case '?': 2834887Schin error(ERROR_usage(2), "%s", opt_info.arg); 2844887Schin break; 2854887Schin } 2864887Schin argv += opt_info.index; 2874887Schin if(all && (mode&C_FLAG)) 2884887Schin error(2, "-c and -D are mutually exclusive"); 2894887Schin if(error_info.errors) 2904887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 2914887Schin if((cp = *argv) && (argv++,!streq(cp,"-"))) 2924887Schin { 2934887Schin if(!(fpin = sfopen(NiL,cp,"r"))) 2944887Schin error(ERROR_system(1),"%s: cannot open",cp); 2954887Schin } 2964887Schin else 2974887Schin fpin = sfstdin; 2984887Schin if(cp = *argv) 2994887Schin { 3004887Schin argv++; 3014887Schin if(!(fpout = sfopen(NiL,cp,"w"))) 3024887Schin error(ERROR_system(1),"%s: cannot create",cp); 3034887Schin } 3044887Schin else 3054887Schin fpout = sfstdout; 3064887Schin if(*argv) 3074887Schin { 3084887Schin error(2, "too many arguments"); 3094887Schin error(ERROR_usage(2), "%s", optusage(NiL)); 3104887Schin } 3114887Schin error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare); 3124887Schin if(fpin!=sfstdin) 3134887Schin sfclose(fpin); 3144887Schin if(fpout!=sfstdout) 3154887Schin sfclose(fpout); 3164887Schin return(error_info.errors); 3174887Schin } 3184887Schin 319