1*4887Schin /***********************************************************************
2*4887Schin *                                                                      *
3*4887Schin *               This software is part of the ast package               *
4*4887Schin *           Copyright (c) 1992-2007 AT&T Knowledge Ventures            *
5*4887Schin *                      and is licensed under the                       *
6*4887Schin *                  Common Public License, Version 1.0                  *
7*4887Schin *                      by AT&T Knowledge Ventures                      *
8*4887Schin *                                                                      *
9*4887Schin *                A copy of the License is available at                 *
10*4887Schin *            http://www.opensource.org/licenses/cpl1.0.txt             *
11*4887Schin *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*4887Schin *                                                                      *
13*4887Schin *              Information and Software Systems Research               *
14*4887Schin *                            AT&T Research                             *
15*4887Schin *                           Florham Park NJ                            *
16*4887Schin *                                                                      *
17*4887Schin *                 Glenn Fowler <gsf@research.att.com>                  *
18*4887Schin *                  David Korn <dgk@research.att.com>                   *
19*4887Schin *                                                                      *
20*4887Schin ***********************************************************************/
21*4887Schin #pragma prototyped
22*4887Schin /*
23*4887Schin  * David Korn
24*4887Schin  * AT&T Bell Laboratories
25*4887Schin  *
26*4887Schin  * library interface for word count
27*4887Schin  */
28*4887Schin 
29*4887Schin #include <cmd.h>
30*4887Schin #include <wc.h>
31*4887Schin #include <ctype.h>
32*4887Schin 
33*4887Schin #if _hdr_wchar && _hdr_wctype
34*4887Schin 
35*4887Schin #include <wchar.h>
36*4887Schin #include <wctype.h>
37*4887Schin 
38*4887Schin #else
39*4887Schin 
40*4887Schin #ifndef iswspace
41*4887Schin #define iswspace(x)	isspace(x)
42*4887Schin #endif
43*4887Schin 
44*4887Schin #endif
45*4887Schin 
/*
 * endline(c): true when the character-class value <c> marks a new-line.
 * New-line is stored as -1 in the classification table; when plain
 * signed char cannot hold a negative value the test falls back to a
 * comparison against (char)-1.
 */
#define endline(c)	((((signed char)-1) < 0) ? ((c) < 0) : ((c) == ((char)-1)))

/*
 * mbok(p,n): true when the <n> bytes at <p> can be consumed as one
 * character.  Always false for n < 1; in a multibyte locale the bytes
 * must also convert successfully, otherwise any non-empty span is ok.
 */
#define mbok(p,n)	(((n) >= 1) ? (mbwide() ? ((*ast.mb_towc)(NiL, (char*)(p), (n)) >= 0) : 1) : 0)
48*4887Schin 
49*4887Schin Wc_t *wc_init(int mode)
50*4887Schin {
51*4887Schin 	register int	n;
52*4887Schin 	register int	w;
53*4887Schin 	Wc_t*		wp;
54*4887Schin 
55*4887Schin 	if(!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
56*4887Schin 		return(0);
57*4887Schin 	wp->mode = mode;
58*4887Schin 	w = mode & WC_WORDS;
59*4887Schin 	for(n=(1<<CHAR_BIT);--n >=0;)
60*4887Schin 		wp->space[n] = w ? !!isspace(n) : 0;
61*4887Schin 	wp->space['\n'] = -1;
62*4887Schin 	return(wp);
63*4887Schin }
64*4887Schin 
65*4887Schin /*
66*4887Schin  * compute the line, word, and character count for file <fd>
67*4887Schin  */
68*4887Schin int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
69*4887Schin {
70*4887Schin 	register signed char	*space = wp->space;
71*4887Schin 	register unsigned char	*cp;
72*4887Schin 	register Sfoff_t	nchars;
73*4887Schin 	register Sfoff_t	nwords;
74*4887Schin 	register Sfoff_t	nlines;
75*4887Schin 	register Sfoff_t	eline;
76*4887Schin 	register Sfoff_t	longest;
77*4887Schin 	register ssize_t	c;
78*4887Schin 	register unsigned char	*endbuff;
79*4887Schin 	register int		lasttype = 1;
80*4887Schin 	unsigned int		lastchar;
81*4887Schin 	unsigned char		*buff;
82*4887Schin 	wchar_t			x;
83*4887Schin 
84*4887Schin 	sfset(fd,SF_WRITE,1);
85*4887Schin 	nlines = nwords = nchars = 0;
86*4887Schin 	wp->longest = 0;
87*4887Schin 	if (wp->mode & (WC_LONGEST|WC_MBYTE))
88*4887Schin 	{
89*4887Schin 		longest = 0;
90*4887Schin 		eline = -1;
91*4887Schin 		cp = buff = endbuff = 0;
92*4887Schin 		for (;;)
93*4887Schin 		{
94*4887Schin 			if (!mbok(cp, endbuff-cp))
95*4887Schin 			{
96*4887Schin 				if (buff)
97*4887Schin 					sfread(fd, buff, cp-buff);
98*4887Schin 				if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, SF_LOCKR)))
99*4887Schin 					break;
100*4887Schin 				endbuff = (cp = buff) + sfvalue(fd);
101*4887Schin 			}
102*4887Schin 			nchars++;
103*4887Schin 			x = mbchar(cp);
104*4887Schin 			if (x == -1)
105*4887Schin 			{
106*4887Schin 				if (eline != nlines && !(wp->mode & WC_QUIET))
107*4887Schin 				{
108*4887Schin 					error_info.file = (char*)file;
109*4887Schin 					error_info.line = eline = nlines;
110*4887Schin 					error(ERROR_SYSTEM|1, "invalid multibyte character");
111*4887Schin 					error_info.file = 0;
112*4887Schin 					error_info.line = 0;
113*4887Schin 				}
114*4887Schin 			}
115*4887Schin 			else if (x == '\n')
116*4887Schin 			{
117*4887Schin 				if ((nchars - longest) > wp->longest)
118*4887Schin 					wp->longest = nchars - longest;
119*4887Schin 				longest = nchars;
120*4887Schin 				nlines++;
121*4887Schin 				lasttype = 1;
122*4887Schin 			}
123*4887Schin 			else if (iswspace(x))
124*4887Schin 				lasttype = 1;
125*4887Schin 			else if (lasttype)
126*4887Schin 			{
127*4887Schin 				lasttype = 0;
128*4887Schin 				nwords++;
129*4887Schin 			}
130*4887Schin 		}
131*4887Schin 	}
132*4887Schin 	else
133*4887Schin 	{
134*4887Schin 		for (;;)
135*4887Schin 		{
136*4887Schin 			/* fill next buffer and check for end-of-file */
137*4887Schin 			if (!(buff = (unsigned char*)sfreserve(fd, 0, 0)) || (c = sfvalue(fd)) <= 0)
138*4887Schin 				break;
139*4887Schin 			sfread(fd,(char*)(cp=buff),c);
140*4887Schin 			nchars += c;
141*4887Schin 			/* check to see whether first character terminates word */
142*4887Schin 			if(c==1)
143*4887Schin 			{
144*4887Schin 				if(endline(lasttype))
145*4887Schin 					nlines++;
146*4887Schin 				if((c = space[*cp]) && !lasttype)
147*4887Schin 					nwords++;
148*4887Schin 				lasttype = c;
149*4887Schin 				continue;
150*4887Schin 			}
151*4887Schin 			if(!lasttype && space[*cp])
152*4887Schin 				nwords++;
153*4887Schin 			lastchar = cp[--c];
154*4887Schin 			cp[c] = '\n';
155*4887Schin 			endbuff = cp+c;
156*4887Schin 			c = lasttype;
157*4887Schin 			/* process each buffer */
158*4887Schin 			for (;;)
159*4887Schin 			{
160*4887Schin 				/* process spaces and new-lines */
161*4887Schin 				do if (endline(c))
162*4887Schin 				{
163*4887Schin 					for (;;)
164*4887Schin 					{
165*4887Schin 						/* check for end of buffer */
166*4887Schin 						if (cp > endbuff)
167*4887Schin 							goto eob;
168*4887Schin 						nlines++;
169*4887Schin 						if (*cp != '\n')
170*4887Schin 							break;
171*4887Schin 						cp++;
172*4887Schin 					}
173*4887Schin 				} while (c = space[*cp++]);
174*4887Schin 				/* skip over word characters */
175*4887Schin 				while(!(c = space[*cp++]));
176*4887Schin 				nwords++;
177*4887Schin 			}
178*4887Schin 		eob:
179*4887Schin 			if((cp -= 2) >= buff)
180*4887Schin 				c = space[*cp];
181*4887Schin 			else
182*4887Schin 				c  = lasttype;
183*4887Schin 			lasttype = space[lastchar];
184*4887Schin 			/* see if was in word */
185*4887Schin 			if(!c && !lasttype)
186*4887Schin 				nwords--;
187*4887Schin 		}
188*4887Schin 		if(endline(lasttype))
189*4887Schin 			nlines++;
190*4887Schin 		else if(!lasttype)
191*4887Schin 			nwords++;
192*4887Schin 	}
193*4887Schin 	wp->chars = nchars;
194*4887Schin 	wp->words = nwords;
195*4887Schin 	wp->lines = nlines;
196*4887Schin 	return(0);
197*4887Schin }
198