xref: /netbsd-src/usr.bin/wc/wc.c (revision 1ca5c1b28139779176bd5c13ad7c5f25c0bcd5f8)
1 /*	$NetBSD: wc.c,v 1.21 2001/10/19 06:09:56 yamt Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1987, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include <sys/cdefs.h>
37 #ifndef lint
38 __COPYRIGHT("@(#) Copyright (c) 1980, 1987, 1991, 1993\n\
39 	The Regents of the University of California.  All rights reserved.\n");
40 #endif /* not lint */
41 
42 #ifndef lint
43 #if 0
44 static char sccsid[] = "@(#)wc.c	8.2 (Berkeley) 5/2/95";
45 #else
46 __RCSID("$NetBSD: wc.c,v 1.21 2001/10/19 06:09:56 yamt Exp $");
47 #endif
48 #endif /* not lint */
49 
50 /* wc line, word and char count */
51 
52 #include <sys/param.h>
53 #include <sys/stat.h>
54 #include <sys/types.h>
55 
56 #include <fcntl.h>
57 #include <unistd.h>
58 #include <errno.h>
59 #include <stdio.h>
60 
61 #include <stdlib.h>
62 #include <string.h>
63 #include <locale.h>
64 #include <ctype.h>
65 #include <errno.h>
66 #include <sys/param.h>
67 #include <sys/stat.h>
68 #include <sys/file.h>
69 #include <unistd.h>
70 #include <err.h>
71 #include <wchar.h>
72 #include <wctype.h>
73 
74 #ifdef NO_QUAD
75 typedef u_long wc_count_t;
76 # define WCFMT	" %7lu"
77 # define WCCAST unsigned long
78 #else
79 typedef u_quad_t wc_count_t;
80 # define WCFMT	" %7llu"
81 # define WCCAST	unsigned long long
82 #endif
83 
84 static wc_count_t	tlinect, twordct, tcharct;
85 static int		doline, doword, dobyte, dochar;
86 static int 		rval = 0;
87 
88 static void	cnt __P((char *));
89 static void	print_counts __P((wc_count_t, wc_count_t, wc_count_t, char *));
90 static void	usage __P((void));
91 static size_t do_mb __P((wchar_t *, const char *, size_t, mbstate_t *,
92 		size_t *, const char *));
93 int	main __P((int, char *[]));
94 
95 int
96 main(argc, argv)
97 	int argc;
98 	char *argv[];
99 {
100 	int ch;
101 
102 	setlocale(LC_ALL, "");
103 
104 	while ((ch = getopt(argc, argv, "lwcm")) != -1)
105 		switch((char)ch) {
106 		case 'l':
107 			doline = 1;
108 			break;
109 		case 'w':
110 			doword = 1;
111 			break;
112 		case 'm':
113 			dochar = 1;
114 			dobyte = 0;
115 			break;
116 		case 'c':
117 			dochar = 0;
118 			dobyte = 1;
119 			break;
120 		case '?':
121 		default:
122 			usage();
123 		}
124 	argv += optind;
125 	argc -= optind;
126 
127 	/* Wc's flags are on by default. */
128 	if (doline + doword + dobyte + dochar == 0)
129 		doline = doword = dobyte = 1;
130 
131 	if (!*argv) {
132 		cnt(NULL);
133 	} else {
134 		int dototal = (argc > 1);
135 
136 		do {
137 			cnt(*argv);
138 		} while(*++argv);
139 
140 		if (dototal)
141 			print_counts(tlinect, twordct, tcharct, "total");
142 	}
143 
144 	exit(rval);
145 }
146 
147 static size_t
148 do_mb(wc, p, mblen, st, cnt, file)
149 	wchar_t *wc;
150 	const char *p;
151 	size_t mblen;
152 	mbstate_t *st;
153 	size_t *cnt;
154 	const char *file;
155 {
156 	size_t r;
157 	size_t c = 0;
158 
159 	do {
160 		r = mbrtowc(wc, p, mblen, st);
161 		if (r == (size_t)-1) {
162 			warnx("%s: invalid byte sequence", file);
163 			rval = 1;
164 
165 			/* XXX skip 1 byte */
166 			mblen --;
167 			p ++;
168 			memset(st, 0, sizeof(*st));
169 		}
170 		else if (r == (size_t)-2)
171 			break;
172 		else if (r == 0)
173 			r = 1;
174 		c ++;
175 		if (wc)
176 			wc ++;
177 		mblen -= r;
178 		p += r;
179 	} while (mblen > 0);
180 
181 	*cnt = c;
182 
183 	return r;
184 }
185 
186 static void
187 cnt(file)
188 	char *file;
189 {
190 	u_char *C;
191 	wchar_t *WC;
192 	short gotsp;
193 	int len = 0;
194 	wc_count_t linect, wordct, charct;
195 	struct stat sb;
196 	int fd;
197 	u_char buf[MAXBSIZE];
198 	wchar_t wbuf[MAXBSIZE];
199 	size_t r = 0;
200 	mbstate_t st;
201 
202 	linect = wordct = charct = 0;
203 	if (file) {
204 		if ((fd = open(file, O_RDONLY, 0)) < 0) {
205 			warn("%s", file);
206 			rval = 1;
207 			return;
208 		}
209 	} else  {
210 		fd = STDIN_FILENO;
211 	}
212 
213 	if (dochar || doword)
214 		memset(&st, 0, sizeof(st));
215 
216 	if (!doword) {
217 		/*
218 		 * line counting is split out because it's a lot
219 		 * faster to get lines than to get words, since
220 		 * the word count requires some logic.
221 		 */
222 		if (doline || dochar) {
223 			while ((len = read(fd, buf, MAXBSIZE)) > 0) {
224 				if (dochar) {
225 					size_t wlen;
226 
227 					r = do_mb(0, (char *)buf, (size_t)len, &st, &wlen, file);
228 					charct += wlen;
229 				}
230 				else if (dobyte)
231 					charct += len;
232 				if (doline)
233 					for (C = buf; len--; ++C)
234 						if (*C == '\n')
235 							++linect;
236 			}
237 		}
238 
239 		/*
240 		 * if all we need is the number of characters and
241 		 * it's a directory or a regular or linked file, just
242 		 * stat the puppy.  We avoid testing for it not being
243 		 * a special device in case someone adds a new type
244 		 * of inode.
245 		 */
246 		else if (dobyte) {
247 			if (fstat(fd, &sb)) {
248 				warn("%s", file);
249 				rval = 1;
250 			} else {
251 				if (S_ISREG(sb.st_mode) ||
252 				    S_ISLNK(sb.st_mode) ||
253 				    S_ISDIR(sb.st_mode)) {
254 					charct = sb.st_size;
255 				} else {
256 					while ((len = read(fd, buf, MAXBSIZE)) > 0)
257 						charct += len;
258 				}
259 			}
260 		}
261 	}
262 	else
263 	{
264 		/* do it the hard way... */
265 		gotsp = 1;
266 		while ((len = read(fd, buf, MAXBSIZE)) > 0) {
267 			size_t wlen;
268 
269 			r = do_mb(wbuf, (char *)buf, (size_t)len, &st, &wlen, file);
270 			if (dochar) {
271 				charct += wlen;
272 			}
273 			else if (dobyte)
274 				charct += len;
275 			for (WC = wbuf; wlen--; ++WC) {
276 				if (iswspace(*WC)) {
277 					gotsp = 1;
278 					if (*WC == L'\n') {
279 						++linect;
280 					}
281 				} else {
282 					/*
283 					 * This line implements the POSIX
284 					 * spec, i.e. a word is a "maximal
285 					 * string of characters delimited by
286 					 * whitespace."  Notice nothing was
287 					 * said about a character being
288 					 * printing or non-printing.
289 					 */
290 					if (gotsp) {
291 						gotsp = 0;
292 						++wordct;
293 					}
294 				}
295 			}
296 		}
297 	}
298 
299 	if (len == -1) {
300 		warn ("%s", file);
301 		rval = 1;
302 	}
303 	if (dochar && r == (size_t)-2) {
304 		warnx ("%s: incomplete multibyte character", file);
305 		rval = 1;
306 	}
307 
308 	print_counts(linect, wordct, charct, file ? file : 0);
309 
310 	/* don't bother checkint doline, doword, or dobyte --- speeds
311            up the common case */
312 	tlinect += linect;
313 	twordct += wordct;
314 	tcharct += charct;
315 
316 	if (close(fd)) {
317 		warn ("%s", file);
318 		rval = 1;
319 	}
320 }
321 
322 static void
323 print_counts(lines, words, chars, name)
324 	wc_count_t lines;
325 	wc_count_t words;
326 	wc_count_t chars;
327 	char *name;
328 {
329 
330 	if (doline)
331 		printf(WCFMT, (WCCAST)lines);
332 	if (doword)
333 		printf(WCFMT, (WCCAST)words);
334 	if (dobyte || dochar)
335 		printf(WCFMT, (WCCAST)chars);
336 
337 	if (name)
338 		printf(" %s\n", name);
339 	else
340 		printf("\n");
341 }
342 
/*
 * usage --
 *	Print the command synopsis on standard error and exit with status 1.
 *	The synopsis lists every option main() passes to getopt(3); -c and -m
 *	are shown as alternatives because each one cancels the other.
 */
static void
usage()
{
	(void)fprintf(stderr, "usage: wc [-c | -m] [-lw] [file ...]\n");
	exit(1);
}
349