1 /* $NetBSD: wc.c,v 1.29 2003/08/07 11:17:15 agc Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1987, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #ifndef lint 34 __COPYRIGHT("@(#) Copyright (c) 1980, 1987, 1991, 1993\n\ 35 The Regents of the University of California. All rights reserved.\n"); 36 #endif /* not lint */ 37 38 #ifndef lint 39 #if 0 40 static char sccsid[] = "@(#)wc.c 8.2 (Berkeley) 5/2/95"; 41 #else 42 __RCSID("$NetBSD: wc.c,v 1.29 2003/08/07 11:17:15 agc Exp $"); 43 #endif 44 #endif /* not lint */ 45 46 /* wc line, word and char count */ 47 48 #include <sys/param.h> 49 #include <sys/file.h> 50 #include <sys/stat.h> 51 52 #include <ctype.h> 53 #include <fcntl.h> 54 #include <err.h> 55 #include <errno.h> 56 #include <locale.h> 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <unistd.h> 61 #include <wchar.h> 62 #include <wctype.h> 63 64 #ifdef NO_QUAD 65 typedef u_long wc_count_t; 66 # define WCFMT " %7lu" 67 # define WCCAST unsigned long 68 #else 69 typedef u_quad_t wc_count_t; 70 # define WCFMT " %7llu" 71 # define WCCAST unsigned long long 72 #endif 73 74 static wc_count_t tlinect, twordct, tcharct; 75 static int doline, doword, dobyte, dochar; 76 static int rval = 0; 77 78 static void cnt __P((char *)); 79 static void print_counts __P((wc_count_t, wc_count_t, wc_count_t, char *)); 80 static void usage __P((void)); 81 static size_t do_mb __P((wchar_t *, const char *, size_t, mbstate_t *, 82 size_t *, const char *)); 83 int main __P((int, char *[])); 84 85 int 86 main(argc, argv) 87 int argc; 88 char *argv[]; 89 { 90 int ch; 91 92 setlocale(LC_ALL, ""); 93 94 while ((ch = getopt(argc, argv, "lwcm")) != -1) 95 switch (ch) { 96 case 'l': 97 doline = 1; 98 break; 99 case 'w': 100 doword = 1; 101 break; 102 case 'm': 103 dochar = 1; 104 dobyte = 0; 105 break; 106 case 'c': 107 dochar = 0; 108 dobyte = 1; 109 break; 110 case '?': 111 default: 112 usage(); 113 } 114 argv += optind; 115 argc -= optind; 116 117 /* Wc's flags are on by default. */ 118 if (doline + doword + dobyte + dochar == 0) 119 doline = doword = dobyte = 1; 120 121 if (!*argv) { 122 cnt(NULL); 123 } else { 124 int dototal = (argc > 1); 125 126 do { 127 cnt(*argv); 128 } while(*++argv); 129 130 if (dototal) 131 print_counts(tlinect, twordct, tcharct, "total"); 132 } 133 134 exit(rval); 135 } 136 137 static size_t 138 do_mb(wc, p, mblen, st, cnt, file) 139 wchar_t *wc; 140 const char *p; 141 size_t mblen; 142 mbstate_t *st; 143 size_t *cnt; 144 const char *file; 145 { 146 size_t r; 147 size_t c = 0; 148 149 do { 150 r = mbrtowc(wc, p, mblen, st); 151 if (r == (size_t)-1) { 152 warnx("%s: invalid byte sequence", file); 153 rval = 1; 154 155 /* XXX skip 1 byte */ 156 mblen--; 157 p++; 158 memset(st, 0, sizeof(*st)); 159 continue; 160 } else if (r == (size_t)-2) 161 break; 162 else if (r == 0) 163 r = 1; 164 c++; 165 if (wc) 166 wc++; 167 mblen -= r; 168 p += r; 169 } while (mblen > 0); 170 171 *cnt = c; 172 173 return (r); 174 } 175 176 static void 177 cnt(file) 178 char *file; 179 { 180 u_char buf[MAXBSIZE]; 181 wchar_t wbuf[MAXBSIZE]; 182 struct stat sb; 183 wc_count_t charct, linect, wordct; 184 mbstate_t st; 185 u_char *C; 186 wchar_t *WC; 187 char *name; /* filename or <stdin> */ 188 size_t r = 0; 189 int fd, gotsp, len = 0; 190 191 linect = wordct = charct = 0; 192 if (file) { 193 if ((fd = open(file, O_RDONLY, 0)) < 0) { 194 warn("%s", file); 195 rval = 1; 196 return; 197 } 198 name = file; 199 } else { 200 fd = STDIN_FILENO; 201 name = "<stdin>"; 202 } 203 204 if (dochar || doword) 205 memset(&st, 0, sizeof(st)); 206 207 if (!doword) { 208 /* 209 * line counting is split out because it's a lot 210 * faster to get lines than to get words, since 211 * the word count requires some logic. 212 */ 213 if (doline || dochar) { 214 while ((len = read(fd, buf, MAXBSIZE)) > 0) { 215 if (dochar) { 216 size_t wlen; 217 218 r = do_mb(0, (char *)buf, (size_t)len, 219 &st, &wlen, name); 220 charct += wlen; 221 } else if (dobyte) 222 charct += len; 223 if (doline) 224 for (C = buf; len--; ++C) 225 if (*C == '\n') 226 ++linect; 227 } 228 } 229 230 /* 231 * if all we need is the number of characters and 232 * it's a directory or a regular or linked file, just 233 * stat the puppy. We avoid testing for it not being 234 * a special device in case someone adds a new type 235 * of inode. 236 */ 237 else if (dobyte) { 238 if (fstat(fd, &sb)) { 239 warn("%s", name); 240 rval = 1; 241 } else { 242 if (S_ISREG(sb.st_mode) || 243 S_ISLNK(sb.st_mode) || 244 S_ISDIR(sb.st_mode)) { 245 charct = sb.st_size; 246 } else { 247 while ((len = 248 read(fd, buf, MAXBSIZE)) > 0) 249 charct += len; 250 } 251 } 252 } 253 } else { 254 /* do it the hard way... */ 255 gotsp = 1; 256 while ((len = read(fd, buf, MAXBSIZE)) > 0) { 257 size_t wlen; 258 259 r = do_mb(wbuf, (char *)buf, (size_t)len, &st, &wlen, 260 name); 261 if (dochar) { 262 charct += wlen; 263 } else if (dobyte) 264 charct += len; 265 for (WC = wbuf; wlen--; ++WC) { 266 if (iswspace(*WC)) { 267 gotsp = 1; 268 if (*WC == L'\n') { 269 ++linect; 270 } 271 } else { 272 /* 273 * This line implements the POSIX 274 * spec, i.e. a word is a "maximal 275 * string of characters delimited by 276 * whitespace." Notice nothing was 277 * said about a character being 278 * printing or non-printing. 279 */ 280 if (gotsp) { 281 gotsp = 0; 282 ++wordct; 283 } 284 } 285 } 286 } 287 } 288 289 if (len == -1) { 290 warn("%s", name); 291 rval = 1; 292 } 293 if (dochar && r == (size_t)-2) { 294 warnx("%s: incomplete multibyte character", name); 295 rval = 1; 296 } 297 298 print_counts(linect, wordct, charct, file); 299 300 /* 301 * don't bother checkint doline, doword, or dobyte --- speeds 302 * up the common case 303 */ 304 tlinect += linect; 305 twordct += wordct; 306 tcharct += charct; 307 308 if (close(fd)) { 309 warn("%s", name); 310 rval = 1; 311 } 312 } 313 314 static void 315 print_counts(lines, words, chars, name) 316 wc_count_t lines; 317 wc_count_t words; 318 wc_count_t chars; 319 char *name; 320 { 321 322 if (doline) 323 printf(WCFMT, (WCCAST)lines); 324 if (doword) 325 printf(WCFMT, (WCCAST)words); 326 if (dobyte || dochar) 327 printf(WCFMT, (WCCAST)chars); 328 329 if (name) 330 printf(" %s\n", name); 331 else 332 printf("\n"); 333 } 334 335 static void 336 usage() 337 { 338 339 (void)fprintf(stderr, "usage: wc [-c | -m] [-lw] [file ...]\n"); 340 exit(1); 341 } 342