xref: /netbsd-src/usr.bin/split/split.c (revision 100a3398b8d3c64e571cff36b46c23431b410e09)
1 /*	$NetBSD: split.c,v 1.33 2024/02/09 22:08:38 andvar Exp $	*/
2 
3 /*
4  * Copyright (c) 1987, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #ifndef lint
34 __COPYRIGHT("@(#) Copyright (c) 1987, 1993, 1994\
35  The Regents of the University of California.  All rights reserved.");
36 #endif /* not lint */
37 
38 #ifndef lint
39 #if 0
40 static char sccsid[] = "@(#)split.c	8.3 (Berkeley) 4/25/94";
41 #endif
42 __RCSID("$NetBSD: split.c,v 1.33 2024/02/09 22:08:38 andvar Exp $");
43 #endif /* not lint */
44 
45 #include <sys/param.h>
46 #include <sys/stat.h>
47 
48 #include <ctype.h>
49 #include <err.h>
50 #include <errno.h>
51 #include <fcntl.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <unistd.h>
56 
57 #define DEFLINE	1000		/* Default num lines per file. */
58 
59 static int file_open;		/* If a file is open. */
60 static int ifd = STDIN_FILENO, ofd = -1; /* Input/output file descriptors. */
61 static char *fname;		/* File name prefix. */
62 static size_t sfxlen = 2;	/* Suffix length. */
63 static int autosfx = 1;		/* Whether to auto-extend the suffix length. */
64 
65 static void newfile(void);
66 static void split1(off_t, int) __dead;
67 static void split2(off_t) __dead;
68 static void split3(off_t) __dead;
69 static void usage(void) __dead;
70 static size_t bigwrite(int, void const *, size_t);
71 
72 int
main(int argc,char * argv[])73 main(int argc, char *argv[])
74 {
75 	int ch;
76 	char *ep, *p;
77 	char const *base;
78 	off_t bytecnt = 0;	/* Byte count to split on. */
79 	off_t numlines = 0;	/* Line count to split on. */
80 	off_t chunks = 0;	/* Number of chunks to split into. */
81 
82 	while ((ch = getopt(argc, argv, "0123456789a:b:l:n:")) != -1)
83 		switch (ch) {
84 		case '0': case '1': case '2': case '3': case '4':
85 		case '5': case '6': case '7': case '8': case '9':
86 			/*
87 			 * Undocumented kludge: split was originally designed
88 			 * to take a number after a dash.
89 			 */
90 			if (numlines == 0) {
91 				p = argv[optind - 1];
92 				if (p[0] == '-' && p[1] == ch && !p[2])
93 					p++;
94 				else
95 					p = argv[optind] + 1;
96 				numlines = strtoull(p, &ep, 10);
97 				if (numlines == 0 || *ep != '\0')
98 					errx(EXIT_FAILURE, "%s: illegal line count.", p);
99 			}
100 			break;
101 		case 'a':		/* Suffix length. */
102 			if (!isdigit((unsigned char)optarg[0]) ||
103 			    (sfxlen = (size_t)strtoul(optarg, &ep, 10)) == 0 ||
104 			    *ep != '\0')
105 				errx(EXIT_FAILURE, "%s: illegal suffix length.", optarg);
106 			autosfx = 0;
107 			break;
108 		case 'b':		/* Byte count. */
109 			if (!isdigit((unsigned char)optarg[0]) ||
110 			    (bytecnt = strtoull(optarg, &ep, 10)) == 0 ||
111 			    (*ep != '\0' && *ep != 'k' && *ep != 'm'))
112 				errx(EXIT_FAILURE, "%s: illegal byte count.", optarg);
113 			if (*ep == 'k')
114 				bytecnt *= 1024;
115 			else if (*ep == 'm')
116 				bytecnt *= 1024 * 1024;
117 			break;
118 		case 'l':		/* Line count. */
119 			if (numlines != 0)
120 				usage();
121 			if (!isdigit((unsigned char)optarg[0]) ||
122 			    (numlines = strtoull(optarg, &ep, 10)) == 0 ||
123 			    *ep != '\0')
124 				errx(EXIT_FAILURE, "%s: illegal line count.", optarg);
125 			break;
126 		case 'n':		/* Chunks. */
127 			if (!isdigit((unsigned char)optarg[0]) ||
128 			    (chunks = (size_t)strtoul(optarg, &ep, 10)) == 0 ||
129 			    *ep != '\0')
130 				errx(EXIT_FAILURE, "%s: illegal number of chunks.", optarg);
131 			break;
132 		default:
133 			usage();
134 		}
135 	argv += optind;
136 	argc -= optind;
137 
138 	if (*argv != NULL) {
139 		if (strcmp(*argv, "-") != 0 &&
140 		    (ifd = open(*argv, O_RDONLY, 0)) < 0)
141 			err(EXIT_FAILURE, "%s", *argv);
142 		++argv;
143 	}
144 
145 
146 	base = (*argv != NULL) ? *argv++ : "x";
147 	if ((fname = malloc(strlen(base) + sfxlen + 1)) == NULL)
148 		err(EXIT_FAILURE, NULL);
149 	(void)strcpy(fname, base);		/* File name prefix. */
150 
151 	if (*argv != NULL)
152 		usage();
153 
154 	if (numlines == 0)
155 		numlines = DEFLINE;
156 	else if (bytecnt || chunks)
157 		usage();
158 
159 	if (bytecnt && chunks)
160 		usage();
161 
162 	if (bytecnt)
163 		split1(bytecnt, 0);
164 	else if (chunks)
165 		split3(chunks);
166 	else
167 		split2(numlines);
168 
169 	return 0;
170 }
171 
172 /*
173  * split1 --
174  *	Split the input by bytes.
175  */
176 static void
split1(off_t bytecnt,int maxcnt)177 split1(off_t bytecnt, int maxcnt)
178 {
179 	off_t bcnt;
180 	ssize_t dist, len;
181 	char *C;
182 	char bfr[MAXBSIZE];
183 	int nfiles;
184 
185 	nfiles = 0;
186 
187 	for (bcnt = 0;;)
188 		switch (len = read(ifd, bfr, MAXBSIZE)) {
189 		case 0:
190 			exit(EXIT_SUCCESS);
191 			/* NOTREACHED */
192 		case -1:
193 			err(EXIT_FAILURE, "read");
194 			/* NOTREACHED */
195 		default:
196 			if (!file_open) {
197 				if (!maxcnt || (nfiles < maxcnt)) {
198 					newfile();
199 					nfiles++;
200 					file_open = 1;
201 				}
202 			}
203 			if (bcnt + len >= bytecnt) {
204 				/* LINTED: bytecnt - bcnt <= len */
205 				dist = bytecnt - bcnt;
206 				if (bigwrite(ofd, bfr, dist) != (size_t)dist)
207 					err(EXIT_FAILURE, "write");
208 				len -= dist;
209 				for (C = bfr + dist; len >= bytecnt;
210 				    /* LINTED: bytecnt <= len */
211 				    len -= bytecnt, C += bytecnt) {
212 					if (!maxcnt || (nfiles < maxcnt)) {
213 						newfile();
214 						nfiles++;
215 					}
216 					/* LINTED: as above */
217 					if (bigwrite(ofd,
218 					    C, bytecnt) != (size_t)bytecnt)
219 						err(EXIT_FAILURE, "write");
220 				}
221 				if (len) {
222 					if (!maxcnt || (nfiles < maxcnt)) {
223 						newfile();
224 						nfiles++;
225 					}
226 					/* LINTED: len >= 0 */
227 					if (bigwrite(ofd, C, len) != (size_t)len)
228 						err(EXIT_FAILURE, "write");
229 				} else
230 					file_open = 0;
231 				bcnt = len;
232 			} else {
233 				bcnt += len;
234 				/* LINTED: len >= 0 */
235 				if (bigwrite(ofd, bfr, len) != (size_t)len)
236 					err(EXIT_FAILURE, "write");
237 			}
238 		}
239 }
240 
241 /*
242  * split2 --
243  *	Split the input by lines.
244  */
245 static void
split2(off_t numlines)246 split2(off_t numlines)
247 {
248 	off_t lcnt;
249 	size_t bcnt;
250 	ssize_t len;
251 	char *Ce, *Cs;
252 	char bfr[MAXBSIZE];
253 
254 	for (lcnt = 0;;)
255 		switch (len = read(ifd, bfr, MAXBSIZE)) {
256 		case 0:
257 			exit(EXIT_SUCCESS);
258 			/* NOTREACHED */
259 		case -1:
260 			err(EXIT_FAILURE, "read");
261 			/* NOTREACHED */
262 		default:
263 			if (!file_open) {
264 				newfile();
265 				file_open = 1;
266 			}
267 			for (Cs = Ce = bfr; len--; Ce++)
268 				if (*Ce == '\n' && ++lcnt == numlines) {
269 					bcnt = Ce - Cs + 1;
270 					if (bigwrite(ofd, Cs, bcnt) != (size_t)bcnt)
271 						err(EXIT_FAILURE, "write");
272 					lcnt = 0;
273 					Cs = Ce + 1;
274 					if (len)
275 						newfile();
276 					else
277 						file_open = 0;
278 				}
279 			if (Cs < Ce) {
280 				bcnt = Ce - Cs;
281 				if (bigwrite(ofd, Cs, bcnt) != (size_t)bcnt)
282 					err(EXIT_FAILURE, "write");
283 			}
284 		}
285 }
286 
287 /*
288  * split3 --
289  *	Split the input into specified number of chunks
290  */
291 static void
split3(off_t chunks)292 split3(off_t chunks)
293 {
294 	struct stat sb;
295 
296 	if (fstat(ifd, &sb) == -1) {
297 		err(EXIT_FAILURE, "stat");
298 		/* NOTREACHED */
299 	}
300 
301 	if (chunks > sb.st_size) {
302 		errx(EXIT_FAILURE, "can't split into more than %d files",
303 				(int)sb.st_size);
304 		/* NOTREACHED */
305 	}
306 
307 	split1(sb.st_size/chunks, chunks);
308 }
309 
310 /*
311  * newfile --
312  *	Open a new output file.
313  */
314 static void
newfile(void)315 newfile(void)
316 {
317 	static int fnum;
318 	static char *fpnt;
319 	int quot, i;
320 
321 	if (ofd == -1) {
322 		fpnt = fname + strlen(fname);
323 		fpnt[sfxlen] = '\0';
324 	} else if (close(ofd) != 0)
325 		err(EXIT_FAILURE, "%s", fname);
326 
327 	quot = fnum;
328 
329 	/* If '-a' is not specified, then we automatically expand the
330 	 * suffix length to accommodate splitting all input.  We do this
331 	 * by moving the suffix pointer (fpnt) forward and incrementing
332 	 * sfxlen by one, thereby yielding an additional two characters
333 	 * and allowing all output files to sort such that 'cat *' yields
334 	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
335 	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. */
336 	if (autosfx && (fpnt[0] == 'y') && (strspn(fpnt+1, "z") == strlen(fpnt+1))) {
337 		if ((fname = realloc(fname, strlen(fname) + sfxlen + 2 + 1)) == NULL)
338 			err(EXIT_FAILURE, NULL);
339 			/* NOTREACHED */
340 
341 		fpnt = fname + strlen(fname) - sfxlen;
342 		fpnt[sfxlen + 2] = '\0';
343 
344 		fpnt[0] = 'z';
345 		fpnt[1] = 'a';
346 
347 		/*  Basename | Suffix
348 		 *  before:
349 		 *  x        | yz
350 		 *  after:
351 		 *  xz       | a.. */
352 		fpnt++;
353 		sfxlen++;
354 
355 		/* Reset so we start back at all 'a's in our extended suffix. */
356 		quot = 0;
357 		fnum = 0;
358 	}
359 
360 	for (i = sfxlen - 1; i >= 0; i--) {
361 		fpnt[i] = quot % 26 + 'a';
362 		quot = quot / 26;
363 	}
364 	if (quot > 0)
365 		errx(EXIT_FAILURE, "too many files.");
366 	++fnum;
367 	if ((ofd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, DEFFILEMODE)) < 0)
368 		err(EXIT_FAILURE, "%s", fname);
369 }
370 
/*
 * bigwrite --
 *	Write len bytes from buf to fd, continuing after short writes and
 *	retrying writes that were interrupted by a signal (EINTR) before
 *	any data was transferred.
 *
 *	Returns the number of bytes written.  A value smaller than len
 *	means write(2) failed; errno is left as set by the failing call.
 */
static size_t
bigwrite(int fd, const void *buf, size_t len)
{
	const char *ptr = buf;
	size_t sofar = 0;
	ssize_t w;

	while (len != 0) {
		w = write(fd, ptr, len);
		if (w == -1) {
			/* Interrupted before writing anything: try again. */
			if (errno == EINTR)
				continue;
			return sofar;
		}
		len -= (size_t)w;
		ptr += w;
		sofar += (size_t)w;
	}
	return sofar;
}
387 
388 
/*
 * usage --
 *	Print the command synopsis to standard error and exit with
 *	EXIT_FAILURE.
 */
static void
usage(void)
{
	const char *prog = getprogname();

	(void)fprintf(stderr,
	    "usage: %s [-b byte_count] [-l line_count] [-n chunk_count] "
	    "[-a suffix_length] [file [prefix]]\n",
	    prog);
	exit(EXIT_FAILURE);
}
397