xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 7f136ccb320b406e181bf8283bddef7fd64c8512)
1 /*	$OpenBSD: diffreg.c,v 1.19 2003/06/26 04:47:30 vincent Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		warn("%s", diffh);
236 		done(0);
237 	}
238 	chrtran = (iflag ? cup2low : clow2low);
239 	if ((stb1.st_mode & S_IFMT) == S_IFDIR) {
240 		file1 = splice(file1, file2);
241 		if (stat(file1, &stb1) < 0) {
242 			warn("%s", file1);
243 			done(0);
244 		}
245 	} else if ((stb2.st_mode & S_IFMT) == S_IFDIR) {
246 		file2 = splice(file2, file1);
247 		if (stat(file2, &stb2) < 0) {
248 			warn("%s", file2);
249 			done(0);
250 		}
251 	} else if ((stb1.st_mode & S_IFMT) != S_IFREG || !strcmp(file1, "-")) {
252 		if (!strcmp(file2, "-")) {
253 			warnx("can't specify - -");
254 			done(0);
255 		}
256 		file1 = copytemp();
257 		if (stat(file1, &stb1) < 0) {
258 			warn("%s", file1);
259 			done(0);
260 		}
261 	} else if ((stb2.st_mode & S_IFMT) != S_IFREG || !strcmp(file2, "-")) {
262 		file2 = copytemp();
263 		if (stat(file2, &stb2) < 0) {
264 			warn("%s", file2);
265 			done(0);
266 		}
267 	}
268 	if ((f1 = fopen(file1, "r")) == NULL) {
269 		warn("%s", file1);
270 		done(0);
271 	}
272 	if ((f2 = fopen(file2, "r")) == NULL) {
273 		warn("%s", file2);
274 		fclose(f1);
275 		done(0);
276 	}
277 	if (stb1.st_size != stb2.st_size)
278 		goto notsame;
279 	for (;;) {
280 		i = fread(buf1, 1, BUFSIZ, f1);
281 		j = fread(buf2, 1, BUFSIZ, f2);
282 		if (i < 0 || j < 0 || i != j)
283 			goto notsame;
284 		if (i == 0 && j == 0) {
285 			fclose(f1);
286 			fclose(f2);
287 			status = 0;	/* files don't differ */
288 			goto same;
289 		}
290 		for (j = 0; j < i; j++)
291 			if (buf1[j] != buf2[j])
292 				goto notsame;
293 	}
294 notsame:
295 	/*
296 	 * Files certainly differ at this point; set status accordingly
297 	 */
298 	status = 1;
299 	if (!asciifile(f1) || !asciifile(f2)) {
300 		printf("Binary files %s and %s differ\n", file1, file2);
301 		fclose(f1);
302 		fclose(f2);
303 		done(0);
304 	}
305 	prepare(0, f1);
306 	prepare(1, f2);
307 	fclose(f1);
308 	fclose(f2);
309 	prune();
310 	sort(sfile[0], slen[0]);
311 	sort(sfile[1], slen[1]);
312 
313 	member = (int *)file[1];
314 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
315 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
316 
317 	class = (int *)file[0];
318 	unsort(sfile[0], slen[0], class);
319 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
320 
321 	klist = emalloc((slen[0] + 2) * sizeof(int));
322 	clist = emalloc(sizeof(cand));
323 	i = stone(class, slen[0], member, klist);
324 	free(member);
325 	free(class);
326 
327 	J = emalloc((len[0] + 2) * sizeof(int));
328 	unravel(klist[i]);
329 	free(clist);
330 	free(klist);
331 
332 	ixold = emalloc((len[0] + 2) * sizeof(long));
333 	ixnew = emalloc((len[1] + 2) * sizeof(long));
334 	check();
335 	output();
336 	status = anychange;
337 same:
338 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
339 		printf("No differences encountered\n");
340 	done(0);
341 }
342 
343 char *tempfile = _PATH_TMP;
344 
345 char *
346 copytemp(void)
347 {
348 	char buf[BUFSIZ];
349 	int i, f;
350 
351 	signal(SIGHUP, done);
352 	signal(SIGINT, done);
353 	signal(SIGPIPE, done);
354 	signal(SIGTERM, done);
355 	f = mkstemp(tempfile);
356 	if (f < 0) {
357 		warn("%s", tempfile);
358 		done(0);
359 	}
360 	while ((i = read(0, buf, BUFSIZ)) > 0)
361 		if (write(f, buf, i) != i) {
362 			warn("%s", tempfile);
363 			done(0);
364 		}
365 	close(f);
366 	return (tempfile);
367 }
368 
369 char *
370 splice(char *dir, char *file)
371 {
372 	char *tail, *buf;
373 	size_t len;
374 
375 	if (!strcmp(file, "-")) {
376 		warnx("can't specify - with other arg directory");
377 		done(0);
378 	}
379 	tail = strrchr(file, '/');
380 	if (tail == NULL)
381 		tail = file;
382 	else
383 		tail++;
384 	len = strlen(dir) + 1 + strlen(tail) + 1;
385 	buf = emalloc(len);
386 	snprintf(buf, len, "%s/%s", dir, tail);
387 	return (buf);
388 }
389 
390 static void
391 prepare(int i, FILE *fd)
392 {
393 	struct line *p;
394 	int j, h;
395 
396 	fseek(fd, 0L, SEEK_SET);
397 	p = emalloc(3 * sizeof(struct line));
398 	for (j = 0; (h = readhash(fd));) {
399 		p = erealloc(p, (++j + 3) * sizeof(struct line));
400 		p[j].value = h;
401 	}
402 	len[i] = j;
403 	file[i] = p;
404 }
405 
406 static void
407 prune(void)
408 {
409 	int i, j;
410 
411 	for (pref = 0; pref < len[0] && pref < len[1] &&
412 	    file[0][pref + 1].value == file[1][pref + 1].value;
413 	    pref++)
414 		;
415 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
416 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
417 	    suff++)
418 		;
419 	for (j = 0; j < 2; j++) {
420 		sfile[j] = file[j] + pref;
421 		slen[j] = len[j] - pref - suff;
422 		for (i = 0; i <= slen[j]; i++)
423 			sfile[j][i].serial = i;
424 	}
425 }
426 
427 static void
428 equiv(struct line *a, int n, struct line *b, int m, int *c)
429 {
430 	int i, j;
431 
432 	i = j = 1;
433 	while (i <= n && j <= m) {
434 		if (a[i].value < b[j].value)
435 			a[i++].value = 0;
436 		else if (a[i].value == b[j].value)
437 			a[i++].value = j;
438 		else
439 			j++;
440 	}
441 	while (i <= n)
442 		a[i++].value = 0;
443 	b[m + 1].value = 0;
444 	j = 0;
445 	while (++j <= m) {
446 		c[j] = -b[j].serial;
447 		while (b[j + 1].value == b[j].value) {
448 			j++;
449 			c[j] = b[j].serial;
450 		}
451 	}
452 	c[j] = -1;
453 }
454 
455 static int
456 stone(int *a, int n, int *b, int *c)
457 {
458 	int i, k, y, j, l;
459 	int oldc, tc, oldl;
460 
461 	k = 0;
462 	c[0] = newcand(0, 0, 0);
463 	for (i = 1; i <= n; i++) {
464 		j = a[i];
465 		if (j == 0)
466 			continue;
467 		y = -b[j];
468 		oldl = 0;
469 		oldc = c[0];
470 		do {
471 			if (y <= clist[oldc].y)
472 				continue;
473 			l = search(c, k, y);
474 			if (l != oldl + 1)
475 				oldc = c[l - 1];
476 			if (l <= k) {
477 				if (clist[c[l]].y <= y)
478 					continue;
479 				tc = c[l];
480 				c[l] = newcand(i, y, oldc);
481 				oldc = tc;
482 				oldl = l;
483 			} else {
484 				c[l] = newcand(i, y, oldc);
485 				k++;
486 				break;
487 			}
488 		} while ((y = b[++j]) > 0);
489 	}
490 	return (k);
491 }
492 
493 static int
494 newcand(int x, int y, int pred)
495 {
496 	struct cand *q;
497 
498 	clist = erealloc(clist, ++clen * sizeof(cand));
499 	q = clist + clen - 1;
500 	q->x = x;
501 	q->y = y;
502 	q->pred = pred;
503 	return (clen - 1);
504 }
505 
506 static int
507 search(int *c, int k, int y)
508 {
509 	int i, j, l, t;
510 
511 	if (clist[c[k]].y < y)	/* quick look for typical case */
512 		return (k + 1);
513 	i = 0;
514 	j = k + 1;
515 	while (1) {
516 		l = i + j;
517 		if ((l >>= 1) <= i)
518 			break;
519 		t = clist[c[l]].y;
520 		if (t > y)
521 			j = l;
522 		else if (t < y)
523 			i = l;
524 		else
525 			return (l);
526 	}
527 	return (l + 1);
528 }
529 
530 static void
531 unravel(int p)
532 {
533 	struct cand *q;
534 	int i;
535 
536 	for (i = 0; i <= len[0]; i++)
537 		J[i] = i <= pref ? i :
538 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
539 	for (q = clist + p; q->y != 0; q = clist + q->pred)
540 		J[q->x + pref] = q->y + pref;
541 }
542 
543 /*
544  * Check does double duty:
545  *  1.	ferret out any fortuitous correspondences due
546  *	to confounding by hashing (which result in "jackpot")
547  *  2.  collect random access indexes to the two files
548  */
549 static void
550 check(void)
551 {
552 	int i, j, jackpot, c, d;
553 	long ctold, ctnew;
554 
555 	if ((input[0] = fopen(file1, "r")) == NULL) {
556 		perror(file1);
557 		done(0);
558 	}
559 	if ((input[1] = fopen(file2, "r")) == NULL) {
560 		perror(file2);
561 		done(0);
562 	}
563 	j = 1;
564 	ixold[0] = ixnew[0] = 0;
565 	jackpot = 0;
566 	ctold = ctnew = 0;
567 	for (i = 1; i <= len[0]; i++) {
568 		if (J[i] == 0) {
569 			ixold[i] = ctold += skipline(0);
570 			continue;
571 		}
572 		while (j < J[i]) {
573 			ixnew[j] = ctnew += skipline(1);
574 			j++;
575 		}
576 		if (bflag || wflag || iflag) {
577 			for (;;) {
578 				c = getc(input[0]);
579 				d = getc(input[1]);
580 				ctold++;
581 				ctnew++;
582 				if (bflag && isspace(c) && isspace(d)) {
583 					do {
584 						if (c == '\n')
585 							break;
586 						ctold++;
587 					} while (isspace(c = getc(input[0])));
588 					do {
589 						if (d == '\n')
590 							break;
591 						ctnew++;
592 					} while (isspace(d = getc(input[1])));
593 				} else if (wflag) {
594 					while (isspace(c) && c != '\n') {
595 						c = getc(input[0]);
596 						ctold++;
597 					}
598 					while (isspace(d) && d != '\n') {
599 						d = getc(input[1]);
600 						ctnew++;
601 					}
602 				}
603 				if (chrtran[c] != chrtran[d]) {
604 					jackpot++;
605 					J[i] = 0;
606 					if (c != '\n')
607 						ctold += skipline(0);
608 					if (d != '\n')
609 						ctnew += skipline(1);
610 					break;
611 				}
612 				if (c == '\n')
613 					break;
614 			}
615 		} else {
616 			for (;;) {
617 				ctold++;
618 				ctnew++;
619 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
620 					/* jackpot++; */
621 					J[i] = 0;
622 					if (c != '\n')
623 						ctold += skipline(0);
624 					if (d != '\n')
625 						ctnew += skipline(1);
626 					break;
627 				}
628 				if (c == '\n')
629 					break;
630 			}
631 		}
632 		ixold[i] = ctold;
633 		ixnew[j] = ctnew;
634 		j++;
635 	}
636 	for (; j <= len[1]; j++) {
637 		ixnew[j] = ctnew += skipline(1);
638 	}
639 	fclose(input[0]);
640 	fclose(input[1]);
641 	/*
642 	 * if (jackpot)
643 	 *	fprintf(stderr, "jackpot\n");
644 	 */
645 }
646 
647 /* shellsort CACM #201 */
648 static void
649 sort(struct line *a, int n)
650 {
651 	struct line *ai, *aim, w;
652 	int j, m = 0, k;
653 
654 	if (n == 0)
655 		return;
656 	for (j = 1; j <= n; j *= 2)
657 		m = 2 * j - 1;
658 	for (m /= 2; m != 0; m /= 2) {
659 		k = n - m;
660 		for (j = 1; j <= k; j++) {
661 			for (ai = &a[j]; ai > a; ai -= m) {
662 				aim = &ai[m];
663 				if (aim < ai)
664 					break;	/* wraparound */
665 				if (aim->value > ai[0].value ||
666 				    (aim->value == ai[0].value &&
667 					aim->serial > ai[0].serial))
668 					break;
669 				w.value = ai[0].value;
670 				ai[0].value = aim->value;
671 				aim->value = w.value;
672 				w.serial = ai[0].serial;
673 				ai[0].serial = aim->serial;
674 				aim->serial = w.serial;
675 			}
676 		}
677 	}
678 }
679 
680 static void
681 unsort(struct line *f, int l, int *b)
682 {
683 	int *a, i;
684 
685 	a = emalloc((l + 1) * sizeof(int));
686 	for (i = 1; i <= l; i++)
687 		a[f[i].serial] = f[i].value;
688 	for (i = 1; i <= l; i++)
689 		b[i] = a[i];
690 	free(a);
691 }
692 
693 static int
694 skipline(int f)
695 {
696 	int i, c;
697 
698 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
699 		if (c < 0)
700 			return (i);
701 	return (i);
702 }
703 
704 static void
705 output(void)
706 {
707 	int m, i0, i1, j0, j1;
708 
709 	input[0] = fopen(file1, "r");
710 	input[1] = fopen(file2, "r");
711 	m = len[0];
712 	J[0] = 0;
713 	J[m + 1] = len[1] + 1;
714 	if (opt != D_EDIT) {
715 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
716 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
717 				i0++;
718 			j0 = J[i0 - 1] + 1;
719 			i1 = i0 - 1;
720 			while (i1 < m && J[i1 + 1] == 0)
721 				i1++;
722 			j1 = J[i1 + 1] - 1;
723 			J[i1] = j1;
724 			change(i0, i1, j0, j1);
725 		}
726 	} else {
727 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
728 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
729 				i0--;
730 			j0 = J[i0 + 1] - 1;
731 			i1 = i0 + 1;
732 			while (i1 > 1 && J[i1 - 1] == 0)
733 				i1--;
734 			j1 = J[i1 - 1] + 1;
735 			J[i1] = j1;
736 			change(i1, i0, j1, j0);
737 		}
738 	}
739 	if (m == 0)
740 		change(1, 0, 1, len[1]);
741 	if (opt == D_IFDEF) {
742 		for (;;) {
743 #define	c i0
744 			c = getc(input[0]);
745 			if (c < 0)
746 				return;
747 			putchar(c);
748 		}
749 #undef c
750 	}
751 	if (anychange != 0) {
752 		if (opt == D_CONTEXT)
753 			dump_context_vec();
754 		else if (opt == D_UNIFIED)
755 			dump_unified_vec();
756 	}
757 }
758 
759 /*
760  * The following struct is used to record change information when
761  * doing a "context" diff.  (see routine "change" to understand the
762  * highly mneumonic field names)
763  */
764 struct context_vec {
765 	int a;			/* start line in old file */
766 	int b;			/* end line in old file */
767 	int c;			/* start line in new file */
768 	int d;			/* end line in new file */
769 };
770 
771 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
772 
773 #define	MAX_CONTEXT	128
774 
775 /*
776  * indicate that there is a difference between lines a and b of the from file
777  * to get to lines c to d of the to file. If a is greater then b then there
778  * are no lines in the from file involved and this means that there were
779  * lines appended (beginning at b). If c is greater than d then there are
780  * lines missing from the to file.
781  */
782 static void
783 change(int a, int b, int c, int d)
784 {
785 	struct stat stbuf;
786 
787 	if (opt != D_IFDEF && a > b && c > d)
788 		return;
789 	if (anychange == 0) {
790 		anychange = 1;
791 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
792 			stat(file1, &stbuf);
793 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
794 			   file1, ctime(&stbuf.st_mtime));
795 			stat(file2, &stbuf);
796 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
797 			    file2, ctime(&stbuf.st_mtime));
798 			context_vec_start = emalloc(MAX_CONTEXT *
799 			    sizeof(struct context_vec));
800 			context_vec_end = context_vec_start + MAX_CONTEXT;
801 			context_vec_ptr = context_vec_start - 1;
802 		}
803 	}
804 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
805 		/*
806 		 * if this new change is within 'context' lines of
807 		 * the previous change, just add it to the change
808 		 * record.  If the record is full or if this
809 		 * change is more than 'context' lines from the previous
810 		 * change, dump the record, reset it & add the new change.
811 		 */
812 		if (context_vec_ptr >= context_vec_end ||
813 		    (context_vec_ptr >= context_vec_start &&
814 		    a > (context_vec_ptr->b + 2 * context) &&
815 		    c > (context_vec_ptr->d + 2 * context))) {
816 			if (opt == D_CONTEXT)
817 				dump_context_vec();
818 			else
819 				dump_unified_vec();
820 		}
821 		context_vec_ptr++;
822 		context_vec_ptr->a = a;
823 		context_vec_ptr->b = b;
824 		context_vec_ptr->c = c;
825 		context_vec_ptr->d = d;
826 		return;
827 	}
828 	switch (opt) {
829 
830 	case D_NORMAL:
831 	case D_EDIT:
832 		range(a, b, ",");
833 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
834 		if (opt == D_NORMAL)
835 			range(c, d, ",");
836 		putchar('\n');
837 		break;
838 	case D_REVERSE:
839 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
840 		range(a, b, " ");
841 		putchar('\n');
842 		break;
843 	case D_NREVERSE:
844 		if (a > b)
845 			printf("a%d %d\n", b, d - c + 1);
846 		else {
847 			printf("d%d %d\n", a, b - a + 1);
848 			if (!(c > d))
849 				/* add changed lines */
850 				printf("a%d %d\n", b, d - c + 1);
851 		}
852 		break;
853 	}
854 	if (opt == D_NORMAL || opt == D_IFDEF) {
855 		fetch(ixold, a, b, input[0], "< ", 1);
856 		if (a <= b && c <= d && opt == D_NORMAL)
857 			prints("---\n");
858 	}
859 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
860 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
861 		prints(".\n");
862 	if (inifdef) {
863 		fprintf(stdout, "#endif /* %s */\n", endifname);
864 		inifdef = 0;
865 	}
866 }
867 
868 static void
869 range(int a, int b, char *separator)
870 {
871 	printf("%d", a > b ? b : a);
872 	if (a < b)
873 		printf("%s%d", separator, b);
874 }
875 
876 static void
877 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
878 {
879 	int oneflag = (*ifdef1 != '\0') != (*ifdef2 != '\0');
880 	int i, j, c, col, nc;
881 
882 	/*
883 	 * When doing #ifdef's, copy down to current line
884 	 * if this is the first file, so that stuff makes it to output.
885 	 */
886 	if (opt == D_IFDEF && oldfile) {
887 		long curpos = ftell(lb);
888 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
889 		nc = f[a > b ? b : a - 1] - curpos;
890 		for (i = 0; i < nc; i++)
891 			putchar(getc(lb));
892 	}
893 	if (a > b)
894 		return;
895 	if (opt == D_IFDEF) {
896 		if (inifdef)
897 			fprintf(stdout, "#else /* %s%s */\n",
898 			    oneflag && oldfile == 1 ? "!" : "", ifdef2);
899 		else {
900 			if (oneflag) {
901 				/* There was only one ifdef given */
902 				endifname = ifdef2;
903 				if (oldfile)
904 					fprintf(stdout, "#ifndef %s\n", endifname);
905 				else
906 					fprintf(stdout, "#ifdef %s\n", endifname);
907 			} else {
908 				endifname = oldfile ? ifdef1 : ifdef2;
909 				fprintf(stdout, "#ifdef %s\n", endifname);
910 			}
911 		}
912 		inifdef = 1 + oldfile;
913 	}
914 	for (i = a; i <= b; i++) {
915 		fseek(lb, f[i - 1], SEEK_SET);
916 		nc = f[i] - f[i - 1];
917 		if (opt != D_IFDEF)
918 			prints(s);
919 		col = 0;
920 		for (j = 0; j < nc; j++) {
921 			c = getc(lb);
922 			if (c == '\t' && tflag)
923 				do
924 					putchar(' ');
925 				while (++col & 7);
926 			else {
927 				putchar(c);
928 				col++;
929 			}
930 		}
931 	}
932 
933 	if (inifdef && !wantelses) {
934 		fprintf(stdout, "#endif /* %s */\n", endifname);
935 		inifdef = 0;
936 	}
937 }
938 
939 #define POW2			/* define only if HALFLONG is 2**n */
940 #define HALFLONG 16
941 #define low(x)	(x&((1L<<HALFLONG)-1))
942 #define high(x)	(x>>HALFLONG)
943 
944 /*
945  * hashing has the effect of
946  * arranging line in 7-bit bytes and then
947  * summing 1-s complement in 16-bit hunks
948  */
949 static int
950 readhash(FILE *f)
951 {
952 	unsigned int shift;
953 	int t, space;
954 	long sum;
955 
956 	sum = 1;
957 	space = 0;
958 	if (!bflag && !wflag) {
959 		if (iflag)
960 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
961 				if (t == -1)
962 					return (0);
963 				sum += (long)chrtran[t] << (shift
964 #ifdef POW2
965 				    &= HALFLONG - 1);
966 #else
967 				    %= HALFLONG);
968 #endif
969 			}
970 		else
971 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
972 				if (t == -1)
973 					return (0);
974 				sum += (long)t << (shift
975 #ifdef POW2
976 				    &= HALFLONG - 1);
977 #else
978 				    %= HALFLONG);
979 #endif
980 			}
981 	} else {
982 		for (shift = 0;;) {
983 			switch (t = getc(f)) {
984 			case -1:
985 				return (0);
986 			case '\t':
987 			case ' ':
988 				space++;
989 				continue;
990 			default:
991 				if (space && !wflag) {
992 					shift += 7;
993 					space = 0;
994 				}
995 				sum += (long)chrtran[t] << (shift
996 #ifdef POW2
997 				    &= HALFLONG - 1);
998 #else
999 				    %= HALFLONG);
1000 #endif
1001 				shift += 7;
1002 				continue;
1003 			case '\n':
1004 				break;
1005 			}
1006 			break;
1007 		}
1008 	}
1009 	sum = low(sum) + high(sum);
1010 	return ((short) low(sum) + (short) high(sum));
1011 }
1012 
1013 static int
1014 asciifile(FILE *f)
1015 {
1016 	char buf[BUFSIZ], *cp;
1017 	int cnt;
1018 
1019 	fseek(f, 0L, SEEK_SET);
1020 	cnt = fread(buf, 1, BUFSIZ, f);
1021 	cp = buf;
1022 	while (--cnt >= 0)
1023 		if (*cp++ & 0200)
1024 			return (0);
1025 	return (1);
1026 }
1027 
1028 /* dump accumulated "context" diff changes */
1029 static void
1030 dump_context_vec(void)
1031 {
1032 	struct context_vec *cvp = context_vec_start;
1033 	int lowa, upb, lowc, upd, do_output;
1034 	int a, b, c, d;
1035 	char ch;
1036 
1037 	if (cvp > context_vec_ptr)
1038 		return;
1039 
1040 	b = d = 0;		/* gcc */
1041 	lowa = max(1, cvp->a - context);
1042 	upb = min(len[0], context_vec_ptr->b + context);
1043 	lowc = max(1, cvp->c - context);
1044 	upd = min(len[1], context_vec_ptr->d + context);
1045 
1046 	printf("***************\n*** ");
1047 	range(lowa, upb, ",");
1048 	printf(" ****\n");
1049 
1050 	/*
1051 	 * output changes to the "old" file.  The first loop suppresses
1052 	 * output if there were no changes to the "old" file (we'll see
1053 	 * the "old" lines as context in the "new" list).
1054 	 */
1055 	do_output = 0;
1056 	for (; cvp <= context_vec_ptr; cvp++)
1057 		if (cvp->a <= cvp->b) {
1058 			cvp = context_vec_start;
1059 			do_output++;
1060 			break;
1061 		}
1062 	if (do_output) {
1063 		while (cvp <= context_vec_ptr) {
1064 			a = cvp->a;
1065 			b = cvp->b;
1066 			c = cvp->c;
1067 			d = cvp->d;
1068 
1069 			if (a <= b && c <= d)
1070 				ch = 'c';
1071 			else
1072 				ch = (a <= b) ? 'd' : 'a';
1073 
1074 			if (ch == 'a')
1075 				fetch(ixold, lowa, b, input[0], "  ", 0);
1076 			else {
1077 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1078 				fetch(ixold, a, b, input[0],
1079 				    ch == 'c' ? "! " : "- ", 0);
1080 			}
1081 			lowa = b + 1;
1082 			cvp++;
1083 		}
1084 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1085 	}
1086 	/* output changes to the "new" file */
1087 	printf("--- ");
1088 	range(lowc, upd, ",");
1089 	printf(" ----\n");
1090 
1091 	do_output = 0;
1092 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1093 		if (cvp->c <= cvp->d) {
1094 			cvp = context_vec_start;
1095 			do_output++;
1096 			break;
1097 		}
1098 	if (do_output) {
1099 		while (cvp <= context_vec_ptr) {
1100 			a = cvp->a;
1101 			b = cvp->b;
1102 			c = cvp->c;
1103 			d = cvp->d;
1104 
1105 			if (a <= b && c <= d)
1106 				ch = 'c';
1107 			else
1108 				ch = (a <= b) ? 'd' : 'a';
1109 
1110 			if (ch == 'd')
1111 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1112 			else {
1113 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1114 				fetch(ixnew, c, d, input[1],
1115 				    ch == 'c' ? "! " : "+ ", 0);
1116 			}
1117 			lowc = d + 1;
1118 			cvp++;
1119 		}
1120 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1121 	}
1122 	context_vec_ptr = context_vec_start - 1;
1123 }
1124 
1125 /* dump accumulated "unified" diff changes */
1126 static void
1127 dump_unified_vec(void)
1128 {
1129 	struct context_vec *cvp = context_vec_start;
1130 	int lowa, upb, lowc, upd;
1131 	int a, b, c, d;
1132 	char ch;
1133 
1134 	if (cvp > context_vec_ptr)
1135 		return;
1136 
1137 	b = d = 0;		/* gcc */
1138 	lowa = max(1, cvp->a - context);
1139 	upb = min(len[0], context_vec_ptr->b + context);
1140 	lowc = max(1, cvp->c - context);
1141 	upd = min(len[1], context_vec_ptr->d + context);
1142 
1143 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1144 	    lowc, upd - lowc + 1);
1145 
1146 	/*
1147 	 * Output changes in "unified" diff format--the old and new lines
1148 	 * are printed together.
1149 	 */
1150 	for (; cvp <= context_vec_ptr; cvp++) {
1151 		a = cvp->a;
1152 		b = cvp->b;
1153 		c = cvp->c;
1154 		d = cvp->d;
1155 
1156 		/*
1157 		 * c: both new and old changes
1158 		 * d: only changes in the old file
1159 		 * a: only changes in the new file
1160 		 */
1161 		if (a <= b && c <= d)
1162 			ch = 'c';
1163 		else
1164 			ch = (a <= b) ? 'd' : 'a';
1165 
1166 		switch (ch) {
1167 		case 'c':
1168 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1169 			fetch(ixold, a, b, input[0], "-", 0);
1170 			fetch(ixnew, c, d, input[1], "+", 0);
1171 			break;
1172 		case 'd':
1173 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1174 			fetch(ixold, a, b, input[0], "-", 0);
1175 			break;
1176 		case 'a':
1177 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1178 			fetch(ixnew, c, d, input[1], "+", 0);
1179 			break;
1180 		}
1181 		lowa = b + 1;
1182 		lowc = d + 1;
1183 	}
1184 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1185 
1186 	context_vec_ptr = context_vec_start - 1;
1187 }
1188