xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 90f56ad89a9a2865b738e2fb968a31f24e4f9aa7)
1 /*	$OpenBSD: diffreg.c,v 1.22 2003/06/26 22:04:45 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		error("%s", diffh);
236 	}
237 	chrtran = (iflag ? cup2low : clow2low);
238 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
239 		errorx("can't specify - -");
240 	if (S_ISDIR(stb1.st_mode)) {
241 		file1 = splice(file1, file2);
242 		if (stat(file1, &stb1) < 0)
243 			error("%s", file1);
244 	} else if (!S_ISREG(stb1.st_mode) || strcmp(file1, "-") == 0) {
245 		file1 = copytemp(file1, 1);
246 		if (stat(file1, &stb1) < 0)
247 			error("%s", file1);
248 	}
249 	if (S_ISDIR(stb2.st_mode)) {
250 		file2 = splice(file2, file1);
251 		if (stat(file2, &stb2) < 0)
252 			error("%s", file2);
253 	} else if (!S_ISREG(stb2.st_mode) || strcmp(file2, "-") == 0) {
254 		file2 = copytemp(file2, 2);
255 		if (stat(file2, &stb2) < 0)
256 			error("%s", file2);
257 	}
258 	if ((f1 = fopen(file1, "r")) == NULL)
259 		error("%s", file1);
260 	if ((f2 = fopen(file2, "r")) == NULL)
261 		error("%s", file2);
262 	if (S_ISREG(stb1.st_mode) && S_ISREG(stb2.st_mode) &&
263 	    stb1.st_size != stb2.st_size)
264 		goto notsame;
265 	for (;;) {
266 		i = fread(buf1, 1, BUFSIZ, f1);
267 		j = fread(buf2, 1, BUFSIZ, f2);
268 		if (i < 0 || j < 0 || i != j)
269 			goto notsame;
270 		if (i == 0 && j == 0) {
271 			fclose(f1);
272 			fclose(f2);
273 			status = 0;	/* files don't differ */
274 			goto same;
275 		}
276 		for (j = 0; j < i; j++)
277 			if (buf1[j] != buf2[j])
278 				goto notsame;
279 	}
280 notsame:
281 	/*
282 	 * Files certainly differ at this point; set status accordingly
283 	 */
284 	status = 1;
285 	if (!asciifile(f1) || !asciifile(f2)) {
286 		printf("Binary files %s and %s differ\n", file1, file2);
287 		exit(status);
288 	}
289 	prepare(0, f1);
290 	prepare(1, f2);
291 	fclose(f1);
292 	fclose(f2);
293 	prune();
294 	sort(sfile[0], slen[0]);
295 	sort(sfile[1], slen[1]);
296 
297 	member = (int *)file[1];
298 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
299 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
300 
301 	class = (int *)file[0];
302 	unsort(sfile[0], slen[0], class);
303 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
304 
305 	klist = emalloc((slen[0] + 2) * sizeof(int));
306 	clist = emalloc(sizeof(cand));
307 	i = stone(class, slen[0], member, klist);
308 	free(member);
309 	free(class);
310 
311 	J = emalloc((len[0] + 2) * sizeof(int));
312 	unravel(klist[i]);
313 	free(clist);
314 	free(klist);
315 
316 	ixold = emalloc((len[0] + 2) * sizeof(long));
317 	ixnew = emalloc((len[1] + 2) * sizeof(long));
318 	check();
319 	output();
320 	status = anychange;
321 same:
322 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
323 		printf("No differences encountered\n");
324 }
325 
326 char *tempfiles[2];
327 
328 char *
329 copytemp(const char *file, int n)
330 {
331 	char buf[BUFSIZ], *tempdir, *tempfile;
332 	int i, ifd, ofd;
333 
334 	if (n != 1 && n != 2)
335 		return (NULL);
336 
337 	if (strcmp(file, "-") == 0)
338 		ifd = STDIN_FILENO;
339 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
340 		error("%s", file);
341 
342 	if ((tempdir = getenv("TMPDIR")) == NULL)
343 		tempdir = _PATH_TMP;
344 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
345 		error(NULL);
346 	tempfiles[n - 1] = tempfile;
347 
348 	signal(SIGHUP, done);
349 	signal(SIGINT, done);
350 	signal(SIGPIPE, done);
351 	signal(SIGTERM, done);
352 	ofd = mkstemp(tempfile);
353 	if (ofd < 0)
354 		error("%s", tempfile);
355 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
356 		if (write(ofd, buf, i) != i)
357 			error("%s", tempfile);
358 	}
359 	close(ifd);
360 	close(ofd);
361 	return (tempfile);
362 }
363 
364 char *
365 splice(char *dir, char *file)
366 {
367 	char *tail, *buf;
368 	size_t len;
369 
370 	if (!strcmp(file, "-"))
371 		errorx("can't specify - with other arg directory");
372 	tail = strrchr(file, '/');
373 	if (tail == NULL)
374 		tail = file;
375 	else
376 		tail++;
377 	len = strlen(dir) + 1 + strlen(tail) + 1;
378 	buf = emalloc(len);
379 	snprintf(buf, len, "%s/%s", dir, tail);
380 	return (buf);
381 }
382 
383 static void
384 prepare(int i, FILE *fd)
385 {
386 	struct line *p;
387 	int j, h;
388 
389 	fseek(fd, 0L, SEEK_SET);
390 	p = emalloc(3 * sizeof(struct line));
391 	for (j = 0; (h = readhash(fd));) {
392 		p = erealloc(p, (++j + 3) * sizeof(struct line));
393 		p[j].value = h;
394 	}
395 	len[i] = j;
396 	file[i] = p;
397 }
398 
399 static void
400 prune(void)
401 {
402 	int i, j;
403 
404 	for (pref = 0; pref < len[0] && pref < len[1] &&
405 	    file[0][pref + 1].value == file[1][pref + 1].value;
406 	    pref++)
407 		;
408 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
409 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
410 	    suff++)
411 		;
412 	for (j = 0; j < 2; j++) {
413 		sfile[j] = file[j] + pref;
414 		slen[j] = len[j] - pref - suff;
415 		for (i = 0; i <= slen[j]; i++)
416 			sfile[j][i].serial = i;
417 	}
418 }
419 
420 static void
421 equiv(struct line *a, int n, struct line *b, int m, int *c)
422 {
423 	int i, j;
424 
425 	i = j = 1;
426 	while (i <= n && j <= m) {
427 		if (a[i].value < b[j].value)
428 			a[i++].value = 0;
429 		else if (a[i].value == b[j].value)
430 			a[i++].value = j;
431 		else
432 			j++;
433 	}
434 	while (i <= n)
435 		a[i++].value = 0;
436 	b[m + 1].value = 0;
437 	j = 0;
438 	while (++j <= m) {
439 		c[j] = -b[j].serial;
440 		while (b[j + 1].value == b[j].value) {
441 			j++;
442 			c[j] = b[j].serial;
443 		}
444 	}
445 	c[j] = -1;
446 }
447 
448 static int
449 stone(int *a, int n, int *b, int *c)
450 {
451 	int i, k, y, j, l;
452 	int oldc, tc, oldl;
453 
454 	k = 0;
455 	c[0] = newcand(0, 0, 0);
456 	for (i = 1; i <= n; i++) {
457 		j = a[i];
458 		if (j == 0)
459 			continue;
460 		y = -b[j];
461 		oldl = 0;
462 		oldc = c[0];
463 		do {
464 			if (y <= clist[oldc].y)
465 				continue;
466 			l = search(c, k, y);
467 			if (l != oldl + 1)
468 				oldc = c[l - 1];
469 			if (l <= k) {
470 				if (clist[c[l]].y <= y)
471 					continue;
472 				tc = c[l];
473 				c[l] = newcand(i, y, oldc);
474 				oldc = tc;
475 				oldl = l;
476 			} else {
477 				c[l] = newcand(i, y, oldc);
478 				k++;
479 				break;
480 			}
481 		} while ((y = b[++j]) > 0);
482 	}
483 	return (k);
484 }
485 
486 static int
487 newcand(int x, int y, int pred)
488 {
489 	struct cand *q;
490 
491 	clist = erealloc(clist, ++clen * sizeof(cand));
492 	q = clist + clen - 1;
493 	q->x = x;
494 	q->y = y;
495 	q->pred = pred;
496 	return (clen - 1);
497 }
498 
499 static int
500 search(int *c, int k, int y)
501 {
502 	int i, j, l, t;
503 
504 	if (clist[c[k]].y < y)	/* quick look for typical case */
505 		return (k + 1);
506 	i = 0;
507 	j = k + 1;
508 	while (1) {
509 		l = i + j;
510 		if ((l >>= 1) <= i)
511 			break;
512 		t = clist[c[l]].y;
513 		if (t > y)
514 			j = l;
515 		else if (t < y)
516 			i = l;
517 		else
518 			return (l);
519 	}
520 	return (l + 1);
521 }
522 
523 static void
524 unravel(int p)
525 {
526 	struct cand *q;
527 	int i;
528 
529 	for (i = 0; i <= len[0]; i++)
530 		J[i] = i <= pref ? i :
531 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
532 	for (q = clist + p; q->y != 0; q = clist + q->pred)
533 		J[q->x + pref] = q->y + pref;
534 }
535 
536 /*
537  * Check does double duty:
538  *  1.	ferret out any fortuitous correspondences due
539  *	to confounding by hashing (which result in "jackpot")
540  *  2.  collect random access indexes to the two files
541  */
542 static void
543 check(void)
544 {
545 	int i, j, jackpot, c, d;
546 	long ctold, ctnew;
547 
548 	if ((input[0] = fopen(file1, "r")) == NULL)
549 		error("%s", file1);
550 	if ((input[1] = fopen(file2, "r")) == NULL)
551 		error("%s", file2);
552 	j = 1;
553 	ixold[0] = ixnew[0] = 0;
554 	jackpot = 0;
555 	ctold = ctnew = 0;
556 	for (i = 1; i <= len[0]; i++) {
557 		if (J[i] == 0) {
558 			ixold[i] = ctold += skipline(0);
559 			continue;
560 		}
561 		while (j < J[i]) {
562 			ixnew[j] = ctnew += skipline(1);
563 			j++;
564 		}
565 		if (bflag || wflag || iflag) {
566 			for (;;) {
567 				c = getc(input[0]);
568 				d = getc(input[1]);
569 				ctold++;
570 				ctnew++;
571 				if (bflag && isspace(c) && isspace(d)) {
572 					do {
573 						if (c == '\n')
574 							break;
575 						ctold++;
576 					} while (isspace(c = getc(input[0])));
577 					do {
578 						if (d == '\n')
579 							break;
580 						ctnew++;
581 					} while (isspace(d = getc(input[1])));
582 				} else if (wflag) {
583 					while (isspace(c) && c != '\n') {
584 						c = getc(input[0]);
585 						ctold++;
586 					}
587 					while (isspace(d) && d != '\n') {
588 						d = getc(input[1]);
589 						ctnew++;
590 					}
591 				}
592 				if (chrtran[c] != chrtran[d]) {
593 					jackpot++;
594 					J[i] = 0;
595 					if (c != '\n')
596 						ctold += skipline(0);
597 					if (d != '\n')
598 						ctnew += skipline(1);
599 					break;
600 				}
601 				if (c == '\n')
602 					break;
603 			}
604 		} else {
605 			for (;;) {
606 				ctold++;
607 				ctnew++;
608 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
609 					/* jackpot++; */
610 					J[i] = 0;
611 					if (c != '\n')
612 						ctold += skipline(0);
613 					if (d != '\n')
614 						ctnew += skipline(1);
615 					break;
616 				}
617 				if (c == '\n')
618 					break;
619 			}
620 		}
621 		ixold[i] = ctold;
622 		ixnew[j] = ctnew;
623 		j++;
624 	}
625 	for (; j <= len[1]; j++) {
626 		ixnew[j] = ctnew += skipline(1);
627 	}
628 	fclose(input[0]);
629 	fclose(input[1]);
630 	/*
631 	 * if (jackpot)
632 	 *	fprintf(stderr, "jackpot\n");
633 	 */
634 }
635 
636 /* shellsort CACM #201 */
637 static void
638 sort(struct line *a, int n)
639 {
640 	struct line *ai, *aim, w;
641 	int j, m = 0, k;
642 
643 	if (n == 0)
644 		return;
645 	for (j = 1; j <= n; j *= 2)
646 		m = 2 * j - 1;
647 	for (m /= 2; m != 0; m /= 2) {
648 		k = n - m;
649 		for (j = 1; j <= k; j++) {
650 			for (ai = &a[j]; ai > a; ai -= m) {
651 				aim = &ai[m];
652 				if (aim < ai)
653 					break;	/* wraparound */
654 				if (aim->value > ai[0].value ||
655 				    (aim->value == ai[0].value &&
656 					aim->serial > ai[0].serial))
657 					break;
658 				w.value = ai[0].value;
659 				ai[0].value = aim->value;
660 				aim->value = w.value;
661 				w.serial = ai[0].serial;
662 				ai[0].serial = aim->serial;
663 				aim->serial = w.serial;
664 			}
665 		}
666 	}
667 }
668 
669 static void
670 unsort(struct line *f, int l, int *b)
671 {
672 	int *a, i;
673 
674 	a = emalloc((l + 1) * sizeof(int));
675 	for (i = 1; i <= l; i++)
676 		a[f[i].serial] = f[i].value;
677 	for (i = 1; i <= l; i++)
678 		b[i] = a[i];
679 	free(a);
680 }
681 
682 static int
683 skipline(int f)
684 {
685 	int i, c;
686 
687 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
688 		if (c < 0)
689 			return (i);
690 	return (i);
691 }
692 
693 static void
694 output(void)
695 {
696 	int m, i0, i1, j0, j1;
697 
698 	input[0] = fopen(file1, "r");
699 	input[1] = fopen(file2, "r");
700 	m = len[0];
701 	J[0] = 0;
702 	J[m + 1] = len[1] + 1;
703 	if (opt != D_EDIT) {
704 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
705 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
706 				i0++;
707 			j0 = J[i0 - 1] + 1;
708 			i1 = i0 - 1;
709 			while (i1 < m && J[i1 + 1] == 0)
710 				i1++;
711 			j1 = J[i1 + 1] - 1;
712 			J[i1] = j1;
713 			change(i0, i1, j0, j1);
714 		}
715 	} else {
716 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
717 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
718 				i0--;
719 			j0 = J[i0 + 1] - 1;
720 			i1 = i0 + 1;
721 			while (i1 > 1 && J[i1 - 1] == 0)
722 				i1--;
723 			j1 = J[i1 - 1] + 1;
724 			J[i1] = j1;
725 			change(i1, i0, j1, j0);
726 		}
727 	}
728 	if (m == 0)
729 		change(1, 0, 1, len[1]);
730 	if (opt == D_IFDEF) {
731 		for (;;) {
732 #define	c i0
733 			c = getc(input[0]);
734 			if (c < 0)
735 				return;
736 			putchar(c);
737 		}
738 #undef c
739 	}
740 	if (anychange != 0) {
741 		if (opt == D_CONTEXT)
742 			dump_context_vec();
743 		else if (opt == D_UNIFIED)
744 			dump_unified_vec();
745 	}
746 }
747 
748 /*
749  * The following struct is used to record change information when
750  * doing a "context" diff.  (see routine "change" to understand the
751  * highly mneumonic field names)
752  */
753 struct context_vec {
754 	int a;			/* start line in old file */
755 	int b;			/* end line in old file */
756 	int c;			/* start line in new file */
757 	int d;			/* end line in new file */
758 };
759 
760 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
761 
762 #define	MAX_CONTEXT	128
763 
764 /*
765  * indicate that there is a difference between lines a and b of the from file
766  * to get to lines c to d of the to file. If a is greater then b then there
767  * are no lines in the from file involved and this means that there were
768  * lines appended (beginning at b). If c is greater than d then there are
769  * lines missing from the to file.
770  */
771 static void
772 change(int a, int b, int c, int d)
773 {
774 	struct stat stbuf;
775 
776 	if (opt != D_IFDEF && a > b && c > d)
777 		return;
778 	if (anychange == 0) {
779 		anychange = 1;
780 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
781 			stat(file1, &stbuf);
782 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
783 			   file1, ctime(&stbuf.st_mtime));
784 			stat(file2, &stbuf);
785 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
786 			    file2, ctime(&stbuf.st_mtime));
787 			context_vec_start = emalloc(MAX_CONTEXT *
788 			    sizeof(struct context_vec));
789 			context_vec_end = context_vec_start + MAX_CONTEXT;
790 			context_vec_ptr = context_vec_start - 1;
791 		}
792 	}
793 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
794 		/*
795 		 * If this new change is within 'context' lines of
796 		 * the previous change, just add it to the change
797 		 * record.  If the record is full or if this
798 		 * change is more than 'context' lines from the previous
799 		 * change, dump the record, reset it & add the new change.
800 		 */
801 		if (context_vec_ptr >= context_vec_end ||
802 		    (context_vec_ptr >= context_vec_start &&
803 		    a > (context_vec_ptr->b + 2 * context) &&
804 		    c > (context_vec_ptr->d + 2 * context))) {
805 			if (opt == D_CONTEXT)
806 				dump_context_vec();
807 			else
808 				dump_unified_vec();
809 		}
810 		context_vec_ptr++;
811 		context_vec_ptr->a = a;
812 		context_vec_ptr->b = b;
813 		context_vec_ptr->c = c;
814 		context_vec_ptr->d = d;
815 		return;
816 	}
817 	switch (opt) {
818 
819 	case D_NORMAL:
820 	case D_EDIT:
821 		range(a, b, ",");
822 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
823 		if (opt == D_NORMAL)
824 			range(c, d, ",");
825 		putchar('\n');
826 		break;
827 	case D_REVERSE:
828 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
829 		range(a, b, " ");
830 		putchar('\n');
831 		break;
832 	case D_NREVERSE:
833 		if (a > b)
834 			printf("a%d %d\n", b, d - c + 1);
835 		else {
836 			printf("d%d %d\n", a, b - a + 1);
837 			if (!(c > d))
838 				/* add changed lines */
839 				printf("a%d %d\n", b, d - c + 1);
840 		}
841 		break;
842 	}
843 	if (opt == D_NORMAL || opt == D_IFDEF) {
844 		fetch(ixold, a, b, input[0], "< ", 1);
845 		if (a <= b && c <= d && opt == D_NORMAL)
846 			prints("---\n");
847 	}
848 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
849 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
850 		prints(".\n");
851 	if (inifdef) {
852 		fprintf(stdout, "#endif /* %s */\n", ifdefname);
853 		inifdef = 0;
854 	}
855 }
856 
857 static void
858 range(int a, int b, char *separator)
859 {
860 	printf("%d", a > b ? b : a);
861 	if (a < b)
862 		printf("%s%d", separator, b);
863 }
864 
865 static void
866 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
867 {
868 	int i, j, c, col, nc;
869 
870 	/*
871 	 * When doing #ifdef's, copy down to current line
872 	 * if this is the first file, so that stuff makes it to output.
873 	 */
874 	if (opt == D_IFDEF && oldfile) {
875 		long curpos = ftell(lb);
876 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
877 		nc = f[a > b ? b : a - 1] - curpos;
878 		for (i = 0; i < nc; i++)
879 			putchar(getc(lb));
880 	}
881 	if (a > b)
882 		return;
883 	if (opt == D_IFDEF) {
884 		if (inifdef) {
885 			fprintf(stdout, "#else /* %s%s */\n",
886 			    oldfile == 1 ? "!" : "", ifdefname);
887 		} else {
888 			if (oldfile)
889 				fprintf(stdout, "#ifndef %s\n", ifdefname);
890 			else
891 				fprintf(stdout, "#ifdef %s\n", ifdefname);
892 		}
893 		inifdef = 1 + oldfile;
894 	}
895 	for (i = a; i <= b; i++) {
896 		fseek(lb, f[i - 1], SEEK_SET);
897 		nc = f[i] - f[i - 1];
898 		if (opt != D_IFDEF)
899 			prints(s);
900 		col = 0;
901 		for (j = 0; j < nc; j++) {
902 			c = getc(lb);
903 			if (c == '\t' && tflag)
904 				do
905 					putchar(' ');
906 				while (++col & 7);
907 			else {
908 				putchar(c);
909 				col++;
910 			}
911 		}
912 	}
913 }
914 
915 #define POW2			/* define only if HALFLONG is 2**n */
916 #define HALFLONG 16
917 #define low(x)	(x&((1L<<HALFLONG)-1))
918 #define high(x)	(x>>HALFLONG)
919 
920 /*
921  * hashing has the effect of
922  * arranging line in 7-bit bytes and then
923  * summing 1-s complement in 16-bit hunks
924  */
925 static int
926 readhash(FILE *f)
927 {
928 	unsigned int shift;
929 	int t, space;
930 	long sum;
931 
932 	sum = 1;
933 	space = 0;
934 	if (!bflag && !wflag) {
935 		if (iflag)
936 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
937 				if (t == -1)
938 					return (0);
939 				sum += (long)chrtran[t] << (shift
940 #ifdef POW2
941 				    &= HALFLONG - 1);
942 #else
943 				    %= HALFLONG);
944 #endif
945 			}
946 		else
947 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
948 				if (t == -1)
949 					return (0);
950 				sum += (long)t << (shift
951 #ifdef POW2
952 				    &= HALFLONG - 1);
953 #else
954 				    %= HALFLONG);
955 #endif
956 			}
957 	} else {
958 		for (shift = 0;;) {
959 			switch (t = getc(f)) {
960 			case -1:
961 				return (0);
962 			case '\t':
963 			case ' ':
964 				space++;
965 				continue;
966 			default:
967 				if (space && !wflag) {
968 					shift += 7;
969 					space = 0;
970 				}
971 				sum += (long)chrtran[t] << (shift
972 #ifdef POW2
973 				    &= HALFLONG - 1);
974 #else
975 				    %= HALFLONG);
976 #endif
977 				shift += 7;
978 				continue;
979 			case '\n':
980 				break;
981 			}
982 			break;
983 		}
984 	}
985 	sum = low(sum) + high(sum);
986 	return ((short) low(sum) + (short) high(sum));
987 }
988 
989 static int
990 asciifile(FILE *f)
991 {
992 	char buf[BUFSIZ], *cp;
993 	int cnt;
994 
995 	fseek(f, 0L, SEEK_SET);
996 	cnt = fread(buf, 1, BUFSIZ, f);
997 	cp = buf;
998 	while (--cnt >= 0)
999 		if (*cp++ & 0200)
1000 			return (0);
1001 	return (1);
1002 }
1003 
1004 /* dump accumulated "context" diff changes */
1005 static void
1006 dump_context_vec(void)
1007 {
1008 	struct context_vec *cvp = context_vec_start;
1009 	int lowa, upb, lowc, upd, do_output;
1010 	int a, b, c, d;
1011 	char ch;
1012 
1013 	if (context_vec_start > context_vec_ptr)
1014 		return;
1015 
1016 	b = d = 0;		/* gcc */
1017 	lowa = max(1, cvp->a - context);
1018 	upb = min(len[0], context_vec_ptr->b + context);
1019 	lowc = max(1, cvp->c - context);
1020 	upd = min(len[1], context_vec_ptr->d + context);
1021 
1022 	printf("***************\n*** ");
1023 	range(lowa, upb, ",");
1024 	printf(" ****\n");
1025 
1026 	/*
1027 	 * output changes to the "old" file.  The first loop suppresses
1028 	 * output if there were no changes to the "old" file (we'll see
1029 	 * the "old" lines as context in the "new" list).
1030 	 */
1031 	do_output = 0;
1032 	for (; cvp <= context_vec_ptr; cvp++)
1033 		if (cvp->a <= cvp->b) {
1034 			cvp = context_vec_start;
1035 			do_output++;
1036 			break;
1037 		}
1038 	if (do_output) {
1039 		while (cvp <= context_vec_ptr) {
1040 			a = cvp->a;
1041 			b = cvp->b;
1042 			c = cvp->c;
1043 			d = cvp->d;
1044 
1045 			if (a <= b && c <= d)
1046 				ch = 'c';
1047 			else
1048 				ch = (a <= b) ? 'd' : 'a';
1049 
1050 			if (ch == 'a')
1051 				fetch(ixold, lowa, b, input[0], "  ", 0);
1052 			else {
1053 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1054 				fetch(ixold, a, b, input[0],
1055 				    ch == 'c' ? "! " : "- ", 0);
1056 			}
1057 			lowa = b + 1;
1058 			cvp++;
1059 		}
1060 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1061 	}
1062 	/* output changes to the "new" file */
1063 	printf("--- ");
1064 	range(lowc, upd, ",");
1065 	printf(" ----\n");
1066 
1067 	do_output = 0;
1068 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1069 		if (cvp->c <= cvp->d) {
1070 			cvp = context_vec_start;
1071 			do_output++;
1072 			break;
1073 		}
1074 	if (do_output) {
1075 		while (cvp <= context_vec_ptr) {
1076 			a = cvp->a;
1077 			b = cvp->b;
1078 			c = cvp->c;
1079 			d = cvp->d;
1080 
1081 			if (a <= b && c <= d)
1082 				ch = 'c';
1083 			else
1084 				ch = (a <= b) ? 'd' : 'a';
1085 
1086 			if (ch == 'd')
1087 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1088 			else {
1089 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1090 				fetch(ixnew, c, d, input[1],
1091 				    ch == 'c' ? "! " : "+ ", 0);
1092 			}
1093 			lowc = d + 1;
1094 			cvp++;
1095 		}
1096 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1097 	}
1098 	context_vec_ptr = context_vec_start - 1;
1099 }
1100 
1101 /* dump accumulated "unified" diff changes */
1102 static void
1103 dump_unified_vec(void)
1104 {
1105 	struct context_vec *cvp = context_vec_start;
1106 	int lowa, upb, lowc, upd;
1107 	int a, b, c, d;
1108 	char ch;
1109 
1110 	if (context_vec_start > context_vec_ptr)
1111 		return;
1112 
1113 	b = d = 0;		/* gcc */
1114 	lowa = max(1, cvp->a - context);
1115 	upb = min(len[0], context_vec_ptr->b + context);
1116 	lowc = max(1, cvp->c - context);
1117 	upd = min(len[1], context_vec_ptr->d + context);
1118 
1119 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1120 	    lowc, upd - lowc + 1);
1121 
1122 	/*
1123 	 * Output changes in "unified" diff format--the old and new lines
1124 	 * are printed together.
1125 	 */
1126 	for (; cvp <= context_vec_ptr; cvp++) {
1127 		a = cvp->a;
1128 		b = cvp->b;
1129 		c = cvp->c;
1130 		d = cvp->d;
1131 
1132 		/*
1133 		 * c: both new and old changes
1134 		 * d: only changes in the old file
1135 		 * a: only changes in the new file
1136 		 */
1137 		if (a <= b && c <= d)
1138 			ch = 'c';
1139 		else
1140 			ch = (a <= b) ? 'd' : 'a';
1141 
1142 		switch (ch) {
1143 		case 'c':
1144 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1145 			fetch(ixold, a, b, input[0], "-", 0);
1146 			fetch(ixnew, c, d, input[1], "+", 0);
1147 			break;
1148 		case 'd':
1149 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1150 			fetch(ixold, a, b, input[0], "-", 0);
1151 			break;
1152 		case 'a':
1153 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1154 			fetch(ixnew, c, d, input[1], "+", 0);
1155 			break;
1156 		}
1157 		lowa = b + 1;
1158 		lowc = d + 1;
1159 	}
1160 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1161 
1162 	context_vec_ptr = context_vec_start - 1;
1163 }
1164