xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 2a1593df6988ca85abee7f39eb02a22b6cb6f306)
1 /*	$OpenBSD: diffreg.c,v 1.23 2003/06/27 20:28:13 tedu Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		error("%s", diffh);
236 	}
237 	chrtran = (iflag ? cup2low : clow2low);
238 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
239 		errorx("can't specify - -");
240 	if (S_ISDIR(stb1.st_mode)) {
241 		file1 = splice(file1, file2);
242 		if (stat(file1, &stb1) < 0)
243 			error("%s", file1);
244 	} else if (!S_ISREG(stb1.st_mode) || strcmp(file1, "-") == 0) {
245 		file1 = copytemp(file1, 1);
246 		if (stat(file1, &stb1) < 0)
247 			error("%s", file1);
248 	}
249 	if (S_ISDIR(stb2.st_mode)) {
250 		file2 = splice(file2, file1);
251 		if (stat(file2, &stb2) < 0)
252 			error("%s", file2);
253 	} else if (!S_ISREG(stb2.st_mode) || strcmp(file2, "-") == 0) {
254 		file2 = copytemp(file2, 2);
255 		if (stat(file2, &stb2) < 0)
256 			error("%s", file2);
257 	}
258 	if ((f1 = fopen(file1, "r")) == NULL)
259 		error("%s", file1);
260 	if ((f2 = fopen(file2, "r")) == NULL)
261 		error("%s", file2);
262 	if (S_ISREG(stb1.st_mode) && S_ISREG(stb2.st_mode) &&
263 	    stb1.st_size != stb2.st_size)
264 		goto notsame;
265 	for (;;) {
266 		i = fread(buf1, 1, BUFSIZ, f1);
267 		j = fread(buf2, 1, BUFSIZ, f2);
268 		if (i < 0 || j < 0 || i != j)
269 			goto notsame;
270 		if (i == 0 && j == 0) {
271 			fclose(f1);
272 			fclose(f2);
273 			status = 0;	/* files don't differ */
274 			goto same;
275 		}
276 		for (j = 0; j < i; j++)
277 			if (buf1[j] != buf2[j])
278 				goto notsame;
279 	}
280 notsame:
281 	/*
282 	 * Files certainly differ at this point; set status accordingly
283 	 */
284 	status = 1;
285 	if (!asciifile(f1) || !asciifile(f2)) {
286 		printf("Binary files %s and %s differ\n", file1, file2);
287 		exit(status);
288 	}
289 	prepare(0, f1);
290 	prepare(1, f2);
291 	fclose(f1);
292 	fclose(f2);
293 	prune();
294 	sort(sfile[0], slen[0]);
295 	sort(sfile[1], slen[1]);
296 
297 	member = (int *)file[1];
298 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
299 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
300 
301 	class = (int *)file[0];
302 	unsort(sfile[0], slen[0], class);
303 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
304 
305 	klist = emalloc((slen[0] + 2) * sizeof(int));
306 	clist = emalloc(sizeof(cand));
307 	i = stone(class, slen[0], member, klist);
308 	free(member);
309 	free(class);
310 
311 	J = emalloc((len[0] + 2) * sizeof(int));
312 	unravel(klist[i]);
313 	free(clist);
314 	free(klist);
315 
316 	ixold = emalloc((len[0] + 2) * sizeof(long));
317 	ixnew = emalloc((len[1] + 2) * sizeof(long));
318 	check();
319 	output();
320 	status = anychange;
321 same:
322 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
323 		printf("No differences encountered\n");
324 }
325 
326 char *tempfiles[2];
327 
328 char *
329 copytemp(const char *file, int n)
330 {
331 	char buf[BUFSIZ], *tempdir, *tempfile;
332 	int i, ifd, ofd;
333 
334 	if (n != 1 && n != 2)
335 		return (NULL);
336 
337 	if (strcmp(file, "-") == 0)
338 		ifd = STDIN_FILENO;
339 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
340 		error("%s", file);
341 
342 	if ((tempdir = getenv("TMPDIR")) == NULL)
343 		tempdir = _PATH_TMP;
344 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
345 		error(NULL);
346 	tempfiles[n - 1] = tempfile;
347 
348 	signal(SIGHUP, done);
349 	signal(SIGINT, done);
350 	signal(SIGPIPE, done);
351 	signal(SIGTERM, done);
352 	ofd = mkstemp(tempfile);
353 	if (ofd < 0)
354 		error("%s", tempfile);
355 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
356 		if (write(ofd, buf, i) != i)
357 			error("%s", tempfile);
358 	}
359 	close(ifd);
360 	close(ofd);
361 	return (tempfile);
362 }
363 
364 char *
365 splice(char *dir, char *file)
366 {
367 	char *tail, *buf;
368 	size_t len;
369 
370 	if (!strcmp(file, "-"))
371 		errorx("can't specify - with other arg directory");
372 	tail = strrchr(file, '/');
373 	if (tail == NULL)
374 		tail = file;
375 	else
376 		tail++;
377 	len = strlen(dir) + 1 + strlen(tail) + 1;
378 	buf = emalloc(len);
379 	snprintf(buf, len, "%s/%s", dir, tail);
380 	return (buf);
381 }
382 
383 static void
384 prepare(int i, FILE *fd)
385 {
386 	struct line *p;
387 	int j, h;
388 
389 	fseek(fd, 0L, SEEK_SET);
390 	p = emalloc(3 * sizeof(struct line));
391 	for (j = 0; (h = readhash(fd));) {
392 		p = erealloc(p, (++j + 3) * sizeof(struct line));
393 		p[j].value = h;
394 	}
395 	len[i] = j;
396 	file[i] = p;
397 }
398 
399 static void
400 prune(void)
401 {
402 	int i, j;
403 
404 	for (pref = 0; pref < len[0] && pref < len[1] &&
405 	    file[0][pref + 1].value == file[1][pref + 1].value;
406 	    pref++)
407 		;
408 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
409 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
410 	    suff++)
411 		;
412 	for (j = 0; j < 2; j++) {
413 		sfile[j] = file[j] + pref;
414 		slen[j] = len[j] - pref - suff;
415 		for (i = 0; i <= slen[j]; i++)
416 			sfile[j][i].serial = i;
417 	}
418 }
419 
420 static void
421 equiv(struct line *a, int n, struct line *b, int m, int *c)
422 {
423 	int i, j;
424 
425 	i = j = 1;
426 	while (i <= n && j <= m) {
427 		if (a[i].value < b[j].value)
428 			a[i++].value = 0;
429 		else if (a[i].value == b[j].value)
430 			a[i++].value = j;
431 		else
432 			j++;
433 	}
434 	while (i <= n)
435 		a[i++].value = 0;
436 	b[m + 1].value = 0;
437 	j = 0;
438 	while (++j <= m) {
439 		c[j] = -b[j].serial;
440 		while (b[j + 1].value == b[j].value) {
441 			j++;
442 			c[j] = b[j].serial;
443 		}
444 	}
445 	c[j] = -1;
446 }
447 
448 static int
449 stone(int *a, int n, int *b, int *c)
450 {
451 	int i, k, y, j, l;
452 	int oldc, tc, oldl;
453 
454 	k = 0;
455 	c[0] = newcand(0, 0, 0);
456 	for (i = 1; i <= n; i++) {
457 		j = a[i];
458 		if (j == 0)
459 			continue;
460 		y = -b[j];
461 		oldl = 0;
462 		oldc = c[0];
463 		do {
464 			if (y <= clist[oldc].y)
465 				continue;
466 			l = search(c, k, y);
467 			if (l != oldl + 1)
468 				oldc = c[l - 1];
469 			if (l <= k) {
470 				if (clist[c[l]].y <= y)
471 					continue;
472 				tc = c[l];
473 				c[l] = newcand(i, y, oldc);
474 				oldc = tc;
475 				oldl = l;
476 			} else {
477 				c[l] = newcand(i, y, oldc);
478 				k++;
479 				break;
480 			}
481 		} while ((y = b[++j]) > 0);
482 	}
483 	return (k);
484 }
485 
486 static int
487 newcand(int x, int y, int pred)
488 {
489 	struct cand *q;
490 
491 	clist = erealloc(clist, ++clen * sizeof(cand));
492 	q = clist + clen - 1;
493 	q->x = x;
494 	q->y = y;
495 	q->pred = pred;
496 	return (clen - 1);
497 }
498 
499 static int
500 search(int *c, int k, int y)
501 {
502 	int i, j, l, t;
503 
504 	if (clist[c[k]].y < y)	/* quick look for typical case */
505 		return (k + 1);
506 	i = 0;
507 	j = k + 1;
508 	while (1) {
509 		l = i + j;
510 		if ((l >>= 1) <= i)
511 			break;
512 		t = clist[c[l]].y;
513 		if (t > y)
514 			j = l;
515 		else if (t < y)
516 			i = l;
517 		else
518 			return (l);
519 	}
520 	return (l + 1);
521 }
522 
523 static void
524 unravel(int p)
525 {
526 	struct cand *q;
527 	int i;
528 
529 	for (i = 0; i <= len[0]; i++)
530 		J[i] = i <= pref ? i :
531 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
532 	for (q = clist + p; q->y != 0; q = clist + q->pred)
533 		J[q->x + pref] = q->y + pref;
534 }
535 
536 /*
537  * Check does double duty:
538  *  1.	ferret out any fortuitous correspondences due
539  *	to confounding by hashing (which result in "jackpot")
540  *  2.  collect random access indexes to the two files
541  */
542 static void
543 check(void)
544 {
545 	int i, j, jackpot, c, d;
546 	long ctold, ctnew;
547 
548 	if ((input[0] = fopen(file1, "r")) == NULL)
549 		error("%s", file1);
550 	if ((input[1] = fopen(file2, "r")) == NULL)
551 		error("%s", file2);
552 	j = 1;
553 	ixold[0] = ixnew[0] = 0;
554 	jackpot = 0;
555 	ctold = ctnew = 0;
556 	for (i = 1; i <= len[0]; i++) {
557 		if (J[i] == 0) {
558 			ixold[i] = ctold += skipline(0);
559 			continue;
560 		}
561 		while (j < J[i]) {
562 			ixnew[j] = ctnew += skipline(1);
563 			j++;
564 		}
565 		if (bflag || wflag || iflag) {
566 			for (;;) {
567 				c = getc(input[0]);
568 				d = getc(input[1]);
569 				ctold++;
570 				ctnew++;
571 				if (bflag && isspace(c) && isspace(d)) {
572 					do {
573 						if (c == '\n')
574 							break;
575 						ctold++;
576 					} while (isspace(c = getc(input[0])));
577 					do {
578 						if (d == '\n')
579 							break;
580 						ctnew++;
581 					} while (isspace(d = getc(input[1])));
582 				} else if (wflag) {
583 					while (isspace(c) && c != '\n') {
584 						c = getc(input[0]);
585 						ctold++;
586 					}
587 					while (isspace(d) && d != '\n') {
588 						d = getc(input[1]);
589 						ctnew++;
590 					}
591 				}
592 				if (chrtran[c] != chrtran[d]) {
593 					jackpot++;
594 					J[i] = 0;
595 					if (c != '\n')
596 						ctold += skipline(0);
597 					if (d != '\n')
598 						ctnew += skipline(1);
599 					break;
600 				}
601 				if (c == '\n')
602 					break;
603 			}
604 		} else {
605 			for (;;) {
606 				ctold++;
607 				ctnew++;
608 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
609 					/* jackpot++; */
610 					J[i] = 0;
611 					if (c != '\n')
612 						ctold += skipline(0);
613 					if (d != '\n')
614 						ctnew += skipline(1);
615 					break;
616 				}
617 				if (c == '\n')
618 					break;
619 			}
620 		}
621 		ixold[i] = ctold;
622 		ixnew[j] = ctnew;
623 		j++;
624 	}
625 	for (; j <= len[1]; j++) {
626 		ixnew[j] = ctnew += skipline(1);
627 	}
628 	fclose(input[0]);
629 	fclose(input[1]);
630 	/*
631 	 * if (jackpot)
632 	 *	fprintf(stderr, "jackpot\n");
633 	 */
634 }
635 
636 /* shellsort CACM #201 */
637 static void
638 sort(struct line *a, int n)
639 {
640 	struct line *ai, *aim, w;
641 	int j, m = 0, k;
642 
643 	if (n == 0)
644 		return;
645 	for (j = 1; j <= n; j *= 2)
646 		m = 2 * j - 1;
647 	for (m /= 2; m != 0; m /= 2) {
648 		k = n - m;
649 		for (j = 1; j <= k; j++) {
650 			for (ai = &a[j]; ai > a; ai -= m) {
651 				aim = &ai[m];
652 				if (aim < ai)
653 					break;	/* wraparound */
654 				if (aim->value > ai[0].value ||
655 				    (aim->value == ai[0].value &&
656 					aim->serial > ai[0].serial))
657 					break;
658 				w.value = ai[0].value;
659 				ai[0].value = aim->value;
660 				aim->value = w.value;
661 				w.serial = ai[0].serial;
662 				ai[0].serial = aim->serial;
663 				aim->serial = w.serial;
664 			}
665 		}
666 	}
667 }
668 
669 static void
670 unsort(struct line *f, int l, int *b)
671 {
672 	int *a, i;
673 
674 	a = emalloc((l + 1) * sizeof(int));
675 	for (i = 1; i <= l; i++)
676 		a[f[i].serial] = f[i].value;
677 	for (i = 1; i <= l; i++)
678 		b[i] = a[i];
679 	free(a);
680 }
681 
682 static int
683 skipline(int f)
684 {
685 	int i, c;
686 
687 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
688 		if (c < 0)
689 			return (i);
690 	return (i);
691 }
692 
693 static void
694 output(void)
695 {
696 	int m, i0, i1, j0, j1;
697 
698 	input[0] = fopen(file1, "r");
699 	input[1] = fopen(file2, "r");
700 	m = len[0];
701 	J[0] = 0;
702 	J[m + 1] = len[1] + 1;
703 	if (opt != D_EDIT) {
704 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
705 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
706 				i0++;
707 			j0 = J[i0 - 1] + 1;
708 			i1 = i0 - 1;
709 			while (i1 < m && J[i1 + 1] == 0)
710 				i1++;
711 			j1 = J[i1 + 1] - 1;
712 			J[i1] = j1;
713 			change(i0, i1, j0, j1);
714 		}
715 	} else {
716 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
717 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
718 				i0--;
719 			j0 = J[i0 + 1] - 1;
720 			i1 = i0 + 1;
721 			while (i1 > 1 && J[i1 - 1] == 0)
722 				i1--;
723 			j1 = J[i1 - 1] + 1;
724 			J[i1] = j1;
725 			change(i1, i0, j1, j0);
726 		}
727 	}
728 	if (m == 0)
729 		change(1, 0, 1, len[1]);
730 	if (opt == D_IFDEF) {
731 		for (;;) {
732 #define	c i0
733 			c = getc(input[0]);
734 			if (c < 0)
735 				return;
736 			putchar(c);
737 		}
738 #undef c
739 	}
740 	if (anychange != 0) {
741 		if (opt == D_CONTEXT)
742 			dump_context_vec();
743 		else if (opt == D_UNIFIED)
744 			dump_unified_vec();
745 	}
746 }
747 
748 /*
749  * The following struct is used to record change information when
750  * doing a "context" diff.  (see routine "change" to understand the
751  * highly mneumonic field names)
752  */
753 struct context_vec {
754 	int a;			/* start line in old file */
755 	int b;			/* end line in old file */
756 	int c;			/* start line in new file */
757 	int d;			/* end line in new file */
758 };
759 
760 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
761 
762 #define	MAX_CONTEXT	128
763 
764 /*
765  * indicate that there is a difference between lines a and b of the from file
766  * to get to lines c to d of the to file. If a is greater then b then there
767  * are no lines in the from file involved and this means that there were
768  * lines appended (beginning at b). If c is greater than d then there are
769  * lines missing from the to file.
770  */
771 static void
772 change(int a, int b, int c, int d)
773 {
774 	struct stat stbuf;
775 
776 	if (opt != D_IFDEF && a > b && c > d)
777 		return;
778 	if (anychange == 0) {
779 		anychange = 1;
780 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
781 			stat(file1, &stbuf);
782 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
783 			   file1, ctime(&stbuf.st_mtime));
784 			stat(file2, &stbuf);
785 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
786 			    file2, ctime(&stbuf.st_mtime));
787 			context_vec_start = emalloc(MAX_CONTEXT *
788 			    sizeof(struct context_vec));
789 			context_vec_end = context_vec_start + MAX_CONTEXT;
790 			context_vec_ptr = context_vec_start - 1;
791 		}
792 	}
793 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
794 		/*
795 		 * If this new change is within 'context' lines of
796 		 * the previous change, just add it to the change
797 		 * record.  If the record is full or if this
798 		 * change is more than 'context' lines from the previous
799 		 * change, dump the record, reset it & add the new change.
800 		 */
801 		if (context_vec_ptr >= context_vec_end ||
802 		    (context_vec_ptr >= context_vec_start &&
803 		    a > (context_vec_ptr->b + 2 * context) &&
804 		    c > (context_vec_ptr->d + 2 * context))) {
805 			if (opt == D_CONTEXT)
806 				dump_context_vec();
807 			else
808 				dump_unified_vec();
809 		}
810 		context_vec_ptr++;
811 		context_vec_ptr->a = a;
812 		context_vec_ptr->b = b;
813 		context_vec_ptr->c = c;
814 		context_vec_ptr->d = d;
815 		return;
816 	}
817 	switch (opt) {
818 
819 	case D_NORMAL:
820 	case D_EDIT:
821 		range(a, b, ",");
822 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
823 		if (opt == D_NORMAL)
824 			range(c, d, ",");
825 		putchar('\n');
826 		break;
827 	case D_REVERSE:
828 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
829 		range(a, b, " ");
830 		putchar('\n');
831 		break;
832 	case D_NREVERSE:
833 		if (a > b)
834 			printf("a%d %d\n", b, d - c + 1);
835 		else {
836 			printf("d%d %d\n", a, b - a + 1);
837 			if (!(c > d))
838 				/* add changed lines */
839 				printf("a%d %d\n", b, d - c + 1);
840 		}
841 		break;
842 	}
843 	if (opt == D_NORMAL || opt == D_IFDEF) {
844 		fetch(ixold, a, b, input[0], "< ", 1);
845 		if (a <= b && c <= d && opt == D_NORMAL)
846 			prints("---\n");
847 	}
848 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
849 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
850 		prints(".\n");
851 	if (inifdef) {
852 		fprintf(stdout, "#endif /* %s */\n", ifdefname);
853 		inifdef = 0;
854 	}
855 }
856 
857 static void
858 range(int a, int b, char *separator)
859 {
860 	printf("%d", a > b ? b : a);
861 	if (a < b)
862 		printf("%s%d", separator, b);
863 }
864 
865 static void
866 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
867 {
868 	int i, j, c, col, nc;
869 
870 	/*
871 	 * When doing #ifdef's, copy down to current line
872 	 * if this is the first file, so that stuff makes it to output.
873 	 */
874 	if (opt == D_IFDEF && oldfile) {
875 		long curpos = ftell(lb);
876 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
877 		nc = f[a > b ? b : a - 1] - curpos;
878 		for (i = 0; i < nc; i++)
879 			putchar(getc(lb));
880 	}
881 	if (a > b)
882 		return;
883 	if (opt == D_IFDEF) {
884 		if (inifdef) {
885 			fprintf(stdout, "#else /* %s%s */\n",
886 			    oldfile == 1 ? "!" : "", ifdefname);
887 		} else {
888 			if (oldfile)
889 				fprintf(stdout, "#ifndef %s\n", ifdefname);
890 			else
891 				fprintf(stdout, "#ifdef %s\n", ifdefname);
892 		}
893 		inifdef = 1 + oldfile;
894 	}
895 	for (i = a; i <= b; i++) {
896 		fseek(lb, f[i - 1], SEEK_SET);
897 		nc = f[i] - f[i - 1];
898 		if (opt != D_IFDEF)
899 			prints(s);
900 		col = 0;
901 		for (j = 0; j < nc; j++) {
902 			c = getc(lb);
903 			if (c == '\t' && tflag)
904 				do
905 					putchar(' ');
906 				while (++col & 7);
907 			else {
908 				putchar(c);
909 				col++;
910 			}
911 		}
912 	}
913 }
914 
915 #define POW2			/* define only if HALFLONG is 2**n */
916 #define HALFLONG 16
917 #define low(x)	(x&((1L<<HALFLONG)-1))
918 #define high(x)	(x>>HALFLONG)
919 
920 /*
921  * hashing has the effect of
922  * arranging line in 7-bit bytes and then
923  * summing 1-s complement in 16-bit hunks
924  */
925 static int
926 readhash(FILE *f)
927 {
928 	unsigned int shift;
929 	int t, space;
930 	long sum;
931 
932 	sum = 1;
933 	space = 0;
934 	if (!bflag && !wflag) {
935 		if (iflag)
936 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
937 				if (t == -1)
938 					return (0);
939 				sum += (long)chrtran[t] << (shift
940 #ifdef POW2
941 				    &= HALFLONG - 1);
942 #else
943 				    %= HALFLONG);
944 #endif
945 			}
946 		else
947 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
948 				if (t == -1)
949 					return (0);
950 				sum += (long)t << (shift
951 #ifdef POW2
952 				    &= HALFLONG - 1);
953 #else
954 				    %= HALFLONG);
955 #endif
956 			}
957 	} else {
958 		for (shift = 0;;) {
959 			switch (t = getc(f)) {
960 			case -1:
961 				return (0);
962 			case '\t':
963 			case ' ':
964 				space++;
965 				continue;
966 			default:
967 				if (space && !wflag) {
968 					shift += 7;
969 					space = 0;
970 				}
971 				sum += (long)chrtran[t] << (shift
972 #ifdef POW2
973 				    &= HALFLONG - 1);
974 #else
975 				    %= HALFLONG);
976 #endif
977 				shift += 7;
978 				continue;
979 			case '\n':
980 				break;
981 			}
982 			break;
983 		}
984 	}
985 	sum = low(sum) + high(sum);
986 	return ((short) low(sum) + (short) high(sum));
987 }
988 
989 static int
990 asciifile(FILE *f)
991 {
992 	char buf[BUFSIZ], *cp;
993 	int cnt;
994 
995 	if (aflag)
996 		return (1);
997 
998 	fseek(f, 0L, SEEK_SET);
999 	cnt = fread(buf, 1, BUFSIZ, f);
1000 	cp = buf;
1001 	while (--cnt >= 0)
1002 		if (*cp++ & 0200)
1003 			return (0);
1004 	return (1);
1005 }
1006 
1007 /* dump accumulated "context" diff changes */
1008 static void
1009 dump_context_vec(void)
1010 {
1011 	struct context_vec *cvp = context_vec_start;
1012 	int lowa, upb, lowc, upd, do_output;
1013 	int a, b, c, d;
1014 	char ch;
1015 
1016 	if (context_vec_start > context_vec_ptr)
1017 		return;
1018 
1019 	b = d = 0;		/* gcc */
1020 	lowa = max(1, cvp->a - context);
1021 	upb = min(len[0], context_vec_ptr->b + context);
1022 	lowc = max(1, cvp->c - context);
1023 	upd = min(len[1], context_vec_ptr->d + context);
1024 
1025 	printf("***************\n*** ");
1026 	range(lowa, upb, ",");
1027 	printf(" ****\n");
1028 
1029 	/*
1030 	 * output changes to the "old" file.  The first loop suppresses
1031 	 * output if there were no changes to the "old" file (we'll see
1032 	 * the "old" lines as context in the "new" list).
1033 	 */
1034 	do_output = 0;
1035 	for (; cvp <= context_vec_ptr; cvp++)
1036 		if (cvp->a <= cvp->b) {
1037 			cvp = context_vec_start;
1038 			do_output++;
1039 			break;
1040 		}
1041 	if (do_output) {
1042 		while (cvp <= context_vec_ptr) {
1043 			a = cvp->a;
1044 			b = cvp->b;
1045 			c = cvp->c;
1046 			d = cvp->d;
1047 
1048 			if (a <= b && c <= d)
1049 				ch = 'c';
1050 			else
1051 				ch = (a <= b) ? 'd' : 'a';
1052 
1053 			if (ch == 'a')
1054 				fetch(ixold, lowa, b, input[0], "  ", 0);
1055 			else {
1056 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1057 				fetch(ixold, a, b, input[0],
1058 				    ch == 'c' ? "! " : "- ", 0);
1059 			}
1060 			lowa = b + 1;
1061 			cvp++;
1062 		}
1063 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1064 	}
1065 	/* output changes to the "new" file */
1066 	printf("--- ");
1067 	range(lowc, upd, ",");
1068 	printf(" ----\n");
1069 
1070 	do_output = 0;
1071 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1072 		if (cvp->c <= cvp->d) {
1073 			cvp = context_vec_start;
1074 			do_output++;
1075 			break;
1076 		}
1077 	if (do_output) {
1078 		while (cvp <= context_vec_ptr) {
1079 			a = cvp->a;
1080 			b = cvp->b;
1081 			c = cvp->c;
1082 			d = cvp->d;
1083 
1084 			if (a <= b && c <= d)
1085 				ch = 'c';
1086 			else
1087 				ch = (a <= b) ? 'd' : 'a';
1088 
1089 			if (ch == 'd')
1090 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1091 			else {
1092 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1093 				fetch(ixnew, c, d, input[1],
1094 				    ch == 'c' ? "! " : "+ ", 0);
1095 			}
1096 			lowc = d + 1;
1097 			cvp++;
1098 		}
1099 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1100 	}
1101 	context_vec_ptr = context_vec_start - 1;
1102 }
1103 
1104 /* dump accumulated "unified" diff changes */
1105 static void
1106 dump_unified_vec(void)
1107 {
1108 	struct context_vec *cvp = context_vec_start;
1109 	int lowa, upb, lowc, upd;
1110 	int a, b, c, d;
1111 	char ch;
1112 
1113 	if (context_vec_start > context_vec_ptr)
1114 		return;
1115 
1116 	b = d = 0;		/* gcc */
1117 	lowa = max(1, cvp->a - context);
1118 	upb = min(len[0], context_vec_ptr->b + context);
1119 	lowc = max(1, cvp->c - context);
1120 	upd = min(len[1], context_vec_ptr->d + context);
1121 
1122 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1123 	    lowc, upd - lowc + 1);
1124 
1125 	/*
1126 	 * Output changes in "unified" diff format--the old and new lines
1127 	 * are printed together.
1128 	 */
1129 	for (; cvp <= context_vec_ptr; cvp++) {
1130 		a = cvp->a;
1131 		b = cvp->b;
1132 		c = cvp->c;
1133 		d = cvp->d;
1134 
1135 		/*
1136 		 * c: both new and old changes
1137 		 * d: only changes in the old file
1138 		 * a: only changes in the new file
1139 		 */
1140 		if (a <= b && c <= d)
1141 			ch = 'c';
1142 		else
1143 			ch = (a <= b) ? 'd' : 'a';
1144 
1145 		switch (ch) {
1146 		case 'c':
1147 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1148 			fetch(ixold, a, b, input[0], "-", 0);
1149 			fetch(ixnew, c, d, input[1], "+", 0);
1150 			break;
1151 		case 'd':
1152 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1153 			fetch(ixold, a, b, input[0], "-", 0);
1154 			break;
1155 		case 'a':
1156 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1157 			fetch(ixnew, c, d, input[1], "+", 0);
1158 			break;
1159 		}
1160 		lowa = b + 1;
1161 		lowc = d + 1;
1162 	}
1163 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1164 
1165 	context_vec_ptr = context_vec_start - 1;
1166 }
1167