xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 66e5764e648f1cef39cdf0423fa96337157ae2bb)
1 /*	$OpenBSD: diffreg.c,v 1.21 2003/06/26 18:19:29 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		error("%s", diffh);
236 	}
237 	chrtran = (iflag ? cup2low : clow2low);
238 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
239 		errorx("can't specify - -");
240 	if (S_ISDIR(stb1.st_mode)) {
241 		file1 = splice(file1, file2);
242 		if (stat(file1, &stb1) < 0)
243 			error("%s", file1);
244 	} else if (!S_ISREG(stb1.st_mode) || strcmp(file1, "-") == 0) {
245 		file1 = copytemp(file1, 1);
246 		if (stat(file1, &stb1) < 0)
247 			error("%s", file1);
248 	}
249 	if (S_ISDIR(stb2.st_mode)) {
250 		file2 = splice(file2, file1);
251 		if (stat(file2, &stb2) < 0)
252 			error("%s", file2);
253 	} else if (!S_ISREG(stb2.st_mode) || strcmp(file2, "-") == 0) {
254 		file2 = copytemp(file2, 2);
255 		if (stat(file2, &stb2) < 0)
256 			error("%s", file2);
257 	}
258 	if ((f1 = fopen(file1, "r")) == NULL)
259 		error("%s", file1);
260 	if ((f2 = fopen(file2, "r")) == NULL)
261 		error("%s", file2);
262 	if (S_ISREG(stb1.st_mode) && S_ISREG(stb2.st_mode) &&
263 	    stb1.st_size != stb2.st_size)
264 		goto notsame;
265 	for (;;) {
266 		i = fread(buf1, 1, BUFSIZ, f1);
267 		j = fread(buf2, 1, BUFSIZ, f2);
268 		if (i < 0 || j < 0 || i != j)
269 			goto notsame;
270 		if (i == 0 && j == 0) {
271 			fclose(f1);
272 			fclose(f2);
273 			status = 0;	/* files don't differ */
274 			goto same;
275 		}
276 		for (j = 0; j < i; j++)
277 			if (buf1[j] != buf2[j])
278 				goto notsame;
279 	}
280 notsame:
281 	/*
282 	 * Files certainly differ at this point; set status accordingly
283 	 */
284 	status = 1;
285 	if (!asciifile(f1) || !asciifile(f2)) {
286 		printf("Binary files %s and %s differ\n", file1, file2);
287 		exit(status);
288 	}
289 	prepare(0, f1);
290 	prepare(1, f2);
291 	fclose(f1);
292 	fclose(f2);
293 	prune();
294 	sort(sfile[0], slen[0]);
295 	sort(sfile[1], slen[1]);
296 
297 	member = (int *)file[1];
298 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
299 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
300 
301 	class = (int *)file[0];
302 	unsort(sfile[0], slen[0], class);
303 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
304 
305 	klist = emalloc((slen[0] + 2) * sizeof(int));
306 	clist = emalloc(sizeof(cand));
307 	i = stone(class, slen[0], member, klist);
308 	free(member);
309 	free(class);
310 
311 	J = emalloc((len[0] + 2) * sizeof(int));
312 	unravel(klist[i]);
313 	free(clist);
314 	free(klist);
315 
316 	ixold = emalloc((len[0] + 2) * sizeof(long));
317 	ixnew = emalloc((len[1] + 2) * sizeof(long));
318 	check();
319 	output();
320 	status = anychange;
321 same:
322 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
323 		printf("No differences encountered\n");
324 }
325 
326 char *tempfiles[2];
327 
328 char *
329 copytemp(const char *file, int n)
330 {
331 	char buf[BUFSIZ], *tempdir, *tempfile;
332 	int i, ifd, ofd;
333 
334 	if (n != 1 && n != 2)
335 		return (NULL);
336 
337 	if (strcmp(file, "-") == 0)
338 		ifd = STDIN_FILENO;
339 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
340 		error("%s", file);
341 
342 	if ((tempdir = getenv("TMPDIR")) == NULL)
343 		tempdir = _PATH_TMP;
344 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
345 		error(NULL);
346 	tempfiles[n - 1] = tempfile;
347 
348 	signal(SIGHUP, done);
349 	signal(SIGINT, done);
350 	signal(SIGPIPE, done);
351 	signal(SIGTERM, done);
352 	ofd = mkstemp(tempfile);
353 	if (ofd < 0)
354 		error("%s", tempfile);
355 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
356 		if (write(ofd, buf, i) != i)
357 			error("%s", tempfile);
358 	}
359 	close(ifd);
360 	close(ofd);
361 	return (tempfile);
362 }
363 
364 char *
365 splice(char *dir, char *file)
366 {
367 	char *tail, *buf;
368 	size_t len;
369 
370 	if (!strcmp(file, "-"))
371 		errorx("can't specify - with other arg directory");
372 	tail = strrchr(file, '/');
373 	if (tail == NULL)
374 		tail = file;
375 	else
376 		tail++;
377 	len = strlen(dir) + 1 + strlen(tail) + 1;
378 	buf = emalloc(len);
379 	snprintf(buf, len, "%s/%s", dir, tail);
380 	return (buf);
381 }
382 
383 static void
384 prepare(int i, FILE *fd)
385 {
386 	struct line *p;
387 	int j, h;
388 
389 	fseek(fd, 0L, SEEK_SET);
390 	p = emalloc(3 * sizeof(struct line));
391 	for (j = 0; (h = readhash(fd));) {
392 		p = erealloc(p, (++j + 3) * sizeof(struct line));
393 		p[j].value = h;
394 	}
395 	len[i] = j;
396 	file[i] = p;
397 }
398 
399 static void
400 prune(void)
401 {
402 	int i, j;
403 
404 	for (pref = 0; pref < len[0] && pref < len[1] &&
405 	    file[0][pref + 1].value == file[1][pref + 1].value;
406 	    pref++)
407 		;
408 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
409 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
410 	    suff++)
411 		;
412 	for (j = 0; j < 2; j++) {
413 		sfile[j] = file[j] + pref;
414 		slen[j] = len[j] - pref - suff;
415 		for (i = 0; i <= slen[j]; i++)
416 			sfile[j][i].serial = i;
417 	}
418 }
419 
420 static void
421 equiv(struct line *a, int n, struct line *b, int m, int *c)
422 {
423 	int i, j;
424 
425 	i = j = 1;
426 	while (i <= n && j <= m) {
427 		if (a[i].value < b[j].value)
428 			a[i++].value = 0;
429 		else if (a[i].value == b[j].value)
430 			a[i++].value = j;
431 		else
432 			j++;
433 	}
434 	while (i <= n)
435 		a[i++].value = 0;
436 	b[m + 1].value = 0;
437 	j = 0;
438 	while (++j <= m) {
439 		c[j] = -b[j].serial;
440 		while (b[j + 1].value == b[j].value) {
441 			j++;
442 			c[j] = b[j].serial;
443 		}
444 	}
445 	c[j] = -1;
446 }
447 
448 static int
449 stone(int *a, int n, int *b, int *c)
450 {
451 	int i, k, y, j, l;
452 	int oldc, tc, oldl;
453 
454 	k = 0;
455 	c[0] = newcand(0, 0, 0);
456 	for (i = 1; i <= n; i++) {
457 		j = a[i];
458 		if (j == 0)
459 			continue;
460 		y = -b[j];
461 		oldl = 0;
462 		oldc = c[0];
463 		do {
464 			if (y <= clist[oldc].y)
465 				continue;
466 			l = search(c, k, y);
467 			if (l != oldl + 1)
468 				oldc = c[l - 1];
469 			if (l <= k) {
470 				if (clist[c[l]].y <= y)
471 					continue;
472 				tc = c[l];
473 				c[l] = newcand(i, y, oldc);
474 				oldc = tc;
475 				oldl = l;
476 			} else {
477 				c[l] = newcand(i, y, oldc);
478 				k++;
479 				break;
480 			}
481 		} while ((y = b[++j]) > 0);
482 	}
483 	return (k);
484 }
485 
486 static int
487 newcand(int x, int y, int pred)
488 {
489 	struct cand *q;
490 
491 	clist = erealloc(clist, ++clen * sizeof(cand));
492 	q = clist + clen - 1;
493 	q->x = x;
494 	q->y = y;
495 	q->pred = pred;
496 	return (clen - 1);
497 }
498 
499 static int
500 search(int *c, int k, int y)
501 {
502 	int i, j, l, t;
503 
504 	if (clist[c[k]].y < y)	/* quick look for typical case */
505 		return (k + 1);
506 	i = 0;
507 	j = k + 1;
508 	while (1) {
509 		l = i + j;
510 		if ((l >>= 1) <= i)
511 			break;
512 		t = clist[c[l]].y;
513 		if (t > y)
514 			j = l;
515 		else if (t < y)
516 			i = l;
517 		else
518 			return (l);
519 	}
520 	return (l + 1);
521 }
522 
523 static void
524 unravel(int p)
525 {
526 	struct cand *q;
527 	int i;
528 
529 	for (i = 0; i <= len[0]; i++)
530 		J[i] = i <= pref ? i :
531 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
532 	for (q = clist + p; q->y != 0; q = clist + q->pred)
533 		J[q->x + pref] = q->y + pref;
534 }
535 
536 /*
537  * Check does double duty:
538  *  1.	ferret out any fortuitous correspondences due
539  *	to confounding by hashing (which result in "jackpot")
540  *  2.  collect random access indexes to the two files
541  */
542 static void
543 check(void)
544 {
545 	int i, j, jackpot, c, d;
546 	long ctold, ctnew;
547 
548 	if ((input[0] = fopen(file1, "r")) == NULL)
549 		error("%s", file1);
550 	if ((input[1] = fopen(file2, "r")) == NULL)
551 		error("%s", file2);
552 	j = 1;
553 	ixold[0] = ixnew[0] = 0;
554 	jackpot = 0;
555 	ctold = ctnew = 0;
556 	for (i = 1; i <= len[0]; i++) {
557 		if (J[i] == 0) {
558 			ixold[i] = ctold += skipline(0);
559 			continue;
560 		}
561 		while (j < J[i]) {
562 			ixnew[j] = ctnew += skipline(1);
563 			j++;
564 		}
565 		if (bflag || wflag || iflag) {
566 			for (;;) {
567 				c = getc(input[0]);
568 				d = getc(input[1]);
569 				ctold++;
570 				ctnew++;
571 				if (bflag && isspace(c) && isspace(d)) {
572 					do {
573 						if (c == '\n')
574 							break;
575 						ctold++;
576 					} while (isspace(c = getc(input[0])));
577 					do {
578 						if (d == '\n')
579 							break;
580 						ctnew++;
581 					} while (isspace(d = getc(input[1])));
582 				} else if (wflag) {
583 					while (isspace(c) && c != '\n') {
584 						c = getc(input[0]);
585 						ctold++;
586 					}
587 					while (isspace(d) && d != '\n') {
588 						d = getc(input[1]);
589 						ctnew++;
590 					}
591 				}
592 				if (chrtran[c] != chrtran[d]) {
593 					jackpot++;
594 					J[i] = 0;
595 					if (c != '\n')
596 						ctold += skipline(0);
597 					if (d != '\n')
598 						ctnew += skipline(1);
599 					break;
600 				}
601 				if (c == '\n')
602 					break;
603 			}
604 		} else {
605 			for (;;) {
606 				ctold++;
607 				ctnew++;
608 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
609 					/* jackpot++; */
610 					J[i] = 0;
611 					if (c != '\n')
612 						ctold += skipline(0);
613 					if (d != '\n')
614 						ctnew += skipline(1);
615 					break;
616 				}
617 				if (c == '\n')
618 					break;
619 			}
620 		}
621 		ixold[i] = ctold;
622 		ixnew[j] = ctnew;
623 		j++;
624 	}
625 	for (; j <= len[1]; j++) {
626 		ixnew[j] = ctnew += skipline(1);
627 	}
628 	fclose(input[0]);
629 	fclose(input[1]);
630 	/*
631 	 * if (jackpot)
632 	 *	fprintf(stderr, "jackpot\n");
633 	 */
634 }
635 
636 /* shellsort CACM #201 */
637 static void
638 sort(struct line *a, int n)
639 {
640 	struct line *ai, *aim, w;
641 	int j, m = 0, k;
642 
643 	if (n == 0)
644 		return;
645 	for (j = 1; j <= n; j *= 2)
646 		m = 2 * j - 1;
647 	for (m /= 2; m != 0; m /= 2) {
648 		k = n - m;
649 		for (j = 1; j <= k; j++) {
650 			for (ai = &a[j]; ai > a; ai -= m) {
651 				aim = &ai[m];
652 				if (aim < ai)
653 					break;	/* wraparound */
654 				if (aim->value > ai[0].value ||
655 				    (aim->value == ai[0].value &&
656 					aim->serial > ai[0].serial))
657 					break;
658 				w.value = ai[0].value;
659 				ai[0].value = aim->value;
660 				aim->value = w.value;
661 				w.serial = ai[0].serial;
662 				ai[0].serial = aim->serial;
663 				aim->serial = w.serial;
664 			}
665 		}
666 	}
667 }
668 
669 static void
670 unsort(struct line *f, int l, int *b)
671 {
672 	int *a, i;
673 
674 	a = emalloc((l + 1) * sizeof(int));
675 	for (i = 1; i <= l; i++)
676 		a[f[i].serial] = f[i].value;
677 	for (i = 1; i <= l; i++)
678 		b[i] = a[i];
679 	free(a);
680 }
681 
682 static int
683 skipline(int f)
684 {
685 	int i, c;
686 
687 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
688 		if (c < 0)
689 			return (i);
690 	return (i);
691 }
692 
693 static void
694 output(void)
695 {
696 	int m, i0, i1, j0, j1;
697 
698 	input[0] = fopen(file1, "r");
699 	input[1] = fopen(file2, "r");
700 	m = len[0];
701 	J[0] = 0;
702 	J[m + 1] = len[1] + 1;
703 	if (opt != D_EDIT) {
704 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
705 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
706 				i0++;
707 			j0 = J[i0 - 1] + 1;
708 			i1 = i0 - 1;
709 			while (i1 < m && J[i1 + 1] == 0)
710 				i1++;
711 			j1 = J[i1 + 1] - 1;
712 			J[i1] = j1;
713 			change(i0, i1, j0, j1);
714 		}
715 	} else {
716 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
717 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
718 				i0--;
719 			j0 = J[i0 + 1] - 1;
720 			i1 = i0 + 1;
721 			while (i1 > 1 && J[i1 - 1] == 0)
722 				i1--;
723 			j1 = J[i1 - 1] + 1;
724 			J[i1] = j1;
725 			change(i1, i0, j1, j0);
726 		}
727 	}
728 	if (m == 0)
729 		change(1, 0, 1, len[1]);
730 	if (opt == D_IFDEF) {
731 		for (;;) {
732 #define	c i0
733 			c = getc(input[0]);
734 			if (c < 0)
735 				return;
736 			putchar(c);
737 		}
738 #undef c
739 	}
740 	if (anychange != 0) {
741 		if (opt == D_CONTEXT)
742 			dump_context_vec();
743 		else if (opt == D_UNIFIED)
744 			dump_unified_vec();
745 	}
746 }
747 
748 /*
749  * The following struct is used to record change information when
750  * doing a "context" diff.  (see routine "change" to understand the
751  * highly mneumonic field names)
752  */
753 struct context_vec {
754 	int a;			/* start line in old file */
755 	int b;			/* end line in old file */
756 	int c;			/* start line in new file */
757 	int d;			/* end line in new file */
758 };
759 
760 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
761 
762 #define	MAX_CONTEXT	128
763 
764 /*
765  * indicate that there is a difference between lines a and b of the from file
766  * to get to lines c to d of the to file. If a is greater then b then there
767  * are no lines in the from file involved and this means that there were
768  * lines appended (beginning at b). If c is greater than d then there are
769  * lines missing from the to file.
770  */
771 static void
772 change(int a, int b, int c, int d)
773 {
774 	struct stat stbuf;
775 
776 	if (opt != D_IFDEF && a > b && c > d)
777 		return;
778 	if (anychange == 0) {
779 		anychange = 1;
780 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
781 			stat(file1, &stbuf);
782 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
783 			   file1, ctime(&stbuf.st_mtime));
784 			stat(file2, &stbuf);
785 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
786 			    file2, ctime(&stbuf.st_mtime));
787 			context_vec_start = emalloc(MAX_CONTEXT *
788 			    sizeof(struct context_vec));
789 			context_vec_end = context_vec_start + MAX_CONTEXT;
790 			context_vec_ptr = context_vec_start - 1;
791 		}
792 	}
793 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
794 		/*
795 		 * if this new change is within 'context' lines of
796 		 * the previous change, just add it to the change
797 		 * record.  If the record is full or if this
798 		 * change is more than 'context' lines from the previous
799 		 * change, dump the record, reset it & add the new change.
800 		 */
801 		if (context_vec_ptr >= context_vec_end ||
802 		    (context_vec_ptr >= context_vec_start &&
803 		    a > (context_vec_ptr->b + 2 * context) &&
804 		    c > (context_vec_ptr->d + 2 * context))) {
805 			if (opt == D_CONTEXT)
806 				dump_context_vec();
807 			else
808 				dump_unified_vec();
809 		}
810 		context_vec_ptr++;
811 		context_vec_ptr->a = a;
812 		context_vec_ptr->b = b;
813 		context_vec_ptr->c = c;
814 		context_vec_ptr->d = d;
815 		return;
816 	}
817 	switch (opt) {
818 
819 	case D_NORMAL:
820 	case D_EDIT:
821 		range(a, b, ",");
822 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
823 		if (opt == D_NORMAL)
824 			range(c, d, ",");
825 		putchar('\n');
826 		break;
827 	case D_REVERSE:
828 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
829 		range(a, b, " ");
830 		putchar('\n');
831 		break;
832 	case D_NREVERSE:
833 		if (a > b)
834 			printf("a%d %d\n", b, d - c + 1);
835 		else {
836 			printf("d%d %d\n", a, b - a + 1);
837 			if (!(c > d))
838 				/* add changed lines */
839 				printf("a%d %d\n", b, d - c + 1);
840 		}
841 		break;
842 	}
843 	if (opt == D_NORMAL || opt == D_IFDEF) {
844 		fetch(ixold, a, b, input[0], "< ", 1);
845 		if (a <= b && c <= d && opt == D_NORMAL)
846 			prints("---\n");
847 	}
848 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
849 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
850 		prints(".\n");
851 	if (inifdef) {
852 		fprintf(stdout, "#endif /* %s */\n", endifname);
853 		inifdef = 0;
854 	}
855 }
856 
857 static void
858 range(int a, int b, char *separator)
859 {
860 	printf("%d", a > b ? b : a);
861 	if (a < b)
862 		printf("%s%d", separator, b);
863 }
864 
865 static void
866 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
867 {
868 	int oneflag = (*ifdef1 != '\0') != (*ifdef2 != '\0');
869 	int i, j, c, col, nc;
870 
871 	/*
872 	 * When doing #ifdef's, copy down to current line
873 	 * if this is the first file, so that stuff makes it to output.
874 	 */
875 	if (opt == D_IFDEF && oldfile) {
876 		long curpos = ftell(lb);
877 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
878 		nc = f[a > b ? b : a - 1] - curpos;
879 		for (i = 0; i < nc; i++)
880 			putchar(getc(lb));
881 	}
882 	if (a > b)
883 		return;
884 	if (opt == D_IFDEF) {
885 		if (inifdef)
886 			fprintf(stdout, "#else /* %s%s */\n",
887 			    oneflag && oldfile == 1 ? "!" : "", ifdef2);
888 		else {
889 			if (oneflag) {
890 				/* There was only one ifdef given */
891 				endifname = ifdef2;
892 				if (oldfile)
893 					fprintf(stdout, "#ifndef %s\n", endifname);
894 				else
895 					fprintf(stdout, "#ifdef %s\n", endifname);
896 			} else {
897 				endifname = oldfile ? ifdef1 : ifdef2;
898 				fprintf(stdout, "#ifdef %s\n", endifname);
899 			}
900 		}
901 		inifdef = 1 + oldfile;
902 	}
903 	for (i = a; i <= b; i++) {
904 		fseek(lb, f[i - 1], SEEK_SET);
905 		nc = f[i] - f[i - 1];
906 		if (opt != D_IFDEF)
907 			prints(s);
908 		col = 0;
909 		for (j = 0; j < nc; j++) {
910 			c = getc(lb);
911 			if (c == '\t' && tflag)
912 				do
913 					putchar(' ');
914 				while (++col & 7);
915 			else {
916 				putchar(c);
917 				col++;
918 			}
919 		}
920 	}
921 
922 	if (inifdef && !wantelses) {
923 		fprintf(stdout, "#endif /* %s */\n", endifname);
924 		inifdef = 0;
925 	}
926 }
927 
928 #define POW2			/* define only if HALFLONG is 2**n */
929 #define HALFLONG 16
930 #define low(x)	(x&((1L<<HALFLONG)-1))
931 #define high(x)	(x>>HALFLONG)
932 
933 /*
934  * hashing has the effect of
935  * arranging line in 7-bit bytes and then
936  * summing 1-s complement in 16-bit hunks
937  */
938 static int
939 readhash(FILE *f)
940 {
941 	unsigned int shift;
942 	int t, space;
943 	long sum;
944 
945 	sum = 1;
946 	space = 0;
947 	if (!bflag && !wflag) {
948 		if (iflag)
949 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
950 				if (t == -1)
951 					return (0);
952 				sum += (long)chrtran[t] << (shift
953 #ifdef POW2
954 				    &= HALFLONG - 1);
955 #else
956 				    %= HALFLONG);
957 #endif
958 			}
959 		else
960 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
961 				if (t == -1)
962 					return (0);
963 				sum += (long)t << (shift
964 #ifdef POW2
965 				    &= HALFLONG - 1);
966 #else
967 				    %= HALFLONG);
968 #endif
969 			}
970 	} else {
971 		for (shift = 0;;) {
972 			switch (t = getc(f)) {
973 			case -1:
974 				return (0);
975 			case '\t':
976 			case ' ':
977 				space++;
978 				continue;
979 			default:
980 				if (space && !wflag) {
981 					shift += 7;
982 					space = 0;
983 				}
984 				sum += (long)chrtran[t] << (shift
985 #ifdef POW2
986 				    &= HALFLONG - 1);
987 #else
988 				    %= HALFLONG);
989 #endif
990 				shift += 7;
991 				continue;
992 			case '\n':
993 				break;
994 			}
995 			break;
996 		}
997 	}
998 	sum = low(sum) + high(sum);
999 	return ((short) low(sum) + (short) high(sum));
1000 }
1001 
1002 static int
1003 asciifile(FILE *f)
1004 {
1005 	char buf[BUFSIZ], *cp;
1006 	int cnt;
1007 
1008 	fseek(f, 0L, SEEK_SET);
1009 	cnt = fread(buf, 1, BUFSIZ, f);
1010 	cp = buf;
1011 	while (--cnt >= 0)
1012 		if (*cp++ & 0200)
1013 			return (0);
1014 	return (1);
1015 }
1016 
1017 /* dump accumulated "context" diff changes */
1018 static void
1019 dump_context_vec(void)
1020 {
1021 	struct context_vec *cvp = context_vec_start;
1022 	int lowa, upb, lowc, upd, do_output;
1023 	int a, b, c, d;
1024 	char ch;
1025 
1026 	if (cvp > context_vec_ptr)
1027 		return;
1028 
1029 	b = d = 0;		/* gcc */
1030 	lowa = max(1, cvp->a - context);
1031 	upb = min(len[0], context_vec_ptr->b + context);
1032 	lowc = max(1, cvp->c - context);
1033 	upd = min(len[1], context_vec_ptr->d + context);
1034 
1035 	printf("***************\n*** ");
1036 	range(lowa, upb, ",");
1037 	printf(" ****\n");
1038 
1039 	/*
1040 	 * output changes to the "old" file.  The first loop suppresses
1041 	 * output if there were no changes to the "old" file (we'll see
1042 	 * the "old" lines as context in the "new" list).
1043 	 */
1044 	do_output = 0;
1045 	for (; cvp <= context_vec_ptr; cvp++)
1046 		if (cvp->a <= cvp->b) {
1047 			cvp = context_vec_start;
1048 			do_output++;
1049 			break;
1050 		}
1051 	if (do_output) {
1052 		while (cvp <= context_vec_ptr) {
1053 			a = cvp->a;
1054 			b = cvp->b;
1055 			c = cvp->c;
1056 			d = cvp->d;
1057 
1058 			if (a <= b && c <= d)
1059 				ch = 'c';
1060 			else
1061 				ch = (a <= b) ? 'd' : 'a';
1062 
1063 			if (ch == 'a')
1064 				fetch(ixold, lowa, b, input[0], "  ", 0);
1065 			else {
1066 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1067 				fetch(ixold, a, b, input[0],
1068 				    ch == 'c' ? "! " : "- ", 0);
1069 			}
1070 			lowa = b + 1;
1071 			cvp++;
1072 		}
1073 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1074 	}
1075 	/* output changes to the "new" file */
1076 	printf("--- ");
1077 	range(lowc, upd, ",");
1078 	printf(" ----\n");
1079 
1080 	do_output = 0;
1081 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1082 		if (cvp->c <= cvp->d) {
1083 			cvp = context_vec_start;
1084 			do_output++;
1085 			break;
1086 		}
1087 	if (do_output) {
1088 		while (cvp <= context_vec_ptr) {
1089 			a = cvp->a;
1090 			b = cvp->b;
1091 			c = cvp->c;
1092 			d = cvp->d;
1093 
1094 			if (a <= b && c <= d)
1095 				ch = 'c';
1096 			else
1097 				ch = (a <= b) ? 'd' : 'a';
1098 
1099 			if (ch == 'd')
1100 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1101 			else {
1102 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1103 				fetch(ixnew, c, d, input[1],
1104 				    ch == 'c' ? "! " : "+ ", 0);
1105 			}
1106 			lowc = d + 1;
1107 			cvp++;
1108 		}
1109 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1110 	}
1111 	context_vec_ptr = context_vec_start - 1;
1112 }
1113 
1114 /* dump accumulated "unified" diff changes */
1115 static void
1116 dump_unified_vec(void)
1117 {
1118 	struct context_vec *cvp = context_vec_start;
1119 	int lowa, upb, lowc, upd;
1120 	int a, b, c, d;
1121 	char ch;
1122 
1123 	if (cvp > context_vec_ptr)
1124 		return;
1125 
1126 	b = d = 0;		/* gcc */
1127 	lowa = max(1, cvp->a - context);
1128 	upb = min(len[0], context_vec_ptr->b + context);
1129 	lowc = max(1, cvp->c - context);
1130 	upd = min(len[1], context_vec_ptr->d + context);
1131 
1132 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1133 	    lowc, upd - lowc + 1);
1134 
1135 	/*
1136 	 * Output changes in "unified" diff format--the old and new lines
1137 	 * are printed together.
1138 	 */
1139 	for (; cvp <= context_vec_ptr; cvp++) {
1140 		a = cvp->a;
1141 		b = cvp->b;
1142 		c = cvp->c;
1143 		d = cvp->d;
1144 
1145 		/*
1146 		 * c: both new and old changes
1147 		 * d: only changes in the old file
1148 		 * a: only changes in the new file
1149 		 */
1150 		if (a <= b && c <= d)
1151 			ch = 'c';
1152 		else
1153 			ch = (a <= b) ? 'd' : 'a';
1154 
1155 		switch (ch) {
1156 		case 'c':
1157 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1158 			fetch(ixold, a, b, input[0], "-", 0);
1159 			fetch(ixnew, c, d, input[1], "+", 0);
1160 			break;
1161 		case 'd':
1162 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1163 			fetch(ixold, a, b, input[0], "-", 0);
1164 			break;
1165 		case 'a':
1166 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1167 			fetch(ixnew, c, d, input[1], "+", 0);
1168 			break;
1169 		}
1170 		lowa = b + 1;
1171 		lowc = d + 1;
1172 	}
1173 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1174 
1175 	context_vec_ptr = context_vec_start - 1;
1176 }
1177