xref: /openbsd-src/usr.bin/diff/diffreg.c (revision ed544660b30d64bfb310ab2aae53f5f326f4f1f1)
1 /*	$OpenBSD: diffreg.c,v 1.24 2003/07/02 18:54:13 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		error("%s", diffh);
236 	}
237 	chrtran = (iflag ? cup2low : clow2low);
238 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
239 		errorx("can't specify - -");
240 	if (S_ISDIR(stb1.st_mode)) {
241 		file1 = splice(file1, file2);
242 		if (stat(file1, &stb1) < 0)
243 			error("%s", file1);
244 	} else if (strcmp(file1, "-") == 0 ||
245 	    (!S_ISREG(stb1.st_mode) && strcmp(file1, _PATH_DEVNULL) != 0)) {
246 		file1 = copytemp(file1, 1);
247 		if (stat(file1, &stb1) < 0)
248 			error("%s", file1);
249 	}
250 	if (S_ISDIR(stb2.st_mode)) {
251 		file2 = splice(file2, file1);
252 		if (stat(file2, &stb2) < 0)
253 			error("%s", file2);
254 	} else if (strcmp(file2, "-") == 0 ||
255 	    (!S_ISREG(stb2.st_mode) && strcmp(file2, _PATH_DEVNULL) != 0)) {
256 		file2 = copytemp(file2, 2);
257 		if (stat(file2, &stb2) < 0)
258 			error("%s", file2);
259 	}
260 	if ((f1 = fopen(file1, "r")) == NULL)
261 		error("%s", file1);
262 	if ((f2 = fopen(file2, "r")) == NULL)
263 		error("%s", file2);
264 	if ((stb1.st_mode & S_IFMT) != (stb2.st_mode & S_IFMT) ||
265 	    stb1.st_size != stb2.st_size)
266 		goto notsame;
267 	for (;;) {
268 		i = fread(buf1, 1, BUFSIZ, f1);
269 		j = fread(buf2, 1, BUFSIZ, f2);
270 		if (i < 0 || j < 0 || i != j)
271 			goto notsame;
272 		if (i == 0 && j == 0) {
273 			fclose(f1);
274 			fclose(f2);
275 			status = 0;	/* files don't differ */
276 			goto same;
277 		}
278 		for (j = 0; j < i; j++)
279 			if (buf1[j] != buf2[j])
280 				goto notsame;
281 	}
282 notsame:
283 	/*
284 	 * Files certainly differ at this point; set status accordingly
285 	 */
286 	status = 1;
287 	if (!asciifile(f1) || !asciifile(f2)) {
288 		printf("Binary files %s and %s differ\n", file1, file2);
289 		exit(status);
290 	}
291 	prepare(0, f1);
292 	prepare(1, f2);
293 	fclose(f1);
294 	fclose(f2);
295 	prune();
296 	sort(sfile[0], slen[0]);
297 	sort(sfile[1], slen[1]);
298 
299 	member = (int *)file[1];
300 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
301 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
302 
303 	class = (int *)file[0];
304 	unsort(sfile[0], slen[0], class);
305 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
306 
307 	klist = emalloc((slen[0] + 2) * sizeof(int));
308 	clist = emalloc(sizeof(cand));
309 	i = stone(class, slen[0], member, klist);
310 	free(member);
311 	free(class);
312 
313 	J = emalloc((len[0] + 2) * sizeof(int));
314 	unravel(klist[i]);
315 	free(clist);
316 	free(klist);
317 
318 	ixold = emalloc((len[0] + 2) * sizeof(long));
319 	ixnew = emalloc((len[1] + 2) * sizeof(long));
320 	check();
321 	output();
322 	status = anychange;
323 same:
324 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
325 		printf("No differences encountered\n");
326 }
327 
328 char *tempfiles[2];
329 
330 char *
331 copytemp(const char *file, int n)
332 {
333 	char buf[BUFSIZ], *tempdir, *tempfile;
334 	int i, ifd, ofd;
335 
336 	if (n != 1 && n != 2)
337 		return (NULL);
338 
339 	if (strcmp(file, "-") == 0)
340 		ifd = STDIN_FILENO;
341 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
342 		error("%s", file);
343 
344 	if ((tempdir = getenv("TMPDIR")) == NULL)
345 		tempdir = _PATH_TMP;
346 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
347 		error(NULL);
348 	tempfiles[n - 1] = tempfile;
349 
350 	signal(SIGHUP, done);
351 	signal(SIGINT, done);
352 	signal(SIGPIPE, done);
353 	signal(SIGTERM, done);
354 	ofd = mkstemp(tempfile);
355 	if (ofd < 0)
356 		error("%s", tempfile);
357 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
358 		if (write(ofd, buf, i) != i)
359 			error("%s", tempfile);
360 	}
361 	close(ifd);
362 	close(ofd);
363 	return (tempfile);
364 }
365 
366 char *
367 splice(char *dir, char *file)
368 {
369 	char *tail, *buf;
370 	size_t len;
371 
372 	if (!strcmp(file, "-"))
373 		errorx("can't specify - with other arg directory");
374 	tail = strrchr(file, '/');
375 	if (tail == NULL)
376 		tail = file;
377 	else
378 		tail++;
379 	len = strlen(dir) + 1 + strlen(tail) + 1;
380 	buf = emalloc(len);
381 	snprintf(buf, len, "%s/%s", dir, tail);
382 	return (buf);
383 }
384 
385 static void
386 prepare(int i, FILE *fd)
387 {
388 	struct line *p;
389 	int j, h;
390 
391 	fseek(fd, 0L, SEEK_SET);
392 	p = emalloc(3 * sizeof(struct line));
393 	for (j = 0; (h = readhash(fd));) {
394 		p = erealloc(p, (++j + 3) * sizeof(struct line));
395 		p[j].value = h;
396 	}
397 	len[i] = j;
398 	file[i] = p;
399 }
400 
401 static void
402 prune(void)
403 {
404 	int i, j;
405 
406 	for (pref = 0; pref < len[0] && pref < len[1] &&
407 	    file[0][pref + 1].value == file[1][pref + 1].value;
408 	    pref++)
409 		;
410 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
411 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
412 	    suff++)
413 		;
414 	for (j = 0; j < 2; j++) {
415 		sfile[j] = file[j] + pref;
416 		slen[j] = len[j] - pref - suff;
417 		for (i = 0; i <= slen[j]; i++)
418 			sfile[j][i].serial = i;
419 	}
420 }
421 
422 static void
423 equiv(struct line *a, int n, struct line *b, int m, int *c)
424 {
425 	int i, j;
426 
427 	i = j = 1;
428 	while (i <= n && j <= m) {
429 		if (a[i].value < b[j].value)
430 			a[i++].value = 0;
431 		else if (a[i].value == b[j].value)
432 			a[i++].value = j;
433 		else
434 			j++;
435 	}
436 	while (i <= n)
437 		a[i++].value = 0;
438 	b[m + 1].value = 0;
439 	j = 0;
440 	while (++j <= m) {
441 		c[j] = -b[j].serial;
442 		while (b[j + 1].value == b[j].value) {
443 			j++;
444 			c[j] = b[j].serial;
445 		}
446 	}
447 	c[j] = -1;
448 }
449 
450 static int
451 stone(int *a, int n, int *b, int *c)
452 {
453 	int i, k, y, j, l;
454 	int oldc, tc, oldl;
455 
456 	k = 0;
457 	c[0] = newcand(0, 0, 0);
458 	for (i = 1; i <= n; i++) {
459 		j = a[i];
460 		if (j == 0)
461 			continue;
462 		y = -b[j];
463 		oldl = 0;
464 		oldc = c[0];
465 		do {
466 			if (y <= clist[oldc].y)
467 				continue;
468 			l = search(c, k, y);
469 			if (l != oldl + 1)
470 				oldc = c[l - 1];
471 			if (l <= k) {
472 				if (clist[c[l]].y <= y)
473 					continue;
474 				tc = c[l];
475 				c[l] = newcand(i, y, oldc);
476 				oldc = tc;
477 				oldl = l;
478 			} else {
479 				c[l] = newcand(i, y, oldc);
480 				k++;
481 				break;
482 			}
483 		} while ((y = b[++j]) > 0);
484 	}
485 	return (k);
486 }
487 
488 static int
489 newcand(int x, int y, int pred)
490 {
491 	struct cand *q;
492 
493 	clist = erealloc(clist, ++clen * sizeof(cand));
494 	q = clist + clen - 1;
495 	q->x = x;
496 	q->y = y;
497 	q->pred = pred;
498 	return (clen - 1);
499 }
500 
501 static int
502 search(int *c, int k, int y)
503 {
504 	int i, j, l, t;
505 
506 	if (clist[c[k]].y < y)	/* quick look for typical case */
507 		return (k + 1);
508 	i = 0;
509 	j = k + 1;
510 	while (1) {
511 		l = i + j;
512 		if ((l >>= 1) <= i)
513 			break;
514 		t = clist[c[l]].y;
515 		if (t > y)
516 			j = l;
517 		else if (t < y)
518 			i = l;
519 		else
520 			return (l);
521 	}
522 	return (l + 1);
523 }
524 
525 static void
526 unravel(int p)
527 {
528 	struct cand *q;
529 	int i;
530 
531 	for (i = 0; i <= len[0]; i++)
532 		J[i] = i <= pref ? i :
533 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
534 	for (q = clist + p; q->y != 0; q = clist + q->pred)
535 		J[q->x + pref] = q->y + pref;
536 }
537 
538 /*
539  * Check does double duty:
540  *  1.	ferret out any fortuitous correspondences due
541  *	to confounding by hashing (which result in "jackpot")
542  *  2.  collect random access indexes to the two files
543  */
544 static void
545 check(void)
546 {
547 	int i, j, jackpot, c, d;
548 	long ctold, ctnew;
549 
550 	if ((input[0] = fopen(file1, "r")) == NULL)
551 		error("%s", file1);
552 	if ((input[1] = fopen(file2, "r")) == NULL)
553 		error("%s", file2);
554 	j = 1;
555 	ixold[0] = ixnew[0] = 0;
556 	jackpot = 0;
557 	ctold = ctnew = 0;
558 	for (i = 1; i <= len[0]; i++) {
559 		if (J[i] == 0) {
560 			ixold[i] = ctold += skipline(0);
561 			continue;
562 		}
563 		while (j < J[i]) {
564 			ixnew[j] = ctnew += skipline(1);
565 			j++;
566 		}
567 		if (bflag || wflag || iflag) {
568 			for (;;) {
569 				c = getc(input[0]);
570 				d = getc(input[1]);
571 				ctold++;
572 				ctnew++;
573 				if (bflag && isspace(c) && isspace(d)) {
574 					do {
575 						if (c == '\n')
576 							break;
577 						ctold++;
578 					} while (isspace(c = getc(input[0])));
579 					do {
580 						if (d == '\n')
581 							break;
582 						ctnew++;
583 					} while (isspace(d = getc(input[1])));
584 				} else if (wflag) {
585 					while (isspace(c) && c != '\n') {
586 						c = getc(input[0]);
587 						ctold++;
588 					}
589 					while (isspace(d) && d != '\n') {
590 						d = getc(input[1]);
591 						ctnew++;
592 					}
593 				}
594 				if (chrtran[c] != chrtran[d]) {
595 					jackpot++;
596 					J[i] = 0;
597 					if (c != '\n')
598 						ctold += skipline(0);
599 					if (d != '\n')
600 						ctnew += skipline(1);
601 					break;
602 				}
603 				if (c == '\n')
604 					break;
605 			}
606 		} else {
607 			for (;;) {
608 				ctold++;
609 				ctnew++;
610 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
611 					/* jackpot++; */
612 					J[i] = 0;
613 					if (c != '\n')
614 						ctold += skipline(0);
615 					if (d != '\n')
616 						ctnew += skipline(1);
617 					break;
618 				}
619 				if (c == '\n')
620 					break;
621 			}
622 		}
623 		ixold[i] = ctold;
624 		ixnew[j] = ctnew;
625 		j++;
626 	}
627 	for (; j <= len[1]; j++) {
628 		ixnew[j] = ctnew += skipline(1);
629 	}
630 	fclose(input[0]);
631 	fclose(input[1]);
632 	/*
633 	 * if (jackpot)
634 	 *	fprintf(stderr, "jackpot\n");
635 	 */
636 }
637 
638 /* shellsort CACM #201 */
639 static void
640 sort(struct line *a, int n)
641 {
642 	struct line *ai, *aim, w;
643 	int j, m = 0, k;
644 
645 	if (n == 0)
646 		return;
647 	for (j = 1; j <= n; j *= 2)
648 		m = 2 * j - 1;
649 	for (m /= 2; m != 0; m /= 2) {
650 		k = n - m;
651 		for (j = 1; j <= k; j++) {
652 			for (ai = &a[j]; ai > a; ai -= m) {
653 				aim = &ai[m];
654 				if (aim < ai)
655 					break;	/* wraparound */
656 				if (aim->value > ai[0].value ||
657 				    (aim->value == ai[0].value &&
658 					aim->serial > ai[0].serial))
659 					break;
660 				w.value = ai[0].value;
661 				ai[0].value = aim->value;
662 				aim->value = w.value;
663 				w.serial = ai[0].serial;
664 				ai[0].serial = aim->serial;
665 				aim->serial = w.serial;
666 			}
667 		}
668 	}
669 }
670 
671 static void
672 unsort(struct line *f, int l, int *b)
673 {
674 	int *a, i;
675 
676 	a = emalloc((l + 1) * sizeof(int));
677 	for (i = 1; i <= l; i++)
678 		a[f[i].serial] = f[i].value;
679 	for (i = 1; i <= l; i++)
680 		b[i] = a[i];
681 	free(a);
682 }
683 
684 static int
685 skipline(int f)
686 {
687 	int i, c;
688 
689 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
690 		if (c < 0)
691 			return (i);
692 	return (i);
693 }
694 
695 static void
696 output(void)
697 {
698 	int m, i0, i1, j0, j1;
699 
700 	input[0] = fopen(file1, "r");
701 	input[1] = fopen(file2, "r");
702 	m = len[0];
703 	J[0] = 0;
704 	J[m + 1] = len[1] + 1;
705 	if (opt != D_EDIT) {
706 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
707 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
708 				i0++;
709 			j0 = J[i0 - 1] + 1;
710 			i1 = i0 - 1;
711 			while (i1 < m && J[i1 + 1] == 0)
712 				i1++;
713 			j1 = J[i1 + 1] - 1;
714 			J[i1] = j1;
715 			change(i0, i1, j0, j1);
716 		}
717 	} else {
718 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
719 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
720 				i0--;
721 			j0 = J[i0 + 1] - 1;
722 			i1 = i0 + 1;
723 			while (i1 > 1 && J[i1 - 1] == 0)
724 				i1--;
725 			j1 = J[i1 - 1] + 1;
726 			J[i1] = j1;
727 			change(i1, i0, j1, j0);
728 		}
729 	}
730 	if (m == 0)
731 		change(1, 0, 1, len[1]);
732 	if (opt == D_IFDEF) {
733 		for (;;) {
734 #define	c i0
735 			c = getc(input[0]);
736 			if (c < 0)
737 				return;
738 			putchar(c);
739 		}
740 #undef c
741 	}
742 	if (anychange != 0) {
743 		if (opt == D_CONTEXT)
744 			dump_context_vec();
745 		else if (opt == D_UNIFIED)
746 			dump_unified_vec();
747 	}
748 }
749 
750 /*
751  * The following struct is used to record change information when
752  * doing a "context" diff.  (see routine "change" to understand the
753  * highly mneumonic field names)
754  */
755 struct context_vec {
756 	int a;			/* start line in old file */
757 	int b;			/* end line in old file */
758 	int c;			/* start line in new file */
759 	int d;			/* end line in new file */
760 };
761 
762 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
763 
764 #define	MAX_CONTEXT	128
765 
766 /*
767  * indicate that there is a difference between lines a and b of the from file
768  * to get to lines c to d of the to file. If a is greater then b then there
769  * are no lines in the from file involved and this means that there were
770  * lines appended (beginning at b). If c is greater than d then there are
771  * lines missing from the to file.
772  */
773 static void
774 change(int a, int b, int c, int d)
775 {
776 	struct stat stbuf;
777 
778 	if (opt != D_IFDEF && a > b && c > d)
779 		return;
780 	if (anychange == 0) {
781 		anychange = 1;
782 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
783 			stat(file1, &stbuf);
784 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
785 			   file1, ctime(&stbuf.st_mtime));
786 			stat(file2, &stbuf);
787 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
788 			    file2, ctime(&stbuf.st_mtime));
789 			context_vec_start = emalloc(MAX_CONTEXT *
790 			    sizeof(struct context_vec));
791 			context_vec_end = context_vec_start + MAX_CONTEXT;
792 			context_vec_ptr = context_vec_start - 1;
793 		}
794 	}
795 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
796 		/*
797 		 * If this new change is within 'context' lines of
798 		 * the previous change, just add it to the change
799 		 * record.  If the record is full or if this
800 		 * change is more than 'context' lines from the previous
801 		 * change, dump the record, reset it & add the new change.
802 		 */
803 		if (context_vec_ptr >= context_vec_end ||
804 		    (context_vec_ptr >= context_vec_start &&
805 		    a > (context_vec_ptr->b + 2 * context) &&
806 		    c > (context_vec_ptr->d + 2 * context))) {
807 			if (opt == D_CONTEXT)
808 				dump_context_vec();
809 			else
810 				dump_unified_vec();
811 		}
812 		context_vec_ptr++;
813 		context_vec_ptr->a = a;
814 		context_vec_ptr->b = b;
815 		context_vec_ptr->c = c;
816 		context_vec_ptr->d = d;
817 		return;
818 	}
819 	switch (opt) {
820 
821 	case D_NORMAL:
822 	case D_EDIT:
823 		range(a, b, ",");
824 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
825 		if (opt == D_NORMAL)
826 			range(c, d, ",");
827 		putchar('\n');
828 		break;
829 	case D_REVERSE:
830 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
831 		range(a, b, " ");
832 		putchar('\n');
833 		break;
834 	case D_NREVERSE:
835 		if (a > b)
836 			printf("a%d %d\n", b, d - c + 1);
837 		else {
838 			printf("d%d %d\n", a, b - a + 1);
839 			if (!(c > d))
840 				/* add changed lines */
841 				printf("a%d %d\n", b, d - c + 1);
842 		}
843 		break;
844 	}
845 	if (opt == D_NORMAL || opt == D_IFDEF) {
846 		fetch(ixold, a, b, input[0], "< ", 1);
847 		if (a <= b && c <= d && opt == D_NORMAL)
848 			prints("---\n");
849 	}
850 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
851 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
852 		prints(".\n");
853 	if (inifdef) {
854 		fprintf(stdout, "#endif /* %s */\n", ifdefname);
855 		inifdef = 0;
856 	}
857 }
858 
859 static void
860 range(int a, int b, char *separator)
861 {
862 	printf("%d", a > b ? b : a);
863 	if (a < b)
864 		printf("%s%d", separator, b);
865 }
866 
867 static void
868 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
869 {
870 	int i, j, c, col, nc;
871 
872 	/*
873 	 * When doing #ifdef's, copy down to current line
874 	 * if this is the first file, so that stuff makes it to output.
875 	 */
876 	if (opt == D_IFDEF && oldfile) {
877 		long curpos = ftell(lb);
878 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
879 		nc = f[a > b ? b : a - 1] - curpos;
880 		for (i = 0; i < nc; i++)
881 			putchar(getc(lb));
882 	}
883 	if (a > b)
884 		return;
885 	if (opt == D_IFDEF) {
886 		if (inifdef) {
887 			fprintf(stdout, "#else /* %s%s */\n",
888 			    oldfile == 1 ? "!" : "", ifdefname);
889 		} else {
890 			if (oldfile)
891 				fprintf(stdout, "#ifndef %s\n", ifdefname);
892 			else
893 				fprintf(stdout, "#ifdef %s\n", ifdefname);
894 		}
895 		inifdef = 1 + oldfile;
896 	}
897 	for (i = a; i <= b; i++) {
898 		fseek(lb, f[i - 1], SEEK_SET);
899 		nc = f[i] - f[i - 1];
900 		if (opt != D_IFDEF)
901 			prints(s);
902 		col = 0;
903 		for (j = 0; j < nc; j++) {
904 			c = getc(lb);
905 			if (c == '\t' && tflag)
906 				do
907 					putchar(' ');
908 				while (++col & 7);
909 			else {
910 				putchar(c);
911 				col++;
912 			}
913 		}
914 	}
915 }
916 
917 #define POW2			/* define only if HALFLONG is 2**n */
918 #define HALFLONG 16
919 #define low(x)	(x&((1L<<HALFLONG)-1))
920 #define high(x)	(x>>HALFLONG)
921 
922 /*
923  * hashing has the effect of
924  * arranging line in 7-bit bytes and then
925  * summing 1-s complement in 16-bit hunks
926  */
927 static int
928 readhash(FILE *f)
929 {
930 	unsigned int shift;
931 	int t, space;
932 	long sum;
933 
934 	sum = 1;
935 	space = 0;
936 	if (!bflag && !wflag) {
937 		if (iflag)
938 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
939 				if (t == -1)
940 					return (0);
941 				sum += (long)chrtran[t] << (shift
942 #ifdef POW2
943 				    &= HALFLONG - 1);
944 #else
945 				    %= HALFLONG);
946 #endif
947 			}
948 		else
949 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
950 				if (t == -1)
951 					return (0);
952 				sum += (long)t << (shift
953 #ifdef POW2
954 				    &= HALFLONG - 1);
955 #else
956 				    %= HALFLONG);
957 #endif
958 			}
959 	} else {
960 		for (shift = 0;;) {
961 			switch (t = getc(f)) {
962 			case -1:
963 				return (0);
964 			case '\t':
965 			case ' ':
966 				space++;
967 				continue;
968 			default:
969 				if (space && !wflag) {
970 					shift += 7;
971 					space = 0;
972 				}
973 				sum += (long)chrtran[t] << (shift
974 #ifdef POW2
975 				    &= HALFLONG - 1);
976 #else
977 				    %= HALFLONG);
978 #endif
979 				shift += 7;
980 				continue;
981 			case '\n':
982 				break;
983 			}
984 			break;
985 		}
986 	}
987 	sum = low(sum) + high(sum);
988 	return ((short) low(sum) + (short) high(sum));
989 }
990 
991 static int
992 asciifile(FILE *f)
993 {
994 	char buf[BUFSIZ], *cp;
995 	int cnt;
996 
997 	if (aflag)
998 		return (1);
999 
1000 	fseek(f, 0L, SEEK_SET);
1001 	cnt = fread(buf, 1, BUFSIZ, f);
1002 	cp = buf;
1003 	while (--cnt >= 0)
1004 		if (*cp++ & 0200)
1005 			return (0);
1006 	return (1);
1007 }
1008 
1009 /* dump accumulated "context" diff changes */
1010 static void
1011 dump_context_vec(void)
1012 {
1013 	struct context_vec *cvp = context_vec_start;
1014 	int lowa, upb, lowc, upd, do_output;
1015 	int a, b, c, d;
1016 	char ch;
1017 
1018 	if (context_vec_start > context_vec_ptr)
1019 		return;
1020 
1021 	b = d = 0;		/* gcc */
1022 	lowa = max(1, cvp->a - context);
1023 	upb = min(len[0], context_vec_ptr->b + context);
1024 	lowc = max(1, cvp->c - context);
1025 	upd = min(len[1], context_vec_ptr->d + context);
1026 
1027 	printf("***************\n*** ");
1028 	range(lowa, upb, ",");
1029 	printf(" ****\n");
1030 
1031 	/*
1032 	 * output changes to the "old" file.  The first loop suppresses
1033 	 * output if there were no changes to the "old" file (we'll see
1034 	 * the "old" lines as context in the "new" list).
1035 	 */
1036 	do_output = 0;
1037 	for (; cvp <= context_vec_ptr; cvp++)
1038 		if (cvp->a <= cvp->b) {
1039 			cvp = context_vec_start;
1040 			do_output++;
1041 			break;
1042 		}
1043 	if (do_output) {
1044 		while (cvp <= context_vec_ptr) {
1045 			a = cvp->a;
1046 			b = cvp->b;
1047 			c = cvp->c;
1048 			d = cvp->d;
1049 
1050 			if (a <= b && c <= d)
1051 				ch = 'c';
1052 			else
1053 				ch = (a <= b) ? 'd' : 'a';
1054 
1055 			if (ch == 'a')
1056 				fetch(ixold, lowa, b, input[0], "  ", 0);
1057 			else {
1058 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1059 				fetch(ixold, a, b, input[0],
1060 				    ch == 'c' ? "! " : "- ", 0);
1061 			}
1062 			lowa = b + 1;
1063 			cvp++;
1064 		}
1065 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1066 	}
1067 	/* output changes to the "new" file */
1068 	printf("--- ");
1069 	range(lowc, upd, ",");
1070 	printf(" ----\n");
1071 
1072 	do_output = 0;
1073 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1074 		if (cvp->c <= cvp->d) {
1075 			cvp = context_vec_start;
1076 			do_output++;
1077 			break;
1078 		}
1079 	if (do_output) {
1080 		while (cvp <= context_vec_ptr) {
1081 			a = cvp->a;
1082 			b = cvp->b;
1083 			c = cvp->c;
1084 			d = cvp->d;
1085 
1086 			if (a <= b && c <= d)
1087 				ch = 'c';
1088 			else
1089 				ch = (a <= b) ? 'd' : 'a';
1090 
1091 			if (ch == 'd')
1092 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1093 			else {
1094 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1095 				fetch(ixnew, c, d, input[1],
1096 				    ch == 'c' ? "! " : "+ ", 0);
1097 			}
1098 			lowc = d + 1;
1099 			cvp++;
1100 		}
1101 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1102 	}
1103 	context_vec_ptr = context_vec_start - 1;
1104 }
1105 
1106 /* dump accumulated "unified" diff changes */
1107 static void
1108 dump_unified_vec(void)
1109 {
1110 	struct context_vec *cvp = context_vec_start;
1111 	int lowa, upb, lowc, upd;
1112 	int a, b, c, d;
1113 	char ch;
1114 
1115 	if (context_vec_start > context_vec_ptr)
1116 		return;
1117 
1118 	b = d = 0;		/* gcc */
1119 	lowa = max(1, cvp->a - context);
1120 	upb = min(len[0], context_vec_ptr->b + context);
1121 	lowc = max(1, cvp->c - context);
1122 	upd = min(len[1], context_vec_ptr->d + context);
1123 
1124 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1125 	    lowc, upd - lowc + 1);
1126 
1127 	/*
1128 	 * Output changes in "unified" diff format--the old and new lines
1129 	 * are printed together.
1130 	 */
1131 	for (; cvp <= context_vec_ptr; cvp++) {
1132 		a = cvp->a;
1133 		b = cvp->b;
1134 		c = cvp->c;
1135 		d = cvp->d;
1136 
1137 		/*
1138 		 * c: both new and old changes
1139 		 * d: only changes in the old file
1140 		 * a: only changes in the new file
1141 		 */
1142 		if (a <= b && c <= d)
1143 			ch = 'c';
1144 		else
1145 			ch = (a <= b) ? 'd' : 'a';
1146 
1147 		switch (ch) {
1148 		case 'c':
1149 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1150 			fetch(ixold, a, b, input[0], "-", 0);
1151 			fetch(ixnew, c, d, input[1], "+", 0);
1152 			break;
1153 		case 'd':
1154 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1155 			fetch(ixold, a, b, input[0], "-", 0);
1156 			break;
1157 		case 'a':
1158 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1159 			fetch(ixnew, c, d, input[1], "+", 0);
1160 			break;
1161 		}
1162 		lowa = b + 1;
1163 		lowc = d + 1;
1164 	}
1165 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1166 
1167 	context_vec_ptr = context_vec_start - 1;
1168 }
1169