xref: /openbsd-src/usr.bin/diff/diffreg.c (revision c5b7f86462e55194e79a600d3e99fe008702f530)
1 /*	$OpenBSD: diffreg.c,v 1.25 2003/07/04 02:54:36 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int inifdef;			/* whether or not we are in a #ifdef block */
138 int *class;			/* will be overlaid on file[0] */
139 int *member;			/* will be overlaid on file[1] */
140 int *klist;			/* will be overlaid on file[0] after class */
141 struct cand *clist;		/* merely a free storage pot for candidates */
142 int clen = 0;
143 int *J;				/* will be overlaid on class */
144 long *ixold;			/* will be overlaid on klist */
145 long *ixnew;			/* will be overlaid on file[1] */
146 u_char *chrtran;		/* translation table for case-folding */
147 
148 static void fetch(long *, int, int, FILE *, char *, int);
149 static void output(void);
150 static void check(void);
151 static void range(int, int, char *);
152 static void dump_context_vec(void);
153 static void dump_unified_vec(void);
154 static void prepare(int, FILE *);
155 static void prune(void);
156 static void equiv(struct line *, int, struct line *, int, int *);
157 static void unravel(int);
158 static void unsort(struct line *, int, int *);
159 static void change(int, int, int, int);
160 static void sort(struct line *, int);
161 static int newcand(int, int, int);
162 static int search(int *, int, int);
163 static int skipline(int);
164 static int asciifile(FILE *);
165 static int stone(int *, int, int *, int *);
166 static int readhash(FILE *);
167 
168 /*
169  * chrtran points to one of 2 translation tables: cup2low if folding upper to
170  * lower case clow2low if not folding case
171  */
172 u_char clow2low[256] = {
173 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
174 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
175 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
176 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
177 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
178 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
179 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
180 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
181 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
182 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
183 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
184 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
185 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
186 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
187 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
188 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
189 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
190 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
191 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
192 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
193 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
194 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
195 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
196 	0xfd, 0xfe, 0xff
197 };
198 
199 u_char cup2low[256] = {
200 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
201 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
202 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
203 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
204 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
205 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
206 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
207 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
208 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
209 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
210 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
211 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
212 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
213 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
214 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
215 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
216 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
217 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
218 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
219 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
220 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
221 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
222 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
223 	0xfd, 0xfe, 0xff
224 };
225 
226 void
227 diffreg(void)
228 {
229 	char buf1[BUFSIZ], buf2[BUFSIZ];
230 	FILE *f1, *f2;
231 	int i, j;
232 
233 	if (hflag) {
234 		diffargv[0] = "diffh";
235 		execv(_PATH_DIFFH, diffargv);
236 		error("%s", _PATH_DIFFH);
237 	}
238 	chrtran = (iflag ? cup2low : clow2low);
239 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
240 		errorx("can't specify - -");
241 	if (S_ISDIR(stb1.st_mode)) {
242 		file1 = splice(file1, file2);
243 		if (stat(file1, &stb1) < 0)
244 			error("%s", file1);
245 	} else if (strcmp(file1, "-") == 0 ||
246 	    (!S_ISREG(stb1.st_mode) && strcmp(file1, _PATH_DEVNULL) != 0)) {
247 		file1 = copytemp(file1, 1);
248 		if (stat(file1, &stb1) < 0)
249 			error("%s", file1);
250 	}
251 	if (S_ISDIR(stb2.st_mode)) {
252 		file2 = splice(file2, file1);
253 		if (stat(file2, &stb2) < 0)
254 			error("%s", file2);
255 	} else if (strcmp(file2, "-") == 0 ||
256 	    (!S_ISREG(stb2.st_mode) && strcmp(file2, _PATH_DEVNULL) != 0)) {
257 		file2 = copytemp(file2, 2);
258 		if (stat(file2, &stb2) < 0)
259 			error("%s", file2);
260 	}
261 	if ((f1 = fopen(file1, "r")) == NULL)
262 		error("%s", file1);
263 	if ((f2 = fopen(file2, "r")) == NULL)
264 		error("%s", file2);
265 	if ((stb1.st_mode & S_IFMT) != (stb2.st_mode & S_IFMT) ||
266 	    stb1.st_size != stb2.st_size)
267 		goto notsame;
268 	for (;;) {
269 		i = fread(buf1, 1, BUFSIZ, f1);
270 		j = fread(buf2, 1, BUFSIZ, f2);
271 		if (i < 0 || j < 0 || i != j)
272 			goto notsame;
273 		if (i == 0 && j == 0) {
274 			fclose(f1);
275 			fclose(f2);
276 			status = 0;	/* files don't differ */
277 			goto same;
278 		}
279 		for (j = 0; j < i; j++)
280 			if (buf1[j] != buf2[j])
281 				goto notsame;
282 	}
283 notsame:
284 	/*
285 	 * Files certainly differ at this point; set status accordingly
286 	 */
287 	status = 1;
288 	if (!asciifile(f1) || !asciifile(f2)) {
289 		printf("Binary files %s and %s differ\n", file1, file2);
290 		exit(status);
291 	}
292 	prepare(0, f1);
293 	prepare(1, f2);
294 	fclose(f1);
295 	fclose(f2);
296 	prune();
297 	sort(sfile[0], slen[0]);
298 	sort(sfile[1], slen[1]);
299 
300 	member = (int *)file[1];
301 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
302 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
303 
304 	class = (int *)file[0];
305 	unsort(sfile[0], slen[0], class);
306 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
307 
308 	klist = emalloc((slen[0] + 2) * sizeof(int));
309 	clist = emalloc(sizeof(cand));
310 	i = stone(class, slen[0], member, klist);
311 	free(member);
312 	free(class);
313 
314 	J = emalloc((len[0] + 2) * sizeof(int));
315 	unravel(klist[i]);
316 	free(clist);
317 	free(klist);
318 
319 	ixold = emalloc((len[0] + 2) * sizeof(long));
320 	ixnew = emalloc((len[1] + 2) * sizeof(long));
321 	check();
322 	output();
323 	status = anychange;
324 same:
325 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
326 		printf("No differences encountered\n");
327 }
328 
329 char *tempfiles[2];
330 
331 char *
332 copytemp(const char *file, int n)
333 {
334 	char buf[BUFSIZ], *tempdir, *tempfile;
335 	int i, ifd, ofd;
336 
337 	if (n != 1 && n != 2)
338 		return (NULL);
339 
340 	if (strcmp(file, "-") == 0)
341 		ifd = STDIN_FILENO;
342 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
343 		error("%s", file);
344 
345 	if ((tempdir = getenv("TMPDIR")) == NULL)
346 		tempdir = _PATH_TMP;
347 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
348 		error(NULL);
349 	tempfiles[n - 1] = tempfile;
350 
351 	signal(SIGHUP, done);
352 	signal(SIGINT, done);
353 	signal(SIGPIPE, done);
354 	signal(SIGTERM, done);
355 	ofd = mkstemp(tempfile);
356 	if (ofd < 0)
357 		error("%s", tempfile);
358 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
359 		if (write(ofd, buf, i) != i)
360 			error("%s", tempfile);
361 	}
362 	close(ifd);
363 	close(ofd);
364 	return (tempfile);
365 }
366 
367 char *
368 splice(char *dir, char *file)
369 {
370 	char *tail, *buf;
371 	size_t len;
372 
373 	if (!strcmp(file, "-"))
374 		errorx("can't specify - with other arg directory");
375 	tail = strrchr(file, '/');
376 	if (tail == NULL)
377 		tail = file;
378 	else
379 		tail++;
380 	len = strlen(dir) + 1 + strlen(tail) + 1;
381 	buf = emalloc(len);
382 	snprintf(buf, len, "%s/%s", dir, tail);
383 	return (buf);
384 }
385 
386 static void
387 prepare(int i, FILE *fd)
388 {
389 	struct line *p;
390 	int j, h;
391 
392 	fseek(fd, 0L, SEEK_SET);
393 	p = emalloc(3 * sizeof(struct line));
394 	for (j = 0; (h = readhash(fd));) {
395 		p = erealloc(p, (++j + 3) * sizeof(struct line));
396 		p[j].value = h;
397 	}
398 	len[i] = j;
399 	file[i] = p;
400 }
401 
402 static void
403 prune(void)
404 {
405 	int i, j;
406 
407 	for (pref = 0; pref < len[0] && pref < len[1] &&
408 	    file[0][pref + 1].value == file[1][pref + 1].value;
409 	    pref++)
410 		;
411 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
412 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
413 	    suff++)
414 		;
415 	for (j = 0; j < 2; j++) {
416 		sfile[j] = file[j] + pref;
417 		slen[j] = len[j] - pref - suff;
418 		for (i = 0; i <= slen[j]; i++)
419 			sfile[j][i].serial = i;
420 	}
421 }
422 
423 static void
424 equiv(struct line *a, int n, struct line *b, int m, int *c)
425 {
426 	int i, j;
427 
428 	i = j = 1;
429 	while (i <= n && j <= m) {
430 		if (a[i].value < b[j].value)
431 			a[i++].value = 0;
432 		else if (a[i].value == b[j].value)
433 			a[i++].value = j;
434 		else
435 			j++;
436 	}
437 	while (i <= n)
438 		a[i++].value = 0;
439 	b[m + 1].value = 0;
440 	j = 0;
441 	while (++j <= m) {
442 		c[j] = -b[j].serial;
443 		while (b[j + 1].value == b[j].value) {
444 			j++;
445 			c[j] = b[j].serial;
446 		}
447 	}
448 	c[j] = -1;
449 }
450 
451 static int
452 stone(int *a, int n, int *b, int *c)
453 {
454 	int i, k, y, j, l;
455 	int oldc, tc, oldl;
456 
457 	k = 0;
458 	c[0] = newcand(0, 0, 0);
459 	for (i = 1; i <= n; i++) {
460 		j = a[i];
461 		if (j == 0)
462 			continue;
463 		y = -b[j];
464 		oldl = 0;
465 		oldc = c[0];
466 		do {
467 			if (y <= clist[oldc].y)
468 				continue;
469 			l = search(c, k, y);
470 			if (l != oldl + 1)
471 				oldc = c[l - 1];
472 			if (l <= k) {
473 				if (clist[c[l]].y <= y)
474 					continue;
475 				tc = c[l];
476 				c[l] = newcand(i, y, oldc);
477 				oldc = tc;
478 				oldl = l;
479 			} else {
480 				c[l] = newcand(i, y, oldc);
481 				k++;
482 				break;
483 			}
484 		} while ((y = b[++j]) > 0);
485 	}
486 	return (k);
487 }
488 
489 static int
490 newcand(int x, int y, int pred)
491 {
492 	struct cand *q;
493 
494 	clist = erealloc(clist, ++clen * sizeof(cand));
495 	q = clist + clen - 1;
496 	q->x = x;
497 	q->y = y;
498 	q->pred = pred;
499 	return (clen - 1);
500 }
501 
502 static int
503 search(int *c, int k, int y)
504 {
505 	int i, j, l, t;
506 
507 	if (clist[c[k]].y < y)	/* quick look for typical case */
508 		return (k + 1);
509 	i = 0;
510 	j = k + 1;
511 	while (1) {
512 		l = i + j;
513 		if ((l >>= 1) <= i)
514 			break;
515 		t = clist[c[l]].y;
516 		if (t > y)
517 			j = l;
518 		else if (t < y)
519 			i = l;
520 		else
521 			return (l);
522 	}
523 	return (l + 1);
524 }
525 
526 static void
527 unravel(int p)
528 {
529 	struct cand *q;
530 	int i;
531 
532 	for (i = 0; i <= len[0]; i++)
533 		J[i] = i <= pref ? i :
534 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
535 	for (q = clist + p; q->y != 0; q = clist + q->pred)
536 		J[q->x + pref] = q->y + pref;
537 }
538 
539 /*
540  * Check does double duty:
541  *  1.	ferret out any fortuitous correspondences due
542  *	to confounding by hashing (which result in "jackpot")
543  *  2.  collect random access indexes to the two files
544  */
545 static void
546 check(void)
547 {
548 	int i, j, jackpot, c, d;
549 	long ctold, ctnew;
550 
551 	if ((input[0] = fopen(file1, "r")) == NULL)
552 		error("%s", file1);
553 	if ((input[1] = fopen(file2, "r")) == NULL)
554 		error("%s", file2);
555 	j = 1;
556 	ixold[0] = ixnew[0] = 0;
557 	jackpot = 0;
558 	ctold = ctnew = 0;
559 	for (i = 1; i <= len[0]; i++) {
560 		if (J[i] == 0) {
561 			ixold[i] = ctold += skipline(0);
562 			continue;
563 		}
564 		while (j < J[i]) {
565 			ixnew[j] = ctnew += skipline(1);
566 			j++;
567 		}
568 		if (bflag || wflag || iflag) {
569 			for (;;) {
570 				c = getc(input[0]);
571 				d = getc(input[1]);
572 				ctold++;
573 				ctnew++;
574 				if (bflag && isspace(c) && isspace(d)) {
575 					do {
576 						if (c == '\n')
577 							break;
578 						ctold++;
579 					} while (isspace(c = getc(input[0])));
580 					do {
581 						if (d == '\n')
582 							break;
583 						ctnew++;
584 					} while (isspace(d = getc(input[1])));
585 				} else if (wflag) {
586 					while (isspace(c) && c != '\n') {
587 						c = getc(input[0]);
588 						ctold++;
589 					}
590 					while (isspace(d) && d != '\n') {
591 						d = getc(input[1]);
592 						ctnew++;
593 					}
594 				}
595 				if (chrtran[c] != chrtran[d]) {
596 					jackpot++;
597 					J[i] = 0;
598 					if (c != '\n')
599 						ctold += skipline(0);
600 					if (d != '\n')
601 						ctnew += skipline(1);
602 					break;
603 				}
604 				if (c == '\n')
605 					break;
606 			}
607 		} else {
608 			for (;;) {
609 				ctold++;
610 				ctnew++;
611 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
612 					/* jackpot++; */
613 					J[i] = 0;
614 					if (c != '\n')
615 						ctold += skipline(0);
616 					if (d != '\n')
617 						ctnew += skipline(1);
618 					break;
619 				}
620 				if (c == '\n')
621 					break;
622 			}
623 		}
624 		ixold[i] = ctold;
625 		ixnew[j] = ctnew;
626 		j++;
627 	}
628 	for (; j <= len[1]; j++) {
629 		ixnew[j] = ctnew += skipline(1);
630 	}
631 	fclose(input[0]);
632 	fclose(input[1]);
633 	/*
634 	 * if (jackpot)
635 	 *	fprintf(stderr, "jackpot\n");
636 	 */
637 }
638 
639 /* shellsort CACM #201 */
640 static void
641 sort(struct line *a, int n)
642 {
643 	struct line *ai, *aim, w;
644 	int j, m = 0, k;
645 
646 	if (n == 0)
647 		return;
648 	for (j = 1; j <= n; j *= 2)
649 		m = 2 * j - 1;
650 	for (m /= 2; m != 0; m /= 2) {
651 		k = n - m;
652 		for (j = 1; j <= k; j++) {
653 			for (ai = &a[j]; ai > a; ai -= m) {
654 				aim = &ai[m];
655 				if (aim < ai)
656 					break;	/* wraparound */
657 				if (aim->value > ai[0].value ||
658 				    (aim->value == ai[0].value &&
659 					aim->serial > ai[0].serial))
660 					break;
661 				w.value = ai[0].value;
662 				ai[0].value = aim->value;
663 				aim->value = w.value;
664 				w.serial = ai[0].serial;
665 				ai[0].serial = aim->serial;
666 				aim->serial = w.serial;
667 			}
668 		}
669 	}
670 }
671 
672 static void
673 unsort(struct line *f, int l, int *b)
674 {
675 	int *a, i;
676 
677 	a = emalloc((l + 1) * sizeof(int));
678 	for (i = 1; i <= l; i++)
679 		a[f[i].serial] = f[i].value;
680 	for (i = 1; i <= l; i++)
681 		b[i] = a[i];
682 	free(a);
683 }
684 
685 static int
686 skipline(int f)
687 {
688 	int i, c;
689 
690 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
691 		if (c < 0)
692 			return (i);
693 	return (i);
694 }
695 
696 static void
697 output(void)
698 {
699 	int m, i0, i1, j0, j1;
700 
701 	input[0] = fopen(file1, "r");
702 	input[1] = fopen(file2, "r");
703 	m = len[0];
704 	J[0] = 0;
705 	J[m + 1] = len[1] + 1;
706 	if (opt != D_EDIT) {
707 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
708 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
709 				i0++;
710 			j0 = J[i0 - 1] + 1;
711 			i1 = i0 - 1;
712 			while (i1 < m && J[i1 + 1] == 0)
713 				i1++;
714 			j1 = J[i1 + 1] - 1;
715 			J[i1] = j1;
716 			change(i0, i1, j0, j1);
717 		}
718 	} else {
719 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
720 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
721 				i0--;
722 			j0 = J[i0 + 1] - 1;
723 			i1 = i0 + 1;
724 			while (i1 > 1 && J[i1 - 1] == 0)
725 				i1--;
726 			j1 = J[i1 - 1] + 1;
727 			J[i1] = j1;
728 			change(i1, i0, j1, j0);
729 		}
730 	}
731 	if (m == 0)
732 		change(1, 0, 1, len[1]);
733 	if (opt == D_IFDEF) {
734 		for (;;) {
735 #define	c i0
736 			c = getc(input[0]);
737 			if (c < 0)
738 				return;
739 			putchar(c);
740 		}
741 #undef c
742 	}
743 	if (anychange != 0) {
744 		if (opt == D_CONTEXT)
745 			dump_context_vec();
746 		else if (opt == D_UNIFIED)
747 			dump_unified_vec();
748 	}
749 }
750 
751 /*
752  * The following struct is used to record change information when
753  * doing a "context" diff.  (see routine "change" to understand the
754  * highly mneumonic field names)
755  */
756 struct context_vec {
757 	int a;			/* start line in old file */
758 	int b;			/* end line in old file */
759 	int c;			/* start line in new file */
760 	int d;			/* end line in new file */
761 };
762 
763 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
764 
765 #define	MAX_CONTEXT	128
766 
767 /*
768  * indicate that there is a difference between lines a and b of the from file
769  * to get to lines c to d of the to file. If a is greater then b then there
770  * are no lines in the from file involved and this means that there were
771  * lines appended (beginning at b). If c is greater than d then there are
772  * lines missing from the to file.
773  */
774 static void
775 change(int a, int b, int c, int d)
776 {
777 	struct stat stbuf;
778 
779 	if (opt != D_IFDEF && a > b && c > d)
780 		return;
781 	if (anychange == 0) {
782 		anychange = 1;
783 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
784 			stat(file1, &stbuf);
785 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
786 			   file1, ctime(&stbuf.st_mtime));
787 			stat(file2, &stbuf);
788 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
789 			    file2, ctime(&stbuf.st_mtime));
790 			context_vec_start = emalloc(MAX_CONTEXT *
791 			    sizeof(struct context_vec));
792 			context_vec_end = context_vec_start + MAX_CONTEXT;
793 			context_vec_ptr = context_vec_start - 1;
794 		}
795 	}
796 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
797 		/*
798 		 * If this new change is within 'context' lines of
799 		 * the previous change, just add it to the change
800 		 * record.  If the record is full or if this
801 		 * change is more than 'context' lines from the previous
802 		 * change, dump the record, reset it & add the new change.
803 		 */
804 		if (context_vec_ptr >= context_vec_end ||
805 		    (context_vec_ptr >= context_vec_start &&
806 		    a > (context_vec_ptr->b + 2 * context) &&
807 		    c > (context_vec_ptr->d + 2 * context))) {
808 			if (opt == D_CONTEXT)
809 				dump_context_vec();
810 			else
811 				dump_unified_vec();
812 		}
813 		context_vec_ptr++;
814 		context_vec_ptr->a = a;
815 		context_vec_ptr->b = b;
816 		context_vec_ptr->c = c;
817 		context_vec_ptr->d = d;
818 		return;
819 	}
820 	switch (opt) {
821 
822 	case D_NORMAL:
823 	case D_EDIT:
824 		range(a, b, ",");
825 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
826 		if (opt == D_NORMAL)
827 			range(c, d, ",");
828 		putchar('\n');
829 		break;
830 	case D_REVERSE:
831 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
832 		range(a, b, " ");
833 		putchar('\n');
834 		break;
835 	case D_NREVERSE:
836 		if (a > b)
837 			printf("a%d %d\n", b, d - c + 1);
838 		else {
839 			printf("d%d %d\n", a, b - a + 1);
840 			if (!(c > d))
841 				/* add changed lines */
842 				printf("a%d %d\n", b, d - c + 1);
843 		}
844 		break;
845 	}
846 	if (opt == D_NORMAL || opt == D_IFDEF) {
847 		fetch(ixold, a, b, input[0], "< ", 1);
848 		if (a <= b && c <= d && opt == D_NORMAL)
849 			prints("---\n");
850 	}
851 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
852 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
853 		prints(".\n");
854 	if (inifdef) {
855 		fprintf(stdout, "#endif /* %s */\n", ifdefname);
856 		inifdef = 0;
857 	}
858 }
859 
860 static void
861 range(int a, int b, char *separator)
862 {
863 	printf("%d", a > b ? b : a);
864 	if (a < b)
865 		printf("%s%d", separator, b);
866 }
867 
868 static void
869 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
870 {
871 	int i, j, c, col, nc;
872 
873 	/*
874 	 * When doing #ifdef's, copy down to current line
875 	 * if this is the first file, so that stuff makes it to output.
876 	 */
877 	if (opt == D_IFDEF && oldfile) {
878 		long curpos = ftell(lb);
879 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
880 		nc = f[a > b ? b : a - 1] - curpos;
881 		for (i = 0; i < nc; i++)
882 			putchar(getc(lb));
883 	}
884 	if (a > b)
885 		return;
886 	if (opt == D_IFDEF) {
887 		if (inifdef) {
888 			fprintf(stdout, "#else /* %s%s */\n",
889 			    oldfile == 1 ? "!" : "", ifdefname);
890 		} else {
891 			if (oldfile)
892 				fprintf(stdout, "#ifndef %s\n", ifdefname);
893 			else
894 				fprintf(stdout, "#ifdef %s\n", ifdefname);
895 		}
896 		inifdef = 1 + oldfile;
897 	}
898 	for (i = a; i <= b; i++) {
899 		fseek(lb, f[i - 1], SEEK_SET);
900 		nc = f[i] - f[i - 1];
901 		if (opt != D_IFDEF)
902 			prints(s);
903 		col = 0;
904 		for (j = 0; j < nc; j++) {
905 			c = getc(lb);
906 			if (c == '\t' && tflag)
907 				do
908 					putchar(' ');
909 				while (++col & 7);
910 			else {
911 				putchar(c);
912 				col++;
913 			}
914 		}
915 	}
916 }
917 
918 #define POW2			/* define only if HALFLONG is 2**n */
919 #define HALFLONG 16
920 #define low(x)	(x&((1L<<HALFLONG)-1))
921 #define high(x)	(x>>HALFLONG)
922 
923 /*
924  * hashing has the effect of
925  * arranging line in 7-bit bytes and then
926  * summing 1-s complement in 16-bit hunks
927  */
928 static int
929 readhash(FILE *f)
930 {
931 	unsigned int shift;
932 	int t, space;
933 	long sum;
934 
935 	sum = 1;
936 	space = 0;
937 	if (!bflag && !wflag) {
938 		if (iflag)
939 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
940 				if (t == -1)
941 					return (0);
942 				sum += (long)chrtran[t] << (shift
943 #ifdef POW2
944 				    &= HALFLONG - 1);
945 #else
946 				    %= HALFLONG);
947 #endif
948 			}
949 		else
950 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
951 				if (t == -1)
952 					return (0);
953 				sum += (long)t << (shift
954 #ifdef POW2
955 				    &= HALFLONG - 1);
956 #else
957 				    %= HALFLONG);
958 #endif
959 			}
960 	} else {
961 		for (shift = 0;;) {
962 			switch (t = getc(f)) {
963 			case -1:
964 				return (0);
965 			case '\t':
966 			case ' ':
967 				space++;
968 				continue;
969 			default:
970 				if (space && !wflag) {
971 					shift += 7;
972 					space = 0;
973 				}
974 				sum += (long)chrtran[t] << (shift
975 #ifdef POW2
976 				    &= HALFLONG - 1);
977 #else
978 				    %= HALFLONG);
979 #endif
980 				shift += 7;
981 				continue;
982 			case '\n':
983 				break;
984 			}
985 			break;
986 		}
987 	}
988 	sum = low(sum) + high(sum);
989 	return ((short) low(sum) + (short) high(sum));
990 }
991 
992 static int
993 asciifile(FILE *f)
994 {
995 	char buf[BUFSIZ], *cp;
996 	int cnt;
997 
998 	if (aflag)
999 		return (1);
1000 
1001 	fseek(f, 0L, SEEK_SET);
1002 	cnt = fread(buf, 1, BUFSIZ, f);
1003 	cp = buf;
1004 	while (--cnt >= 0)
1005 		if (*cp++ & 0200)
1006 			return (0);
1007 	return (1);
1008 }
1009 
1010 /* dump accumulated "context" diff changes */
1011 static void
1012 dump_context_vec(void)
1013 {
1014 	struct context_vec *cvp = context_vec_start;
1015 	int lowa, upb, lowc, upd, do_output;
1016 	int a, b, c, d;
1017 	char ch;
1018 
1019 	if (context_vec_start > context_vec_ptr)
1020 		return;
1021 
1022 	b = d = 0;		/* gcc */
1023 	lowa = max(1, cvp->a - context);
1024 	upb = min(len[0], context_vec_ptr->b + context);
1025 	lowc = max(1, cvp->c - context);
1026 	upd = min(len[1], context_vec_ptr->d + context);
1027 
1028 	printf("***************\n*** ");
1029 	range(lowa, upb, ",");
1030 	printf(" ****\n");
1031 
1032 	/*
1033 	 * output changes to the "old" file.  The first loop suppresses
1034 	 * output if there were no changes to the "old" file (we'll see
1035 	 * the "old" lines as context in the "new" list).
1036 	 */
1037 	do_output = 0;
1038 	for (; cvp <= context_vec_ptr; cvp++)
1039 		if (cvp->a <= cvp->b) {
1040 			cvp = context_vec_start;
1041 			do_output++;
1042 			break;
1043 		}
1044 	if (do_output) {
1045 		while (cvp <= context_vec_ptr) {
1046 			a = cvp->a;
1047 			b = cvp->b;
1048 			c = cvp->c;
1049 			d = cvp->d;
1050 
1051 			if (a <= b && c <= d)
1052 				ch = 'c';
1053 			else
1054 				ch = (a <= b) ? 'd' : 'a';
1055 
1056 			if (ch == 'a')
1057 				fetch(ixold, lowa, b, input[0], "  ", 0);
1058 			else {
1059 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1060 				fetch(ixold, a, b, input[0],
1061 				    ch == 'c' ? "! " : "- ", 0);
1062 			}
1063 			lowa = b + 1;
1064 			cvp++;
1065 		}
1066 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1067 	}
1068 	/* output changes to the "new" file */
1069 	printf("--- ");
1070 	range(lowc, upd, ",");
1071 	printf(" ----\n");
1072 
1073 	do_output = 0;
1074 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1075 		if (cvp->c <= cvp->d) {
1076 			cvp = context_vec_start;
1077 			do_output++;
1078 			break;
1079 		}
1080 	if (do_output) {
1081 		while (cvp <= context_vec_ptr) {
1082 			a = cvp->a;
1083 			b = cvp->b;
1084 			c = cvp->c;
1085 			d = cvp->d;
1086 
1087 			if (a <= b && c <= d)
1088 				ch = 'c';
1089 			else
1090 				ch = (a <= b) ? 'd' : 'a';
1091 
1092 			if (ch == 'd')
1093 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1094 			else {
1095 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1096 				fetch(ixnew, c, d, input[1],
1097 				    ch == 'c' ? "! " : "+ ", 0);
1098 			}
1099 			lowc = d + 1;
1100 			cvp++;
1101 		}
1102 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1103 	}
1104 	context_vec_ptr = context_vec_start - 1;
1105 }
1106 
1107 /* dump accumulated "unified" diff changes */
1108 static void
1109 dump_unified_vec(void)
1110 {
1111 	struct context_vec *cvp = context_vec_start;
1112 	int lowa, upb, lowc, upd;
1113 	int a, b, c, d;
1114 	char ch;
1115 
1116 	if (context_vec_start > context_vec_ptr)
1117 		return;
1118 
1119 	b = d = 0;		/* gcc */
1120 	lowa = max(1, cvp->a - context);
1121 	upb = min(len[0], context_vec_ptr->b + context);
1122 	lowc = max(1, cvp->c - context);
1123 	upd = min(len[1], context_vec_ptr->d + context);
1124 
1125 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1126 	    lowc, upd - lowc + 1);
1127 
1128 	/*
1129 	 * Output changes in "unified" diff format--the old and new lines
1130 	 * are printed together.
1131 	 */
1132 	for (; cvp <= context_vec_ptr; cvp++) {
1133 		a = cvp->a;
1134 		b = cvp->b;
1135 		c = cvp->c;
1136 		d = cvp->d;
1137 
1138 		/*
1139 		 * c: both new and old changes
1140 		 * d: only changes in the old file
1141 		 * a: only changes in the new file
1142 		 */
1143 		if (a <= b && c <= d)
1144 			ch = 'c';
1145 		else
1146 			ch = (a <= b) ? 'd' : 'a';
1147 
1148 		switch (ch) {
1149 		case 'c':
1150 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1151 			fetch(ixold, a, b, input[0], "-", 0);
1152 			fetch(ixnew, c, d, input[1], "+", 0);
1153 			break;
1154 		case 'd':
1155 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1156 			fetch(ixold, a, b, input[0], "-", 0);
1157 			break;
1158 		case 'a':
1159 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1160 			fetch(ixnew, c, d, input[1], "+", 0);
1161 			break;
1162 		}
1163 		lowa = b + 1;
1164 		lowc = d + 1;
1165 	}
1166 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1167 
1168 	context_vec_ptr = context_vec_start - 1;
1169 }
1170