xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 48b947b71e6d4333c359c506becf23ba6bc3e83e)
1 /*	$OpenBSD: diffreg.c,v 1.20 2003/06/26 04:52:26 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int *class;			/* will be overlaid on file[0] */
138 int *member;			/* will be overlaid on file[1] */
139 int *klist;			/* will be overlaid on file[0] after class */
140 struct cand *clist;		/* merely a free storage pot for candidates */
141 int clen = 0;
142 int *J;				/* will be overlaid on class */
143 long *ixold;			/* will be overlaid on klist */
144 long *ixnew;			/* will be overlaid on file[1] */
145 u_char *chrtran;		/* translation table for case-folding */
146 
147 static void fetch(long *, int, int, FILE *, char *, int);
148 static void output(void);
149 static void check(void);
150 static void range(int, int, char *);
151 static void dump_context_vec(void);
152 static void dump_unified_vec(void);
153 static void prepare(int, FILE *);
154 static void prune(void);
155 static void equiv(struct line *, int, struct line *, int, int *);
156 static void unravel(int);
157 static void unsort(struct line *, int, int *);
158 static void change(int, int, int, int);
159 static void sort(struct line *, int);
160 static int newcand(int, int, int);
161 static int search(int *, int, int);
162 static int skipline(int);
163 static int asciifile(FILE *);
164 static int stone(int *, int, int *, int *);
165 static int readhash(FILE *);
166 
167 /*
168  * chrtran points to one of 2 translation tables: cup2low if folding upper to
169  * lower case clow2low if not folding case
170  */
171 u_char clow2low[256] = {
172 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
173 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
174 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
175 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
176 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
177 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
178 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
179 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
180 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
181 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
182 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
183 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
184 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
185 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
186 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
187 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
188 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
189 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
190 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
191 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
192 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
193 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
194 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
195 	0xfd, 0xfe, 0xff
196 };
197 
198 u_char cup2low[256] = {
199 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
200 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
201 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
202 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
203 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
204 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
205 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
206 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
207 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
208 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
209 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
210 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
211 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
212 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
213 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
214 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
215 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
216 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
217 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
218 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
219 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
220 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
221 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
222 	0xfd, 0xfe, 0xff
223 };
224 
225 void
226 diffreg(void)
227 {
228 	char buf1[BUFSIZ], buf2[BUFSIZ];
229 	FILE *f1, *f2;
230 	int i, j;
231 
232 	if (hflag) {
233 		diffargv[0] = "diffh";
234 		execv(diffh, diffargv);
235 		warn("%s", diffh);
236 		done(0);
237 	}
238 	chrtran = (iflag ? cup2low : clow2low);
239 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0) {
240 		warnx("can't specify - -");
241 		done(0);
242 	}
243 	if (S_ISDIR(stb1.st_mode)) {
244 		file1 = splice(file1, file2);
245 		if (stat(file1, &stb1) < 0) {
246 			warn("%s", file1);
247 			done(0);
248 		}
249 	} else if (!S_ISREG(stb1.st_mode) || strcmp(file1, "-") == 0) {
250 		file1 = copytemp(file1, 1);
251 		if (stat(file1, &stb1) < 0) {
252 			warn("%s", file1);
253 			done(0);
254 		}
255 	}
256 	if (S_ISDIR(stb2.st_mode)) {
257 		file2 = splice(file2, file1);
258 		if (stat(file2, &stb2) < 0) {
259 			warn("%s", file2);
260 			done(0);
261 		}
262 	} else if (!S_ISREG(stb2.st_mode) || strcmp(file2, "-") == 0) {
263 		file2 = copytemp(file2, 2);
264 		if (stat(file2, &stb2) < 0) {
265 			warn("%s", file2);
266 			done(0);
267 		}
268 	}
269 	if ((f1 = fopen(file1, "r")) == NULL) {
270 		warn("%s", file1);
271 		done(0);
272 	}
273 	if ((f2 = fopen(file2, "r")) == NULL) {
274 		warn("%s", file2);
275 		fclose(f1);
276 		done(0);
277 	}
278 	if (S_ISREG(stb1.st_mode) && S_ISREG(stb2.st_mode) &&
279 	    stb1.st_size != stb2.st_size)
280 		goto notsame;
281 	for (;;) {
282 		i = fread(buf1, 1, BUFSIZ, f1);
283 		j = fread(buf2, 1, BUFSIZ, f2);
284 		if (i < 0 || j < 0 || i != j)
285 			goto notsame;
286 		if (i == 0 && j == 0) {
287 			fclose(f1);
288 			fclose(f2);
289 			status = 0;	/* files don't differ */
290 			goto same;
291 		}
292 		for (j = 0; j < i; j++)
293 			if (buf1[j] != buf2[j])
294 				goto notsame;
295 	}
296 notsame:
297 	/*
298 	 * Files certainly differ at this point; set status accordingly
299 	 */
300 	status = 1;
301 	if (!asciifile(f1) || !asciifile(f2)) {
302 		printf("Binary files %s and %s differ\n", file1, file2);
303 		fclose(f1);
304 		fclose(f2);
305 		done(0);
306 	}
307 	prepare(0, f1);
308 	prepare(1, f2);
309 	fclose(f1);
310 	fclose(f2);
311 	prune();
312 	sort(sfile[0], slen[0]);
313 	sort(sfile[1], slen[1]);
314 
315 	member = (int *)file[1];
316 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
317 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
318 
319 	class = (int *)file[0];
320 	unsort(sfile[0], slen[0], class);
321 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
322 
323 	klist = emalloc((slen[0] + 2) * sizeof(int));
324 	clist = emalloc(sizeof(cand));
325 	i = stone(class, slen[0], member, klist);
326 	free(member);
327 	free(class);
328 
329 	J = emalloc((len[0] + 2) * sizeof(int));
330 	unravel(klist[i]);
331 	free(clist);
332 	free(klist);
333 
334 	ixold = emalloc((len[0] + 2) * sizeof(long));
335 	ixnew = emalloc((len[1] + 2) * sizeof(long));
336 	check();
337 	output();
338 	status = anychange;
339 same:
340 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
341 		printf("No differences encountered\n");
342 	done(0);
343 }
344 
345 char *tempfiles[2];
346 
347 char *
348 copytemp(const char *file, int n)
349 {
350 	char buf[BUFSIZ], *tempdir, *tempfile;
351 	int i, ifd, ofd;
352 
353 	if (n != 1 && n != 2)
354 		return (NULL);
355 
356 	if (strcmp(file, "-") == 0)
357 		ifd = STDIN_FILENO;
358 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0) {
359 		warn("%s", file);
360 		done(0);
361 	}
362 
363 	if ((tempdir = getenv("TMPDIR")) == NULL)
364 		tempdir = _PATH_TMP;
365 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1) {
366 		warn(NULL);
367 		done(0);
368 	}
369 	tempfiles[n - 1] = tempfile;
370 
371 	signal(SIGHUP, done);
372 	signal(SIGINT, done);
373 	signal(SIGPIPE, done);
374 	signal(SIGTERM, done);
375 	ofd = mkstemp(tempfile);
376 	if (ofd < 0) {
377 		warn("%s", tempfile);
378 		done(0);
379 	}
380 	while ((i = read(ifd, buf, BUFSIZ)) > 0)
381 		if (write(ofd, buf, i) != i) {
382 			warn("%s", tempfile);
383 			done(0);
384 		}
385 	close(ifd);
386 	close(ofd);
387 	return (tempfile);
388 }
389 
390 char *
391 splice(char *dir, char *file)
392 {
393 	char *tail, *buf;
394 	size_t len;
395 
396 	if (!strcmp(file, "-")) {
397 		warnx("can't specify - with other arg directory");
398 		done(0);
399 	}
400 	tail = strrchr(file, '/');
401 	if (tail == NULL)
402 		tail = file;
403 	else
404 		tail++;
405 	len = strlen(dir) + 1 + strlen(tail) + 1;
406 	buf = emalloc(len);
407 	snprintf(buf, len, "%s/%s", dir, tail);
408 	return (buf);
409 }
410 
411 static void
412 prepare(int i, FILE *fd)
413 {
414 	struct line *p;
415 	int j, h;
416 
417 	fseek(fd, 0L, SEEK_SET);
418 	p = emalloc(3 * sizeof(struct line));
419 	for (j = 0; (h = readhash(fd));) {
420 		p = erealloc(p, (++j + 3) * sizeof(struct line));
421 		p[j].value = h;
422 	}
423 	len[i] = j;
424 	file[i] = p;
425 }
426 
427 static void
428 prune(void)
429 {
430 	int i, j;
431 
432 	for (pref = 0; pref < len[0] && pref < len[1] &&
433 	    file[0][pref + 1].value == file[1][pref + 1].value;
434 	    pref++)
435 		;
436 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
437 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
438 	    suff++)
439 		;
440 	for (j = 0; j < 2; j++) {
441 		sfile[j] = file[j] + pref;
442 		slen[j] = len[j] - pref - suff;
443 		for (i = 0; i <= slen[j]; i++)
444 			sfile[j][i].serial = i;
445 	}
446 }
447 
448 static void
449 equiv(struct line *a, int n, struct line *b, int m, int *c)
450 {
451 	int i, j;
452 
453 	i = j = 1;
454 	while (i <= n && j <= m) {
455 		if (a[i].value < b[j].value)
456 			a[i++].value = 0;
457 		else if (a[i].value == b[j].value)
458 			a[i++].value = j;
459 		else
460 			j++;
461 	}
462 	while (i <= n)
463 		a[i++].value = 0;
464 	b[m + 1].value = 0;
465 	j = 0;
466 	while (++j <= m) {
467 		c[j] = -b[j].serial;
468 		while (b[j + 1].value == b[j].value) {
469 			j++;
470 			c[j] = b[j].serial;
471 		}
472 	}
473 	c[j] = -1;
474 }
475 
476 static int
477 stone(int *a, int n, int *b, int *c)
478 {
479 	int i, k, y, j, l;
480 	int oldc, tc, oldl;
481 
482 	k = 0;
483 	c[0] = newcand(0, 0, 0);
484 	for (i = 1; i <= n; i++) {
485 		j = a[i];
486 		if (j == 0)
487 			continue;
488 		y = -b[j];
489 		oldl = 0;
490 		oldc = c[0];
491 		do {
492 			if (y <= clist[oldc].y)
493 				continue;
494 			l = search(c, k, y);
495 			if (l != oldl + 1)
496 				oldc = c[l - 1];
497 			if (l <= k) {
498 				if (clist[c[l]].y <= y)
499 					continue;
500 				tc = c[l];
501 				c[l] = newcand(i, y, oldc);
502 				oldc = tc;
503 				oldl = l;
504 			} else {
505 				c[l] = newcand(i, y, oldc);
506 				k++;
507 				break;
508 			}
509 		} while ((y = b[++j]) > 0);
510 	}
511 	return (k);
512 }
513 
514 static int
515 newcand(int x, int y, int pred)
516 {
517 	struct cand *q;
518 
519 	clist = erealloc(clist, ++clen * sizeof(cand));
520 	q = clist + clen - 1;
521 	q->x = x;
522 	q->y = y;
523 	q->pred = pred;
524 	return (clen - 1);
525 }
526 
527 static int
528 search(int *c, int k, int y)
529 {
530 	int i, j, l, t;
531 
532 	if (clist[c[k]].y < y)	/* quick look for typical case */
533 		return (k + 1);
534 	i = 0;
535 	j = k + 1;
536 	while (1) {
537 		l = i + j;
538 		if ((l >>= 1) <= i)
539 			break;
540 		t = clist[c[l]].y;
541 		if (t > y)
542 			j = l;
543 		else if (t < y)
544 			i = l;
545 		else
546 			return (l);
547 	}
548 	return (l + 1);
549 }
550 
551 static void
552 unravel(int p)
553 {
554 	struct cand *q;
555 	int i;
556 
557 	for (i = 0; i <= len[0]; i++)
558 		J[i] = i <= pref ? i :
559 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
560 	for (q = clist + p; q->y != 0; q = clist + q->pred)
561 		J[q->x + pref] = q->y + pref;
562 }
563 
564 /*
565  * Check does double duty:
566  *  1.	ferret out any fortuitous correspondences due
567  *	to confounding by hashing (which result in "jackpot")
568  *  2.  collect random access indexes to the two files
569  */
570 static void
571 check(void)
572 {
573 	int i, j, jackpot, c, d;
574 	long ctold, ctnew;
575 
576 	if ((input[0] = fopen(file1, "r")) == NULL) {
577 		perror(file1);
578 		done(0);
579 	}
580 	if ((input[1] = fopen(file2, "r")) == NULL) {
581 		perror(file2);
582 		done(0);
583 	}
584 	j = 1;
585 	ixold[0] = ixnew[0] = 0;
586 	jackpot = 0;
587 	ctold = ctnew = 0;
588 	for (i = 1; i <= len[0]; i++) {
589 		if (J[i] == 0) {
590 			ixold[i] = ctold += skipline(0);
591 			continue;
592 		}
593 		while (j < J[i]) {
594 			ixnew[j] = ctnew += skipline(1);
595 			j++;
596 		}
597 		if (bflag || wflag || iflag) {
598 			for (;;) {
599 				c = getc(input[0]);
600 				d = getc(input[1]);
601 				ctold++;
602 				ctnew++;
603 				if (bflag && isspace(c) && isspace(d)) {
604 					do {
605 						if (c == '\n')
606 							break;
607 						ctold++;
608 					} while (isspace(c = getc(input[0])));
609 					do {
610 						if (d == '\n')
611 							break;
612 						ctnew++;
613 					} while (isspace(d = getc(input[1])));
614 				} else if (wflag) {
615 					while (isspace(c) && c != '\n') {
616 						c = getc(input[0]);
617 						ctold++;
618 					}
619 					while (isspace(d) && d != '\n') {
620 						d = getc(input[1]);
621 						ctnew++;
622 					}
623 				}
624 				if (chrtran[c] != chrtran[d]) {
625 					jackpot++;
626 					J[i] = 0;
627 					if (c != '\n')
628 						ctold += skipline(0);
629 					if (d != '\n')
630 						ctnew += skipline(1);
631 					break;
632 				}
633 				if (c == '\n')
634 					break;
635 			}
636 		} else {
637 			for (;;) {
638 				ctold++;
639 				ctnew++;
640 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
641 					/* jackpot++; */
642 					J[i] = 0;
643 					if (c != '\n')
644 						ctold += skipline(0);
645 					if (d != '\n')
646 						ctnew += skipline(1);
647 					break;
648 				}
649 				if (c == '\n')
650 					break;
651 			}
652 		}
653 		ixold[i] = ctold;
654 		ixnew[j] = ctnew;
655 		j++;
656 	}
657 	for (; j <= len[1]; j++) {
658 		ixnew[j] = ctnew += skipline(1);
659 	}
660 	fclose(input[0]);
661 	fclose(input[1]);
662 	/*
663 	 * if (jackpot)
664 	 *	fprintf(stderr, "jackpot\n");
665 	 */
666 }
667 
668 /* shellsort CACM #201 */
669 static void
670 sort(struct line *a, int n)
671 {
672 	struct line *ai, *aim, w;
673 	int j, m = 0, k;
674 
675 	if (n == 0)
676 		return;
677 	for (j = 1; j <= n; j *= 2)
678 		m = 2 * j - 1;
679 	for (m /= 2; m != 0; m /= 2) {
680 		k = n - m;
681 		for (j = 1; j <= k; j++) {
682 			for (ai = &a[j]; ai > a; ai -= m) {
683 				aim = &ai[m];
684 				if (aim < ai)
685 					break;	/* wraparound */
686 				if (aim->value > ai[0].value ||
687 				    (aim->value == ai[0].value &&
688 					aim->serial > ai[0].serial))
689 					break;
690 				w.value = ai[0].value;
691 				ai[0].value = aim->value;
692 				aim->value = w.value;
693 				w.serial = ai[0].serial;
694 				ai[0].serial = aim->serial;
695 				aim->serial = w.serial;
696 			}
697 		}
698 	}
699 }
700 
701 static void
702 unsort(struct line *f, int l, int *b)
703 {
704 	int *a, i;
705 
706 	a = emalloc((l + 1) * sizeof(int));
707 	for (i = 1; i <= l; i++)
708 		a[f[i].serial] = f[i].value;
709 	for (i = 1; i <= l; i++)
710 		b[i] = a[i];
711 	free(a);
712 }
713 
714 static int
715 skipline(int f)
716 {
717 	int i, c;
718 
719 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
720 		if (c < 0)
721 			return (i);
722 	return (i);
723 }
724 
725 static void
726 output(void)
727 {
728 	int m, i0, i1, j0, j1;
729 
730 	input[0] = fopen(file1, "r");
731 	input[1] = fopen(file2, "r");
732 	m = len[0];
733 	J[0] = 0;
734 	J[m + 1] = len[1] + 1;
735 	if (opt != D_EDIT) {
736 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
737 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
738 				i0++;
739 			j0 = J[i0 - 1] + 1;
740 			i1 = i0 - 1;
741 			while (i1 < m && J[i1 + 1] == 0)
742 				i1++;
743 			j1 = J[i1 + 1] - 1;
744 			J[i1] = j1;
745 			change(i0, i1, j0, j1);
746 		}
747 	} else {
748 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
749 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
750 				i0--;
751 			j0 = J[i0 + 1] - 1;
752 			i1 = i0 + 1;
753 			while (i1 > 1 && J[i1 - 1] == 0)
754 				i1--;
755 			j1 = J[i1 - 1] + 1;
756 			J[i1] = j1;
757 			change(i1, i0, j1, j0);
758 		}
759 	}
760 	if (m == 0)
761 		change(1, 0, 1, len[1]);
762 	if (opt == D_IFDEF) {
763 		for (;;) {
764 #define	c i0
765 			c = getc(input[0]);
766 			if (c < 0)
767 				return;
768 			putchar(c);
769 		}
770 #undef c
771 	}
772 	if (anychange != 0) {
773 		if (opt == D_CONTEXT)
774 			dump_context_vec();
775 		else if (opt == D_UNIFIED)
776 			dump_unified_vec();
777 	}
778 }
779 
780 /*
781  * The following struct is used to record change information when
782  * doing a "context" diff.  (see routine "change" to understand the
783  * highly mneumonic field names)
784  */
785 struct context_vec {
786 	int a;			/* start line in old file */
787 	int b;			/* end line in old file */
788 	int c;			/* start line in new file */
789 	int d;			/* end line in new file */
790 };
791 
792 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
793 
794 #define	MAX_CONTEXT	128
795 
796 /*
797  * indicate that there is a difference between lines a and b of the from file
798  * to get to lines c to d of the to file. If a is greater then b then there
799  * are no lines in the from file involved and this means that there were
800  * lines appended (beginning at b). If c is greater than d then there are
801  * lines missing from the to file.
802  */
803 static void
804 change(int a, int b, int c, int d)
805 {
806 	struct stat stbuf;
807 
808 	if (opt != D_IFDEF && a > b && c > d)
809 		return;
810 	if (anychange == 0) {
811 		anychange = 1;
812 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
813 			stat(file1, &stbuf);
814 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
815 			   file1, ctime(&stbuf.st_mtime));
816 			stat(file2, &stbuf);
817 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
818 			    file2, ctime(&stbuf.st_mtime));
819 			context_vec_start = emalloc(MAX_CONTEXT *
820 			    sizeof(struct context_vec));
821 			context_vec_end = context_vec_start + MAX_CONTEXT;
822 			context_vec_ptr = context_vec_start - 1;
823 		}
824 	}
825 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
826 		/*
827 		 * if this new change is within 'context' lines of
828 		 * the previous change, just add it to the change
829 		 * record.  If the record is full or if this
830 		 * change is more than 'context' lines from the previous
831 		 * change, dump the record, reset it & add the new change.
832 		 */
833 		if (context_vec_ptr >= context_vec_end ||
834 		    (context_vec_ptr >= context_vec_start &&
835 		    a > (context_vec_ptr->b + 2 * context) &&
836 		    c > (context_vec_ptr->d + 2 * context))) {
837 			if (opt == D_CONTEXT)
838 				dump_context_vec();
839 			else
840 				dump_unified_vec();
841 		}
842 		context_vec_ptr++;
843 		context_vec_ptr->a = a;
844 		context_vec_ptr->b = b;
845 		context_vec_ptr->c = c;
846 		context_vec_ptr->d = d;
847 		return;
848 	}
849 	switch (opt) {
850 
851 	case D_NORMAL:
852 	case D_EDIT:
853 		range(a, b, ",");
854 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
855 		if (opt == D_NORMAL)
856 			range(c, d, ",");
857 		putchar('\n');
858 		break;
859 	case D_REVERSE:
860 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
861 		range(a, b, " ");
862 		putchar('\n');
863 		break;
864 	case D_NREVERSE:
865 		if (a > b)
866 			printf("a%d %d\n", b, d - c + 1);
867 		else {
868 			printf("d%d %d\n", a, b - a + 1);
869 			if (!(c > d))
870 				/* add changed lines */
871 				printf("a%d %d\n", b, d - c + 1);
872 		}
873 		break;
874 	}
875 	if (opt == D_NORMAL || opt == D_IFDEF) {
876 		fetch(ixold, a, b, input[0], "< ", 1);
877 		if (a <= b && c <= d && opt == D_NORMAL)
878 			prints("---\n");
879 	}
880 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
881 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
882 		prints(".\n");
883 	if (inifdef) {
884 		fprintf(stdout, "#endif /* %s */\n", endifname);
885 		inifdef = 0;
886 	}
887 }
888 
889 static void
890 range(int a, int b, char *separator)
891 {
892 	printf("%d", a > b ? b : a);
893 	if (a < b)
894 		printf("%s%d", separator, b);
895 }
896 
897 static void
898 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
899 {
900 	int oneflag = (*ifdef1 != '\0') != (*ifdef2 != '\0');
901 	int i, j, c, col, nc;
902 
903 	/*
904 	 * When doing #ifdef's, copy down to current line
905 	 * if this is the first file, so that stuff makes it to output.
906 	 */
907 	if (opt == D_IFDEF && oldfile) {
908 		long curpos = ftell(lb);
909 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
910 		nc = f[a > b ? b : a - 1] - curpos;
911 		for (i = 0; i < nc; i++)
912 			putchar(getc(lb));
913 	}
914 	if (a > b)
915 		return;
916 	if (opt == D_IFDEF) {
917 		if (inifdef)
918 			fprintf(stdout, "#else /* %s%s */\n",
919 			    oneflag && oldfile == 1 ? "!" : "", ifdef2);
920 		else {
921 			if (oneflag) {
922 				/* There was only one ifdef given */
923 				endifname = ifdef2;
924 				if (oldfile)
925 					fprintf(stdout, "#ifndef %s\n", endifname);
926 				else
927 					fprintf(stdout, "#ifdef %s\n", endifname);
928 			} else {
929 				endifname = oldfile ? ifdef1 : ifdef2;
930 				fprintf(stdout, "#ifdef %s\n", endifname);
931 			}
932 		}
933 		inifdef = 1 + oldfile;
934 	}
935 	for (i = a; i <= b; i++) {
936 		fseek(lb, f[i - 1], SEEK_SET);
937 		nc = f[i] - f[i - 1];
938 		if (opt != D_IFDEF)
939 			prints(s);
940 		col = 0;
941 		for (j = 0; j < nc; j++) {
942 			c = getc(lb);
943 			if (c == '\t' && tflag)
944 				do
945 					putchar(' ');
946 				while (++col & 7);
947 			else {
948 				putchar(c);
949 				col++;
950 			}
951 		}
952 	}
953 
954 	if (inifdef && !wantelses) {
955 		fprintf(stdout, "#endif /* %s */\n", endifname);
956 		inifdef = 0;
957 	}
958 }
959 
960 #define POW2			/* define only if HALFLONG is 2**n */
961 #define HALFLONG 16
962 #define low(x)	(x&((1L<<HALFLONG)-1))
963 #define high(x)	(x>>HALFLONG)
964 
965 /*
966  * hashing has the effect of
967  * arranging line in 7-bit bytes and then
968  * summing 1-s complement in 16-bit hunks
969  */
970 static int
971 readhash(FILE *f)
972 {
973 	unsigned int shift;
974 	int t, space;
975 	long sum;
976 
977 	sum = 1;
978 	space = 0;
979 	if (!bflag && !wflag) {
980 		if (iflag)
981 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
982 				if (t == -1)
983 					return (0);
984 				sum += (long)chrtran[t] << (shift
985 #ifdef POW2
986 				    &= HALFLONG - 1);
987 #else
988 				    %= HALFLONG);
989 #endif
990 			}
991 		else
992 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
993 				if (t == -1)
994 					return (0);
995 				sum += (long)t << (shift
996 #ifdef POW2
997 				    &= HALFLONG - 1);
998 #else
999 				    %= HALFLONG);
1000 #endif
1001 			}
1002 	} else {
1003 		for (shift = 0;;) {
1004 			switch (t = getc(f)) {
1005 			case -1:
1006 				return (0);
1007 			case '\t':
1008 			case ' ':
1009 				space++;
1010 				continue;
1011 			default:
1012 				if (space && !wflag) {
1013 					shift += 7;
1014 					space = 0;
1015 				}
1016 				sum += (long)chrtran[t] << (shift
1017 #ifdef POW2
1018 				    &= HALFLONG - 1);
1019 #else
1020 				    %= HALFLONG);
1021 #endif
1022 				shift += 7;
1023 				continue;
1024 			case '\n':
1025 				break;
1026 			}
1027 			break;
1028 		}
1029 	}
1030 	sum = low(sum) + high(sum);
1031 	return ((short) low(sum) + (short) high(sum));
1032 }
1033 
1034 static int
1035 asciifile(FILE *f)
1036 {
1037 	char buf[BUFSIZ], *cp;
1038 	int cnt;
1039 
1040 	fseek(f, 0L, SEEK_SET);
1041 	cnt = fread(buf, 1, BUFSIZ, f);
1042 	cp = buf;
1043 	while (--cnt >= 0)
1044 		if (*cp++ & 0200)
1045 			return (0);
1046 	return (1);
1047 }
1048 
1049 /* dump accumulated "context" diff changes */
1050 static void
1051 dump_context_vec(void)
1052 {
1053 	struct context_vec *cvp = context_vec_start;
1054 	int lowa, upb, lowc, upd, do_output;
1055 	int a, b, c, d;
1056 	char ch;
1057 
1058 	if (cvp > context_vec_ptr)
1059 		return;
1060 
1061 	b = d = 0;		/* gcc */
1062 	lowa = max(1, cvp->a - context);
1063 	upb = min(len[0], context_vec_ptr->b + context);
1064 	lowc = max(1, cvp->c - context);
1065 	upd = min(len[1], context_vec_ptr->d + context);
1066 
1067 	printf("***************\n*** ");
1068 	range(lowa, upb, ",");
1069 	printf(" ****\n");
1070 
1071 	/*
1072 	 * output changes to the "old" file.  The first loop suppresses
1073 	 * output if there were no changes to the "old" file (we'll see
1074 	 * the "old" lines as context in the "new" list).
1075 	 */
1076 	do_output = 0;
1077 	for (; cvp <= context_vec_ptr; cvp++)
1078 		if (cvp->a <= cvp->b) {
1079 			cvp = context_vec_start;
1080 			do_output++;
1081 			break;
1082 		}
1083 	if (do_output) {
1084 		while (cvp <= context_vec_ptr) {
1085 			a = cvp->a;
1086 			b = cvp->b;
1087 			c = cvp->c;
1088 			d = cvp->d;
1089 
1090 			if (a <= b && c <= d)
1091 				ch = 'c';
1092 			else
1093 				ch = (a <= b) ? 'd' : 'a';
1094 
1095 			if (ch == 'a')
1096 				fetch(ixold, lowa, b, input[0], "  ", 0);
1097 			else {
1098 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1099 				fetch(ixold, a, b, input[0],
1100 				    ch == 'c' ? "! " : "- ", 0);
1101 			}
1102 			lowa = b + 1;
1103 			cvp++;
1104 		}
1105 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1106 	}
1107 	/* output changes to the "new" file */
1108 	printf("--- ");
1109 	range(lowc, upd, ",");
1110 	printf(" ----\n");
1111 
1112 	do_output = 0;
1113 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1114 		if (cvp->c <= cvp->d) {
1115 			cvp = context_vec_start;
1116 			do_output++;
1117 			break;
1118 		}
1119 	if (do_output) {
1120 		while (cvp <= context_vec_ptr) {
1121 			a = cvp->a;
1122 			b = cvp->b;
1123 			c = cvp->c;
1124 			d = cvp->d;
1125 
1126 			if (a <= b && c <= d)
1127 				ch = 'c';
1128 			else
1129 				ch = (a <= b) ? 'd' : 'a';
1130 
1131 			if (ch == 'd')
1132 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1133 			else {
1134 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1135 				fetch(ixnew, c, d, input[1],
1136 				    ch == 'c' ? "! " : "+ ", 0);
1137 			}
1138 			lowc = d + 1;
1139 			cvp++;
1140 		}
1141 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1142 	}
1143 	context_vec_ptr = context_vec_start - 1;
1144 }
1145 
1146 /* dump accumulated "unified" diff changes */
1147 static void
1148 dump_unified_vec(void)
1149 {
1150 	struct context_vec *cvp = context_vec_start;
1151 	int lowa, upb, lowc, upd;
1152 	int a, b, c, d;
1153 	char ch;
1154 
1155 	if (cvp > context_vec_ptr)
1156 		return;
1157 
1158 	b = d = 0;		/* gcc */
1159 	lowa = max(1, cvp->a - context);
1160 	upb = min(len[0], context_vec_ptr->b + context);
1161 	lowc = max(1, cvp->c - context);
1162 	upd = min(len[1], context_vec_ptr->d + context);
1163 
1164 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1165 	    lowc, upd - lowc + 1);
1166 
1167 	/*
1168 	 * Output changes in "unified" diff format--the old and new lines
1169 	 * are printed together.
1170 	 */
1171 	for (; cvp <= context_vec_ptr; cvp++) {
1172 		a = cvp->a;
1173 		b = cvp->b;
1174 		c = cvp->c;
1175 		d = cvp->d;
1176 
1177 		/*
1178 		 * c: both new and old changes
1179 		 * d: only changes in the old file
1180 		 * a: only changes in the new file
1181 		 */
1182 		if (a <= b && c <= d)
1183 			ch = 'c';
1184 		else
1185 			ch = (a <= b) ? 'd' : 'a';
1186 
1187 		switch (ch) {
1188 		case 'c':
1189 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1190 			fetch(ixold, a, b, input[0], "-", 0);
1191 			fetch(ixnew, c, d, input[1], "+", 0);
1192 			break;
1193 		case 'd':
1194 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1195 			fetch(ixold, a, b, input[0], "-", 0);
1196 			break;
1197 		case 'a':
1198 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1199 			fetch(ixnew, c, d, input[1], "+", 0);
1200 			break;
1201 		}
1202 		lowa = b + 1;
1203 		lowc = d + 1;
1204 	}
1205 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1206 
1207 	context_vec_ptr = context_vec_start - 1;
1208 }
1209