xref: /openbsd-src/usr.bin/diff/diffreg.c (revision 1ba3174772658d281f37df4d297ecec7213ddbe1)
1 /*	$OpenBSD: diffreg.c,v 1.26 2003/07/04 17:37:07 millert Exp $	*/
2 
3 /*
4  * Copyright (C) Caldera International Inc.  2001-2002.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code and documentation must retain the above
11  *    copyright notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed or owned by Caldera
18  *	International, Inc.
19  * 4. Neither the name of Caldera International, Inc. nor the names of other
20  *    contributors may be used to endorse or promote products derived from
21  *    this software without specific prior written permission.
22  *
23  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27  * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28  * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <sys/types.h>
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <string.h>
43 
44 #include "diff.h"
45 #include "pathnames.h"
46 
47 #if 0
48 static char const sccsid[] = "@(#)diffreg.c 4.21 4/6/90";
49 #endif
50 
51 /*
52  * diff - compare two files.
53  */
54 
55 /*
56  *	Uses an algorithm due to Harold Stone, which finds
57  *	a pair of longest identical subsequences in the two
58  *	files.
59  *
60  *	The major goal is to generate the match vector J.
61  *	J[i] is the index of the line in file1 corresponding
62  *	to line i file0. J[i] = 0 if there is no
63  *	such line in file1.
64  *
65  *	Lines are hashed so as to work in core. All potential
66  *	matches are located by sorting the lines of each file
67  *	on the hash (called ``value''). In particular, this
68  *	collects the equivalence classes in file1 together.
69  *	Subroutine equiv replaces the value of each line in
70  *	file0 by the index of the first element of its
71  *	matching equivalence in (the reordered) file1.
72  *	To save space equiv squeezes file1 into a single
73  *	array member in which the equivalence classes
74  *	are simply concatenated, except that their first
75  *	members are flagged by changing sign.
76  *
77  *	Next the indices that point into member are unsorted into
78  *	array class according to the original order of file0.
79  *
80  *	The cleverness lies in routine stone. This marches
81  *	through the lines of file0, developing a vector klist
82  *	of "k-candidates". At step i a k-candidate is a matched
83  *	pair of lines x,y (x in file0 y in file1) such that
84  *	there is a common subsequence of length k
85  *	between the first i lines of file0 and the first y
86  *	lines of file1, but there is no such subsequence for
87  *	any smaller y. x is the earliest possible mate to y
88  *	that occurs in such a subsequence.
89  *
90  *	Whenever any of the members of the equivalence class of
91  *	lines in file1 matable to a line in file0 has serial number
92  *	less than the y of some k-candidate, that k-candidate
93  *	with the smallest such y is replaced. The new
94  *	k-candidate is chained (via pred) to the current
95  *	k-1 candidate so that the actual subsequence can
96  *	be recovered. When a member has serial number greater
97  *	that the y of all k-candidates, the klist is extended.
98  *	At the end, the longest subsequence is pulled out
99  *	and placed in the array J by unravel
100  *
101  *	With J in hand, the matches there recorded are
102  *	check'ed against reality to assure that no spurious
103  *	matches have crept in due to hashing. If they have,
104  *	they are broken, and "jackpot" is recorded--a harmless
105  *	matter except that a true match for a spuriously
106  *	mated line may now be unnecessarily reported as a change.
107  *
108  *	Much of the complexity of the program comes simply
109  *	from trying to minimize core utilization and
110  *	maximize the range of doable problems by dynamically
111  *	allocating what is needed and reusing what is not.
112  *	The core requirements for problems larger than somewhat
113  *	are (in words) 2*length(file0) + length(file1) +
114  *	3*(number of k-candidates installed),  typically about
115  *	6n words for files of length n.
116  */
117 
118 #define	prints(s)	fputs(s,stdout)
119 
120 FILE *input[2];
121 
122 struct cand {
123 	int x;
124 	int y;
125 	int pred;
126 } cand;
127 
128 struct line {
129 	int serial;
130 	int value;
131 } *file[2];
132 
133 int len[2];
134 struct line *sfile[2];	/* shortened by pruning common prefix and suffix */
135 int slen[2];
136 int pref, suff;			/* length of prefix and suffix */
137 int inifdef;			/* whether or not we are in a #ifdef block */
138 int *class;			/* will be overlaid on file[0] */
139 int *member;			/* will be overlaid on file[1] */
140 int *klist;			/* will be overlaid on file[0] after class */
141 struct cand *clist;		/* merely a free storage pot for candidates */
142 int clen = 0;
143 int *J;				/* will be overlaid on class */
144 long *ixold;			/* will be overlaid on klist */
145 long *ixnew;			/* will be overlaid on file[1] */
146 u_char *chrtran;		/* translation table for case-folding */
147 
148 static void fetch(long *, int, int, FILE *, char *, int);
149 static void output(void);
150 static void check(void);
151 static void range(int, int, char *);
152 static void dump_context_vec(void);
153 static void dump_unified_vec(void);
154 static void prepare(int, FILE *);
155 static void prune(void);
156 static void equiv(struct line *, int, struct line *, int, int *);
157 static void unravel(int);
158 static void unsort(struct line *, int, int *);
159 static void change(int, int, int, int);
160 static void sort(struct line *, int);
161 static int newcand(int, int, int);
162 static int search(int *, int, int);
163 static int skipline(int);
164 static int asciifile(FILE *);
165 static int stone(int *, int, int *, int *);
166 static int readhash(FILE *);
167 
168 /*
169  * chrtran points to one of 2 translation tables: cup2low if folding upper to
170  * lower case clow2low if not folding case
171  */
172 u_char clow2low[256] = {
173 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
174 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
175 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
176 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
177 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
178 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
179 	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
180 	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
181 	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
182 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
183 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
184 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
185 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
186 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
187 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
188 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
189 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
190 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
191 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
192 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
193 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
194 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
195 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
196 	0xfd, 0xfe, 0xff
197 };
198 
199 u_char cup2low[256] = {
200 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
201 	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
202 	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
203 	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
204 	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
205 	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
206 	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
207 	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
208 	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
209 	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
210 	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
211 	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
212 	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
213 	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
214 	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
215 	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
216 	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
217 	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
218 	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
219 	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
220 	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
221 	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
222 	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
223 	0xfd, 0xfe, 0xff
224 };
225 
226 void
227 diffreg(void)
228 {
229 	char buf1[BUFSIZ], buf2[BUFSIZ];
230 	FILE *f1, *f2;
231 	int i, j;
232 
233 	chrtran = (iflag ? cup2low : clow2low);
234 	if (strcmp(file1, "-") == 0 && strcmp(file2, "-") == 0)
235 		errorx("can't specify - -");
236 	if (S_ISDIR(stb1.st_mode)) {
237 		file1 = splice(file1, file2);
238 		if (stat(file1, &stb1) < 0)
239 			error("%s", file1);
240 	} else if (strcmp(file1, "-") == 0 ||
241 	    (!S_ISREG(stb1.st_mode) && strcmp(file1, _PATH_DEVNULL) != 0)) {
242 		file1 = copytemp(file1, 1);
243 		if (stat(file1, &stb1) < 0)
244 			error("%s", file1);
245 	}
246 	if (S_ISDIR(stb2.st_mode)) {
247 		file2 = splice(file2, file1);
248 		if (stat(file2, &stb2) < 0)
249 			error("%s", file2);
250 	} else if (strcmp(file2, "-") == 0 ||
251 	    (!S_ISREG(stb2.st_mode) && strcmp(file2, _PATH_DEVNULL) != 0)) {
252 		file2 = copytemp(file2, 2);
253 		if (stat(file2, &stb2) < 0)
254 			error("%s", file2);
255 	}
256 	if ((f1 = fopen(file1, "r")) == NULL)
257 		error("%s", file1);
258 	if ((f2 = fopen(file2, "r")) == NULL)
259 		error("%s", file2);
260 	if ((stb1.st_mode & S_IFMT) != (stb2.st_mode & S_IFMT) ||
261 	    stb1.st_size != stb2.st_size)
262 		goto notsame;
263 	for (;;) {
264 		i = fread(buf1, 1, BUFSIZ, f1);
265 		j = fread(buf2, 1, BUFSIZ, f2);
266 		if (i < 0 || j < 0 || i != j)
267 			goto notsame;
268 		if (i == 0 && j == 0) {
269 			fclose(f1);
270 			fclose(f2);
271 			status = 0;	/* files don't differ */
272 			goto same;
273 		}
274 		for (j = 0; j < i; j++)
275 			if (buf1[j] != buf2[j])
276 				goto notsame;
277 	}
278 notsame:
279 	/*
280 	 * Files certainly differ at this point; set status accordingly
281 	 */
282 	status = 1;
283 	if (!asciifile(f1) || !asciifile(f2)) {
284 		printf("Binary files %s and %s differ\n", file1, file2);
285 		exit(status);
286 	}
287 	prepare(0, f1);
288 	prepare(1, f2);
289 	fclose(f1);
290 	fclose(f2);
291 	prune();
292 	sort(sfile[0], slen[0]);
293 	sort(sfile[1], slen[1]);
294 
295 	member = (int *)file[1];
296 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
297 	member = erealloc(member, (slen[1] + 2) * sizeof(int));
298 
299 	class = (int *)file[0];
300 	unsort(sfile[0], slen[0], class);
301 	class = erealloc(class, (slen[0] + 2) * sizeof(int));
302 
303 	klist = emalloc((slen[0] + 2) * sizeof(int));
304 	clist = emalloc(sizeof(cand));
305 	i = stone(class, slen[0], member, klist);
306 	free(member);
307 	free(class);
308 
309 	J = emalloc((len[0] + 2) * sizeof(int));
310 	unravel(klist[i]);
311 	free(clist);
312 	free(klist);
313 
314 	ixold = emalloc((len[0] + 2) * sizeof(long));
315 	ixnew = emalloc((len[1] + 2) * sizeof(long));
316 	check();
317 	output();
318 	status = anychange;
319 same:
320 	if (anychange == 0 && (opt == D_CONTEXT || opt == D_UNIFIED))
321 		printf("No differences encountered\n");
322 }
323 
324 char *tempfiles[2];
325 
326 char *
327 copytemp(const char *file, int n)
328 {
329 	char buf[BUFSIZ], *tempdir, *tempfile;
330 	int i, ifd, ofd;
331 
332 	if (n != 1 && n != 2)
333 		return (NULL);
334 
335 	if (strcmp(file, "-") == 0)
336 		ifd = STDIN_FILENO;
337 	else if ((ifd = open(file, O_RDONLY, 0644)) < 0)
338 		error("%s", file);
339 
340 	if ((tempdir = getenv("TMPDIR")) == NULL)
341 		tempdir = _PATH_TMP;
342 	if (asprintf(&tempfile, "%s/diff%d.XXXXXXXX", tempdir, n) == -1)
343 		error(NULL);
344 	tempfiles[n - 1] = tempfile;
345 
346 	signal(SIGHUP, done);
347 	signal(SIGINT, done);
348 	signal(SIGPIPE, done);
349 	signal(SIGTERM, done);
350 	ofd = mkstemp(tempfile);
351 	if (ofd < 0)
352 		error("%s", tempfile);
353 	while ((i = read(ifd, buf, BUFSIZ)) > 0) {
354 		if (write(ofd, buf, i) != i)
355 			error("%s", tempfile);
356 	}
357 	close(ifd);
358 	close(ofd);
359 	return (tempfile);
360 }
361 
362 char *
363 splice(char *dir, char *file)
364 {
365 	char *tail, *buf;
366 	size_t len;
367 
368 	if (!strcmp(file, "-"))
369 		errorx("can't specify - with other arg directory");
370 	tail = strrchr(file, '/');
371 	if (tail == NULL)
372 		tail = file;
373 	else
374 		tail++;
375 	len = strlen(dir) + 1 + strlen(tail) + 1;
376 	buf = emalloc(len);
377 	snprintf(buf, len, "%s/%s", dir, tail);
378 	return (buf);
379 }
380 
381 static void
382 prepare(int i, FILE *fd)
383 {
384 	struct line *p;
385 	int j, h;
386 
387 	fseek(fd, 0L, SEEK_SET);
388 	p = emalloc(3 * sizeof(struct line));
389 	for (j = 0; (h = readhash(fd));) {
390 		p = erealloc(p, (++j + 3) * sizeof(struct line));
391 		p[j].value = h;
392 	}
393 	len[i] = j;
394 	file[i] = p;
395 }
396 
397 static void
398 prune(void)
399 {
400 	int i, j;
401 
402 	for (pref = 0; pref < len[0] && pref < len[1] &&
403 	    file[0][pref + 1].value == file[1][pref + 1].value;
404 	    pref++)
405 		;
406 	for (suff = 0; suff < len[0] - pref && suff < len[1] - pref &&
407 	    file[0][len[0] - suff].value == file[1][len[1] - suff].value;
408 	    suff++)
409 		;
410 	for (j = 0; j < 2; j++) {
411 		sfile[j] = file[j] + pref;
412 		slen[j] = len[j] - pref - suff;
413 		for (i = 0; i <= slen[j]; i++)
414 			sfile[j][i].serial = i;
415 	}
416 }
417 
418 static void
419 equiv(struct line *a, int n, struct line *b, int m, int *c)
420 {
421 	int i, j;
422 
423 	i = j = 1;
424 	while (i <= n && j <= m) {
425 		if (a[i].value < b[j].value)
426 			a[i++].value = 0;
427 		else if (a[i].value == b[j].value)
428 			a[i++].value = j;
429 		else
430 			j++;
431 	}
432 	while (i <= n)
433 		a[i++].value = 0;
434 	b[m + 1].value = 0;
435 	j = 0;
436 	while (++j <= m) {
437 		c[j] = -b[j].serial;
438 		while (b[j + 1].value == b[j].value) {
439 			j++;
440 			c[j] = b[j].serial;
441 		}
442 	}
443 	c[j] = -1;
444 }
445 
446 static int
447 stone(int *a, int n, int *b, int *c)
448 {
449 	int i, k, y, j, l;
450 	int oldc, tc, oldl;
451 
452 	k = 0;
453 	c[0] = newcand(0, 0, 0);
454 	for (i = 1; i <= n; i++) {
455 		j = a[i];
456 		if (j == 0)
457 			continue;
458 		y = -b[j];
459 		oldl = 0;
460 		oldc = c[0];
461 		do {
462 			if (y <= clist[oldc].y)
463 				continue;
464 			l = search(c, k, y);
465 			if (l != oldl + 1)
466 				oldc = c[l - 1];
467 			if (l <= k) {
468 				if (clist[c[l]].y <= y)
469 					continue;
470 				tc = c[l];
471 				c[l] = newcand(i, y, oldc);
472 				oldc = tc;
473 				oldl = l;
474 			} else {
475 				c[l] = newcand(i, y, oldc);
476 				k++;
477 				break;
478 			}
479 		} while ((y = b[++j]) > 0);
480 	}
481 	return (k);
482 }
483 
484 static int
485 newcand(int x, int y, int pred)
486 {
487 	struct cand *q;
488 
489 	clist = erealloc(clist, ++clen * sizeof(cand));
490 	q = clist + clen - 1;
491 	q->x = x;
492 	q->y = y;
493 	q->pred = pred;
494 	return (clen - 1);
495 }
496 
497 static int
498 search(int *c, int k, int y)
499 {
500 	int i, j, l, t;
501 
502 	if (clist[c[k]].y < y)	/* quick look for typical case */
503 		return (k + 1);
504 	i = 0;
505 	j = k + 1;
506 	while (1) {
507 		l = i + j;
508 		if ((l >>= 1) <= i)
509 			break;
510 		t = clist[c[l]].y;
511 		if (t > y)
512 			j = l;
513 		else if (t < y)
514 			i = l;
515 		else
516 			return (l);
517 	}
518 	return (l + 1);
519 }
520 
521 static void
522 unravel(int p)
523 {
524 	struct cand *q;
525 	int i;
526 
527 	for (i = 0; i <= len[0]; i++)
528 		J[i] = i <= pref ? i :
529 		    i > len[0] - suff ? i + len[1] - len[0] : 0;
530 	for (q = clist + p; q->y != 0; q = clist + q->pred)
531 		J[q->x + pref] = q->y + pref;
532 }
533 
534 /*
535  * Check does double duty:
536  *  1.	ferret out any fortuitous correspondences due
537  *	to confounding by hashing (which result in "jackpot")
538  *  2.  collect random access indexes to the two files
539  */
540 static void
541 check(void)
542 {
543 	int i, j, jackpot, c, d;
544 	long ctold, ctnew;
545 
546 	if ((input[0] = fopen(file1, "r")) == NULL)
547 		error("%s", file1);
548 	if ((input[1] = fopen(file2, "r")) == NULL)
549 		error("%s", file2);
550 	j = 1;
551 	ixold[0] = ixnew[0] = 0;
552 	jackpot = 0;
553 	ctold = ctnew = 0;
554 	for (i = 1; i <= len[0]; i++) {
555 		if (J[i] == 0) {
556 			ixold[i] = ctold += skipline(0);
557 			continue;
558 		}
559 		while (j < J[i]) {
560 			ixnew[j] = ctnew += skipline(1);
561 			j++;
562 		}
563 		if (bflag || wflag || iflag) {
564 			for (;;) {
565 				c = getc(input[0]);
566 				d = getc(input[1]);
567 				ctold++;
568 				ctnew++;
569 				if (bflag && isspace(c) && isspace(d)) {
570 					do {
571 						if (c == '\n')
572 							break;
573 						ctold++;
574 					} while (isspace(c = getc(input[0])));
575 					do {
576 						if (d == '\n')
577 							break;
578 						ctnew++;
579 					} while (isspace(d = getc(input[1])));
580 				} else if (wflag) {
581 					while (isspace(c) && c != '\n') {
582 						c = getc(input[0]);
583 						ctold++;
584 					}
585 					while (isspace(d) && d != '\n') {
586 						d = getc(input[1]);
587 						ctnew++;
588 					}
589 				}
590 				if (chrtran[c] != chrtran[d]) {
591 					jackpot++;
592 					J[i] = 0;
593 					if (c != '\n')
594 						ctold += skipline(0);
595 					if (d != '\n')
596 						ctnew += skipline(1);
597 					break;
598 				}
599 				if (c == '\n')
600 					break;
601 			}
602 		} else {
603 			for (;;) {
604 				ctold++;
605 				ctnew++;
606 				if ((c = getc(input[0])) != (d = getc(input[1]))) {
607 					/* jackpot++; */
608 					J[i] = 0;
609 					if (c != '\n')
610 						ctold += skipline(0);
611 					if (d != '\n')
612 						ctnew += skipline(1);
613 					break;
614 				}
615 				if (c == '\n')
616 					break;
617 			}
618 		}
619 		ixold[i] = ctold;
620 		ixnew[j] = ctnew;
621 		j++;
622 	}
623 	for (; j <= len[1]; j++) {
624 		ixnew[j] = ctnew += skipline(1);
625 	}
626 	fclose(input[0]);
627 	fclose(input[1]);
628 	/*
629 	 * if (jackpot)
630 	 *	fprintf(stderr, "jackpot\n");
631 	 */
632 }
633 
634 /* shellsort CACM #201 */
635 static void
636 sort(struct line *a, int n)
637 {
638 	struct line *ai, *aim, w;
639 	int j, m = 0, k;
640 
641 	if (n == 0)
642 		return;
643 	for (j = 1; j <= n; j *= 2)
644 		m = 2 * j - 1;
645 	for (m /= 2; m != 0; m /= 2) {
646 		k = n - m;
647 		for (j = 1; j <= k; j++) {
648 			for (ai = &a[j]; ai > a; ai -= m) {
649 				aim = &ai[m];
650 				if (aim < ai)
651 					break;	/* wraparound */
652 				if (aim->value > ai[0].value ||
653 				    (aim->value == ai[0].value &&
654 					aim->serial > ai[0].serial))
655 					break;
656 				w.value = ai[0].value;
657 				ai[0].value = aim->value;
658 				aim->value = w.value;
659 				w.serial = ai[0].serial;
660 				ai[0].serial = aim->serial;
661 				aim->serial = w.serial;
662 			}
663 		}
664 	}
665 }
666 
667 static void
668 unsort(struct line *f, int l, int *b)
669 {
670 	int *a, i;
671 
672 	a = emalloc((l + 1) * sizeof(int));
673 	for (i = 1; i <= l; i++)
674 		a[f[i].serial] = f[i].value;
675 	for (i = 1; i <= l; i++)
676 		b[i] = a[i];
677 	free(a);
678 }
679 
680 static int
681 skipline(int f)
682 {
683 	int i, c;
684 
685 	for (i = 1; (c = getc(input[f])) != '\n'; i++)
686 		if (c < 0)
687 			return (i);
688 	return (i);
689 }
690 
691 static void
692 output(void)
693 {
694 	int m, i0, i1, j0, j1;
695 
696 	input[0] = fopen(file1, "r");
697 	input[1] = fopen(file2, "r");
698 	m = len[0];
699 	J[0] = 0;
700 	J[m + 1] = len[1] + 1;
701 	if (opt != D_EDIT) {
702 		for (i0 = 1; i0 <= m; i0 = i1 + 1) {
703 			while (i0 <= m && J[i0] == J[i0 - 1] + 1)
704 				i0++;
705 			j0 = J[i0 - 1] + 1;
706 			i1 = i0 - 1;
707 			while (i1 < m && J[i1 + 1] == 0)
708 				i1++;
709 			j1 = J[i1 + 1] - 1;
710 			J[i1] = j1;
711 			change(i0, i1, j0, j1);
712 		}
713 	} else {
714 		for (i0 = m; i0 >= 1; i0 = i1 - 1) {
715 			while (i0 >= 1 && J[i0] == J[i0 + 1] - 1 && J[i0] != 0)
716 				i0--;
717 			j0 = J[i0 + 1] - 1;
718 			i1 = i0 + 1;
719 			while (i1 > 1 && J[i1 - 1] == 0)
720 				i1--;
721 			j1 = J[i1 - 1] + 1;
722 			J[i1] = j1;
723 			change(i1, i0, j1, j0);
724 		}
725 	}
726 	if (m == 0)
727 		change(1, 0, 1, len[1]);
728 	if (opt == D_IFDEF) {
729 		for (;;) {
730 #define	c i0
731 			c = getc(input[0]);
732 			if (c < 0)
733 				return;
734 			putchar(c);
735 		}
736 #undef c
737 	}
738 	if (anychange != 0) {
739 		if (opt == D_CONTEXT)
740 			dump_context_vec();
741 		else if (opt == D_UNIFIED)
742 			dump_unified_vec();
743 	}
744 }
745 
746 /*
747  * The following struct is used to record change information when
748  * doing a "context" diff.  (see routine "change" to understand the
749  * highly mneumonic field names)
750  */
751 struct context_vec {
752 	int a;			/* start line in old file */
753 	int b;			/* end line in old file */
754 	int c;			/* start line in new file */
755 	int d;			/* end line in new file */
756 };
757 
758 struct context_vec *context_vec_start, *context_vec_end, *context_vec_ptr;
759 
760 #define	MAX_CONTEXT	128
761 
762 /*
763  * indicate that there is a difference between lines a and b of the from file
764  * to get to lines c to d of the to file. If a is greater then b then there
765  * are no lines in the from file involved and this means that there were
766  * lines appended (beginning at b). If c is greater than d then there are
767  * lines missing from the to file.
768  */
769 static void
770 change(int a, int b, int c, int d)
771 {
772 	struct stat stbuf;
773 
774 	if (opt != D_IFDEF && a > b && c > d)
775 		return;
776 	if (anychange == 0) {
777 		anychange = 1;
778 		if (opt == D_CONTEXT || opt == D_UNIFIED) {
779 			stat(file1, &stbuf);
780 			printf("%s %s	%s", opt == D_CONTEXT ? "***" : "---",
781 			   file1, ctime(&stbuf.st_mtime));
782 			stat(file2, &stbuf);
783 			printf("%s %s	%s", opt == D_CONTEXT ? "---" : "+++",
784 			    file2, ctime(&stbuf.st_mtime));
785 			context_vec_start = emalloc(MAX_CONTEXT *
786 			    sizeof(struct context_vec));
787 			context_vec_end = context_vec_start + MAX_CONTEXT;
788 			context_vec_ptr = context_vec_start - 1;
789 		}
790 	}
791 	if (opt == D_CONTEXT || opt == D_UNIFIED) {
792 		/*
793 		 * If this new change is within 'context' lines of
794 		 * the previous change, just add it to the change
795 		 * record.  If the record is full or if this
796 		 * change is more than 'context' lines from the previous
797 		 * change, dump the record, reset it & add the new change.
798 		 */
799 		if (context_vec_ptr >= context_vec_end ||
800 		    (context_vec_ptr >= context_vec_start &&
801 		    a > (context_vec_ptr->b + 2 * context) &&
802 		    c > (context_vec_ptr->d + 2 * context))) {
803 			if (opt == D_CONTEXT)
804 				dump_context_vec();
805 			else
806 				dump_unified_vec();
807 		}
808 		context_vec_ptr++;
809 		context_vec_ptr->a = a;
810 		context_vec_ptr->b = b;
811 		context_vec_ptr->c = c;
812 		context_vec_ptr->d = d;
813 		return;
814 	}
815 	switch (opt) {
816 
817 	case D_NORMAL:
818 	case D_EDIT:
819 		range(a, b, ",");
820 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
821 		if (opt == D_NORMAL)
822 			range(c, d, ",");
823 		putchar('\n');
824 		break;
825 	case D_REVERSE:
826 		putchar(a > b ? 'a' : c > d ? 'd' : 'c');
827 		range(a, b, " ");
828 		putchar('\n');
829 		break;
830 	case D_NREVERSE:
831 		if (a > b)
832 			printf("a%d %d\n", b, d - c + 1);
833 		else {
834 			printf("d%d %d\n", a, b - a + 1);
835 			if (!(c > d))
836 				/* add changed lines */
837 				printf("a%d %d\n", b, d - c + 1);
838 		}
839 		break;
840 	}
841 	if (opt == D_NORMAL || opt == D_IFDEF) {
842 		fetch(ixold, a, b, input[0], "< ", 1);
843 		if (a <= b && c <= d && opt == D_NORMAL)
844 			prints("---\n");
845 	}
846 	fetch(ixnew, c, d, input[1], opt == D_NORMAL ? "> " : "", 0);
847 	if ((opt == D_EDIT || opt == D_REVERSE) && c <= d)
848 		prints(".\n");
849 	if (inifdef) {
850 		fprintf(stdout, "#endif /* %s */\n", ifdefname);
851 		inifdef = 0;
852 	}
853 }
854 
855 static void
856 range(int a, int b, char *separator)
857 {
858 	printf("%d", a > b ? b : a);
859 	if (a < b)
860 		printf("%s%d", separator, b);
861 }
862 
863 static void
864 fetch(long *f, int a, int b, FILE *lb, char *s, int oldfile)
865 {
866 	int i, j, c, col, nc;
867 
868 	/*
869 	 * When doing #ifdef's, copy down to current line
870 	 * if this is the first file, so that stuff makes it to output.
871 	 */
872 	if (opt == D_IFDEF && oldfile) {
873 		long curpos = ftell(lb);
874 		/* print through if append (a>b), else to (nb: 0 vs 1 orig) */
875 		nc = f[a > b ? b : a - 1] - curpos;
876 		for (i = 0; i < nc; i++)
877 			putchar(getc(lb));
878 	}
879 	if (a > b)
880 		return;
881 	if (opt == D_IFDEF) {
882 		if (inifdef) {
883 			fprintf(stdout, "#else /* %s%s */\n",
884 			    oldfile == 1 ? "!" : "", ifdefname);
885 		} else {
886 			if (oldfile)
887 				fprintf(stdout, "#ifndef %s\n", ifdefname);
888 			else
889 				fprintf(stdout, "#ifdef %s\n", ifdefname);
890 		}
891 		inifdef = 1 + oldfile;
892 	}
893 	for (i = a; i <= b; i++) {
894 		fseek(lb, f[i - 1], SEEK_SET);
895 		nc = f[i] - f[i - 1];
896 		if (opt != D_IFDEF)
897 			prints(s);
898 		col = 0;
899 		for (j = 0; j < nc; j++) {
900 			c = getc(lb);
901 			if (c == '\t' && tflag)
902 				do
903 					putchar(' ');
904 				while (++col & 7);
905 			else {
906 				putchar(c);
907 				col++;
908 			}
909 		}
910 	}
911 }
912 
913 #define POW2			/* define only if HALFLONG is 2**n */
914 #define HALFLONG 16
915 #define low(x)	(x&((1L<<HALFLONG)-1))
916 #define high(x)	(x>>HALFLONG)
917 
918 /*
919  * hashing has the effect of
920  * arranging line in 7-bit bytes and then
921  * summing 1-s complement in 16-bit hunks
922  */
923 static int
924 readhash(FILE *f)
925 {
926 	unsigned int shift;
927 	int t, space;
928 	long sum;
929 
930 	sum = 1;
931 	space = 0;
932 	if (!bflag && !wflag) {
933 		if (iflag)
934 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
935 				if (t == -1)
936 					return (0);
937 				sum += (long)chrtran[t] << (shift
938 #ifdef POW2
939 				    &= HALFLONG - 1);
940 #else
941 				    %= HALFLONG);
942 #endif
943 			}
944 		else
945 			for (shift = 0; (t = getc(f)) != '\n'; shift += 7) {
946 				if (t == -1)
947 					return (0);
948 				sum += (long)t << (shift
949 #ifdef POW2
950 				    &= HALFLONG - 1);
951 #else
952 				    %= HALFLONG);
953 #endif
954 			}
955 	} else {
956 		for (shift = 0;;) {
957 			switch (t = getc(f)) {
958 			case -1:
959 				return (0);
960 			case '\t':
961 			case ' ':
962 				space++;
963 				continue;
964 			default:
965 				if (space && !wflag) {
966 					shift += 7;
967 					space = 0;
968 				}
969 				sum += (long)chrtran[t] << (shift
970 #ifdef POW2
971 				    &= HALFLONG - 1);
972 #else
973 				    %= HALFLONG);
974 #endif
975 				shift += 7;
976 				continue;
977 			case '\n':
978 				break;
979 			}
980 			break;
981 		}
982 	}
983 	sum = low(sum) + high(sum);
984 	return ((short) low(sum) + (short) high(sum));
985 }
986 
987 static int
988 asciifile(FILE *f)
989 {
990 	char buf[BUFSIZ], *cp;
991 	int cnt;
992 
993 	if (aflag)
994 		return (1);
995 
996 	fseek(f, 0L, SEEK_SET);
997 	cnt = fread(buf, 1, BUFSIZ, f);
998 	cp = buf;
999 	while (--cnt >= 0)
1000 		if (*cp++ & 0200)
1001 			return (0);
1002 	return (1);
1003 }
1004 
1005 /* dump accumulated "context" diff changes */
1006 static void
1007 dump_context_vec(void)
1008 {
1009 	struct context_vec *cvp = context_vec_start;
1010 	int lowa, upb, lowc, upd, do_output;
1011 	int a, b, c, d;
1012 	char ch;
1013 
1014 	if (context_vec_start > context_vec_ptr)
1015 		return;
1016 
1017 	b = d = 0;		/* gcc */
1018 	lowa = max(1, cvp->a - context);
1019 	upb = min(len[0], context_vec_ptr->b + context);
1020 	lowc = max(1, cvp->c - context);
1021 	upd = min(len[1], context_vec_ptr->d + context);
1022 
1023 	printf("***************\n*** ");
1024 	range(lowa, upb, ",");
1025 	printf(" ****\n");
1026 
1027 	/*
1028 	 * output changes to the "old" file.  The first loop suppresses
1029 	 * output if there were no changes to the "old" file (we'll see
1030 	 * the "old" lines as context in the "new" list).
1031 	 */
1032 	do_output = 0;
1033 	for (; cvp <= context_vec_ptr; cvp++)
1034 		if (cvp->a <= cvp->b) {
1035 			cvp = context_vec_start;
1036 			do_output++;
1037 			break;
1038 		}
1039 	if (do_output) {
1040 		while (cvp <= context_vec_ptr) {
1041 			a = cvp->a;
1042 			b = cvp->b;
1043 			c = cvp->c;
1044 			d = cvp->d;
1045 
1046 			if (a <= b && c <= d)
1047 				ch = 'c';
1048 			else
1049 				ch = (a <= b) ? 'd' : 'a';
1050 
1051 			if (ch == 'a')
1052 				fetch(ixold, lowa, b, input[0], "  ", 0);
1053 			else {
1054 				fetch(ixold, lowa, a - 1, input[0], "  ", 0);
1055 				fetch(ixold, a, b, input[0],
1056 				    ch == 'c' ? "! " : "- ", 0);
1057 			}
1058 			lowa = b + 1;
1059 			cvp++;
1060 		}
1061 		fetch(ixold, b + 1, upb, input[0], "  ", 0);
1062 	}
1063 	/* output changes to the "new" file */
1064 	printf("--- ");
1065 	range(lowc, upd, ",");
1066 	printf(" ----\n");
1067 
1068 	do_output = 0;
1069 	for (cvp = context_vec_start; cvp <= context_vec_ptr; cvp++)
1070 		if (cvp->c <= cvp->d) {
1071 			cvp = context_vec_start;
1072 			do_output++;
1073 			break;
1074 		}
1075 	if (do_output) {
1076 		while (cvp <= context_vec_ptr) {
1077 			a = cvp->a;
1078 			b = cvp->b;
1079 			c = cvp->c;
1080 			d = cvp->d;
1081 
1082 			if (a <= b && c <= d)
1083 				ch = 'c';
1084 			else
1085 				ch = (a <= b) ? 'd' : 'a';
1086 
1087 			if (ch == 'd')
1088 				fetch(ixnew, lowc, d, input[1], "  ", 0);
1089 			else {
1090 				fetch(ixnew, lowc, c - 1, input[1], "  ", 0);
1091 				fetch(ixnew, c, d, input[1],
1092 				    ch == 'c' ? "! " : "+ ", 0);
1093 			}
1094 			lowc = d + 1;
1095 			cvp++;
1096 		}
1097 		fetch(ixnew, d + 1, upd, input[1], "  ", 0);
1098 	}
1099 	context_vec_ptr = context_vec_start - 1;
1100 }
1101 
1102 /* dump accumulated "unified" diff changes */
1103 static void
1104 dump_unified_vec(void)
1105 {
1106 	struct context_vec *cvp = context_vec_start;
1107 	int lowa, upb, lowc, upd;
1108 	int a, b, c, d;
1109 	char ch;
1110 
1111 	if (context_vec_start > context_vec_ptr)
1112 		return;
1113 
1114 	b = d = 0;		/* gcc */
1115 	lowa = max(1, cvp->a - context);
1116 	upb = min(len[0], context_vec_ptr->b + context);
1117 	lowc = max(1, cvp->c - context);
1118 	upd = min(len[1], context_vec_ptr->d + context);
1119 
1120 	printf("@@ -%d,%d +%d,%d @@\n", lowa, upb - lowa + 1,
1121 	    lowc, upd - lowc + 1);
1122 
1123 	/*
1124 	 * Output changes in "unified" diff format--the old and new lines
1125 	 * are printed together.
1126 	 */
1127 	for (; cvp <= context_vec_ptr; cvp++) {
1128 		a = cvp->a;
1129 		b = cvp->b;
1130 		c = cvp->c;
1131 		d = cvp->d;
1132 
1133 		/*
1134 		 * c: both new and old changes
1135 		 * d: only changes in the old file
1136 		 * a: only changes in the new file
1137 		 */
1138 		if (a <= b && c <= d)
1139 			ch = 'c';
1140 		else
1141 			ch = (a <= b) ? 'd' : 'a';
1142 
1143 		switch (ch) {
1144 		case 'c':
1145 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1146 			fetch(ixold, a, b, input[0], "-", 0);
1147 			fetch(ixnew, c, d, input[1], "+", 0);
1148 			break;
1149 		case 'd':
1150 			fetch(ixold, lowa, a - 1, input[0], " ", 0);
1151 			fetch(ixold, a, b, input[0], "-", 0);
1152 			break;
1153 		case 'a':
1154 			fetch(ixnew, lowc, c - 1, input[1], " ", 0);
1155 			fetch(ixnew, c, d, input[1], "+", 0);
1156 			break;
1157 		}
1158 		lowa = b + 1;
1159 		lowc = d + 1;
1160 	}
1161 	fetch(ixnew, d + 1, upd, input[1], " ", 0);
1162 
1163 	context_vec_ptr = context_vec_start - 1;
1164 }
1165