xref: /plan9/sys/src/cmd/diff/diffreg.c (revision 3e12c5d1bb89fc02707907988834ef147769ddaf)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "diff.h"
5 
6 /*	diff - differential file comparison
7 *
8 *	Uses an algorithm due to Harold Stone, which finds
9 *	a pair of longest identical subsequences in the two
10 *	files.
11 *
12 *	The major goal is to generate the match vector J.
13 *	J[i] is the index of the line in file1 corresponding
14 *	to line i file0. J[i] = 0 if there is no
15 *	such line in file1.
16 *
17 *	Lines are hashed so as to work in core. All potential
18 *	matches are located by sorting the lines of each file
19 *	on the hash (called value). In particular, this
20 *	collects the equivalence classes in file1 together.
21 *	Subroutine equiv replaces the value of each line in
22 *	file0 by the index of the first element of its
23 *	matching equivalence in (the reordered) file1.
24 *	To save space equiv squeezes file1 into a single
25 *	array member in which the equivalence classes
26 *	are simply concatenated, except that their first
27 *	members are flagged by changing sign.
28 *
29 *	Next the indices that point into member are unsorted into
30 *	array class according to the original order of file0.
31 *
32 *	The cleverness lies in routine stone. This marches
33 *	through the lines of file0, developing a vector klist
34 *	of "k-candidates". At step i a k-candidate is a matched
35 *	pair of lines x,y (x in file0 y in file1) such that
36 *	there is a common subsequence of lenght k
37 *	between the first i lines of file0 and the first y
38 *	lines of file1, but there is no such subsequence for
39 *	any smaller y. x is the earliest possible mate to y
40 *	that occurs in such a subsequence.
41 *
42 *	Whenever any of the members of the equivalence class of
43 *	lines in file1 matable to a line in file0 has serial number
44 *	less than the y of some k-candidate, that k-candidate
45 *	with the smallest such y is replaced. The new
46 *	k-candidate is chained (via pred) to the current
47 *	k-1 candidate so that the actual subsequence can
48 *	be recovered. When a member has serial number greater
49 *	that the y of all k-candidates, the klist is extended.
50 *	At the end, the longest subsequence is pulled out
51 *	and placed in the array J by unravel.
52 *
53 *	With J in hand, the matches there recorded are
54 *	check'ed against reality to assure that no spurious
55 *	matches have crept in due to hashing. If they have,
56 *	they are broken, and "jackpot " is recorded--a harmless
57 *	matter except that a true match for a spuriously
58 *	mated line may now be unnecessarily reported as a change.
59 *
60 *	Much of the complexity of the program comes simply
61 *	from trying to minimize core utilization and
62 *	maximize the range of doable problems by dynamically
63 *	allocating what is needed and reusing what is not.
64 *	The core requirements for problems larger than somewhat
65 *	are (in words) 2*length(file0) + length(file1) +
66 *	3*(number of k-candidates installed),  typically about
67 *	6n words for files of length n.
68 */
69 /* TIDY THIS UP */
70 struct cand {
71 	int x;
72 	int y;
73 	int pred;
74 } cand;
75 struct line {
76 	int serial;
77 	int value;
78 } *file[2], line;
79 int len[2];
80 struct line *sfile[2];	/*shortened by pruning common prefix and suffix*/
81 int slen[2];
82 int pref, suff;	/*length of prefix and suffix*/
83 int *class;	/*will be overlaid on file[0]*/
84 int *member;	/*will be overlaid on file[1]*/
85 int *klist;		/*will be overlaid on file[0] after class*/
86 struct cand *clist;	/* merely a free storage pot for candidates */
87 int clen;
88 int *J;		/*will be overlaid on class*/
89 long *ixold;	/*will be overlaid on klist*/
90 long *ixnew;	/*will be overlaid on file[1]*/
91 /* END OF SOME TIDYING */
92 
93 static void
94 sort(struct line *a, int n)	/*shellsort CACM #201*/
95 {
96 	int m;
97 	struct line *ai, *aim, *j, *k;
98 	struct line w;
99 	int i;
100 
101 	m = 0;
102 	for (i = 1; i <= n; i *= 2)
103 		m = 2*i - 1;
104 	for (m /= 2; m != 0; m /= 2) {
105 		k = a+(n-m);
106 		for (j = a+1; j <= k; j++) {
107 			ai = j;
108 			aim = ai+m;
109 			do {
110 				if (aim->value > ai->value ||
111 				   aim->value == ai->value &&
112 				   aim->serial > ai->serial)
113 					break;
114 				w = *ai;
115 				*ai = *aim;
116 				*aim = w;
117 
118 				aim = ai;
119 				ai -= m;
120 			} while (ai > a && aim >= ai);
121 		}
122 	}
123 }
124 
125 static void
126 unsort(struct line *f, int l, int *b)
127 {
128 	int *a;
129 	int i;
130 
131 	a = MALLOC(int, (l+1));
132 	for(i=1;i<=l;i++)
133 		a[f[i].serial] = f[i].value;
134 	for(i=1;i<=l;i++)
135 		b[i] = a[i];
136 	FREE(a);
137 }
138 
139 static void
140 prune(void)
141 {
142 	int i,j;
143 
144 	for(pref=0;pref<len[0]&&pref<len[1]&&
145 		file[0][pref+1].value==file[1][pref+1].value;
146 		pref++ ) ;
147 	for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
148 		file[0][len[0]-suff].value==file[1][len[1]-suff].value;
149 		suff++) ;
150 	for(j=0;j<2;j++) {
151 		sfile[j] = file[j]+pref;
152 		slen[j] = len[j]-pref-suff;
153 		for(i=0;i<=slen[j];i++)
154 			sfile[j][i].serial = i;
155 	}
156 }
157 
158 static void
159 equiv(struct line *a, int n, struct line *b, int m, int *c)
160 {
161 	int i, j;
162 
163 	i = j = 1;
164 	while(i<=n && j<=m) {
165 		if(a[i].value < b[j].value)
166 			a[i++].value = 0;
167 		else if(a[i].value == b[j].value)
168 			a[i++].value = j;
169 		else
170 			j++;
171 	}
172 	while(i <= n)
173 		a[i++].value = 0;
174 	b[m+1].value = 0;
175 	j = 0;
176 	while(++j <= m) {
177 		c[j] = -b[j].serial;
178 		while(b[j+1].value == b[j].value) {
179 			j++;
180 			c[j] = b[j].serial;
181 		}
182 	}
183 	c[j] = -1;
184 }
185 
186 static int
187 newcand(int x, int  y, int pred)
188 {
189 	struct cand *q;
190 
191 	clist = REALLOC(clist, struct cand, (clen+1));
192 	q = clist + clen;
193 	q->x = x;
194 	q->y = y;
195 	q->pred = pred;
196 	return clen++;
197 }
198 
199 static int
200 search(int *c, int k, int y)
201 {
202 	int i, j, l;
203 	int t;
204 
205 	if(clist[c[k]].y < y)	/*quick look for typical case*/
206 		return k+1;
207 	i = 0;
208 	j = k+1;
209 	while((l=(i+j)/2) > i) {
210 		t = clist[c[l]].y;
211 		if(t > y)
212 			j = l;
213 		else if(t < y)
214 			i = l;
215 		else
216 			return l;
217 	}
218 	return l+1;
219 }
220 
221 static int
222 stone(int *a, int n, int *b, int *c)
223 {
224 	int i, k,y;
225 	int j, l;
226 	int oldc, tc;
227 	int oldl;
228 
229 	k = 0;
230 	c[0] = newcand(0,0,0);
231 	for(i=1; i<=n; i++) {
232 		j = a[i];
233 		if(j==0)
234 			continue;
235 		y = -b[j];
236 		oldl = 0;
237 		oldc = c[0];
238 		do {
239 			if(y <= clist[oldc].y)
240 				continue;
241 			l = search(c, k, y);
242 			if(l!=oldl+1)
243 				oldc = c[l-1];
244 			if(l<=k) {
245 				if(clist[c[l]].y <= y)
246 					continue;
247 				tc = c[l];
248 				c[l] = newcand(i,y,oldc);
249 				oldc = tc;
250 				oldl = l;
251 			} else {
252 				c[l] = newcand(i,y,oldc);
253 				k++;
254 				break;
255 			}
256 		} while((y=b[++j]) > 0);
257 	}
258 	return k;
259 }
260 
261 static void
262 unravel(int p)
263 {
264 	int i;
265 	struct cand *q;
266 
267 	for(i=0; i<=len[0]; i++) {
268 		if (i <= pref)
269 			J[i] = i;
270 		else if (i > len[0]-suff)
271 			J[i] = i+len[1]-len[0];
272 		else
273 			J[i] = 0;
274 	}
275 	for(q=clist+p;q->y!=0;q=clist+q->pred)
276 		J[q->x+pref] = q->y+pref;
277 }
278 
279 static void
280 output(void)
281 {
282 	int m, i0, i1, j0, j1;
283 
284 	m = len[0];
285 	J[0] = 0;
286 	J[m+1] = len[1]+1;
287 	if (mode != 'e') {
288 		for (i0 = 1; i0 <= m; i0 = i1+1) {
289 			while (i0 <= m && J[i0] == J[i0-1]+1)
290 				i0++;
291 			j0 = J[i0-1]+1;
292 			i1 = i0-1;
293 			while (i1 < m && J[i1+1] == 0)
294 				i1++;
295 			j1 = J[i1+1]-1;
296 			J[i1] = j1;
297 			change(i0, i1, j0, j1);
298 		}
299 	}
300 	else {
301 		for (i0 = m; i0 >= 1; i0 = i1-1) {
302 			while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
303 				i0--;
304 			j0 = J[i0+1]-1;
305 			i1 = i0+1;
306 			while (i1 > 1 && J[i1-1] == 0)
307 				i1--;
308 			j1 = J[i1-1]+1;
309 			J[i1] = j1;
310 			change(i1 , i0, j1, j0);
311 		}
312 	}
313 	if (m == 0)
314 		change(1, 0, 1, len[1]);
315 }
316 
317 void
318 diffreg(char *f, char *t)
319 {
320 	Biobuf *b0, *b1;
321 	int k;
322 
323 	b0 = prepare(0, f);
324 	if (!b0)
325 		return;
326 	b1 = prepare(1, t);
327 	if (!b1) {
328 		FREE(file[0]);
329 		Bclose(b0);
330 		return;
331 	}
332 	clen = 0;
333 	prune();
334 	sort(sfile[0], slen[0]);
335 	sort(sfile[1], slen[1]);
336 
337 	member = (int *)file[1];
338 	equiv(sfile[0], slen[0], sfile[1], slen[1], member);
339 	member = REALLOC(member, int, slen[1]+2);
340 
341 	class = (int *)file[0];
342 	unsort(sfile[0], slen[0], class);
343 	class = REALLOC(class, int, slen[0]+2);
344 
345 	klist = MALLOC(int, slen[0]+2);
346 	clist = MALLOC(struct cand, 1);
347 	k = stone(class, slen[0], member, klist);
348 	FREE(member);
349 	FREE(class);
350 
351 	J = MALLOC(int, len[0]+2);
352 	unravel(klist[k]);
353 	FREE(clist);
354 	FREE(klist);
355 
356 	ixold = MALLOC(long, len[0]+2);
357 	ixnew = MALLOC(long, len[1]+2);
358 	Bseek(b0, 0, 0); Bseek(b1, 0, 0);
359 	check(b0, b1);
360 	output();
361 	FREE(J); FREE(ixold); FREE(ixnew);
362 	Bclose(b0); Bclose(b1);			/* ++++ */
363 }
364