1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "diff.h" 5 6 /* diff - differential file comparison 7 * 8 * Uses an algorithm due to Harold Stone, which finds 9 * a pair of longest identical subsequences in the two 10 * files. 11 * 12 * The major goal is to generate the match vector J. 13 * J[i] is the index of the line in file1 corresponding 14 * to line i file0. J[i] = 0 if there is no 15 * such line in file1. 16 * 17 * Lines are hashed so as to work in core. All potential 18 * matches are located by sorting the lines of each file 19 * on the hash (called value). In particular, this 20 * collects the equivalence classes in file1 together. 21 * Subroutine equiv replaces the value of each line in 22 * file0 by the index of the first element of its 23 * matching equivalence in (the reordered) file1. 24 * To save space equiv squeezes file1 into a single 25 * array member in which the equivalence classes 26 * are simply concatenated, except that their first 27 * members are flagged by changing sign. 28 * 29 * Next the indices that point into member are unsorted into 30 * array class according to the original order of file0. 31 * 32 * The cleverness lies in routine stone. This marches 33 * through the lines of file0, developing a vector klist 34 * of "k-candidates". At step i a k-candidate is a matched 35 * pair of lines x,y (x in file0 y in file1) such that 36 * there is a common subsequence of lenght k 37 * between the first i lines of file0 and the first y 38 * lines of file1, but there is no such subsequence for 39 * any smaller y. x is the earliest possible mate to y 40 * that occurs in such a subsequence. 41 * 42 * Whenever any of the members of the equivalence class of 43 * lines in file1 matable to a line in file0 has serial number 44 * less than the y of some k-candidate, that k-candidate 45 * with the smallest such y is replaced. The new 46 * k-candidate is chained (via pred) to the current 47 * k-1 candidate so that the actual subsequence can 48 * be recovered. When a member has serial number greater 49 * that the y of all k-candidates, the klist is extended. 50 * At the end, the longest subsequence is pulled out 51 * and placed in the array J by unravel. 52 * 53 * With J in hand, the matches there recorded are 54 * check'ed against reality to assure that no spurious 55 * matches have crept in due to hashing. If they have, 56 * they are broken, and "jackpot " is recorded--a harmless 57 * matter except that a true match for a spuriously 58 * mated line may now be unnecessarily reported as a change. 59 * 60 * Much of the complexity of the program comes simply 61 * from trying to minimize core utilization and 62 * maximize the range of doable problems by dynamically 63 * allocating what is needed and reusing what is not. 64 * The core requirements for problems larger than somewhat 65 * are (in words) 2*length(file0) + length(file1) + 66 * 3*(number of k-candidates installed), typically about 67 * 6n words for files of length n. 68 */ 69 /* TIDY THIS UP */ 70 struct cand { 71 int x; 72 int y; 73 int pred; 74 } cand; 75 struct line { 76 int serial; 77 int value; 78 } *file[2], line; 79 int len[2]; 80 struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ 81 int slen[2]; 82 int pref, suff; /*length of prefix and suffix*/ 83 int *class; /*will be overlaid on file[0]*/ 84 int *member; /*will be overlaid on file[1]*/ 85 int *klist; /*will be overlaid on file[0] after class*/ 86 struct cand *clist; /* merely a free storage pot for candidates */ 87 int clen; 88 int *J; /*will be overlaid on class*/ 89 long *ixold; /*will be overlaid on klist*/ 90 long *ixnew; /*will be overlaid on file[1]*/ 91 /* END OF SOME TIDYING */ 92 93 static void 94 sort(struct line *a, int n) /*shellsort CACM #201*/ 95 { 96 int m; 97 struct line *ai, *aim, *j, *k; 98 struct line w; 99 int i; 100 101 m = 0; 102 for (i = 1; i <= n; i *= 2) 103 m = 2*i - 1; 104 for (m /= 2; m != 0; m /= 2) { 105 k = a+(n-m); 106 for (j = a+1; j <= k; j++) { 107 ai = j; 108 aim = ai+m; 109 do { 110 if (aim->value > ai->value || 111 aim->value == ai->value && 112 aim->serial > ai->serial) 113 break; 114 w = *ai; 115 *ai = *aim; 116 *aim = w; 117 118 aim = ai; 119 ai -= m; 120 } while (ai > a && aim >= ai); 121 } 122 } 123 } 124 125 static void 126 unsort(struct line *f, int l, int *b) 127 { 128 int *a; 129 int i; 130 131 a = MALLOC(int, (l+1)); 132 for(i=1;i<=l;i++) 133 a[f[i].serial] = f[i].value; 134 for(i=1;i<=l;i++) 135 b[i] = a[i]; 136 FREE(a); 137 } 138 139 static void 140 prune(void) 141 { 142 int i,j; 143 144 for(pref=0;pref<len[0]&&pref<len[1]&& 145 file[0][pref+1].value==file[1][pref+1].value; 146 pref++ ) ; 147 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 148 file[0][len[0]-suff].value==file[1][len[1]-suff].value; 149 suff++) ; 150 for(j=0;j<2;j++) { 151 sfile[j] = file[j]+pref; 152 slen[j] = len[j]-pref-suff; 153 for(i=0;i<=slen[j];i++) 154 sfile[j][i].serial = i; 155 } 156 } 157 158 static void 159 equiv(struct line *a, int n, struct line *b, int m, int *c) 160 { 161 int i, j; 162 163 i = j = 1; 164 while(i<=n && j<=m) { 165 if(a[i].value < b[j].value) 166 a[i++].value = 0; 167 else if(a[i].value == b[j].value) 168 a[i++].value = j; 169 else 170 j++; 171 } 172 while(i <= n) 173 a[i++].value = 0; 174 b[m+1].value = 0; 175 j = 0; 176 while(++j <= m) { 177 c[j] = -b[j].serial; 178 while(b[j+1].value == b[j].value) { 179 j++; 180 c[j] = b[j].serial; 181 } 182 } 183 c[j] = -1; 184 } 185 186 static int 187 newcand(int x, int y, int pred) 188 { 189 struct cand *q; 190 191 clist = REALLOC(clist, struct cand, (clen+1)); 192 q = clist + clen; 193 q->x = x; 194 q->y = y; 195 q->pred = pred; 196 return clen++; 197 } 198 199 static int 200 search(int *c, int k, int y) 201 { 202 int i, j, l; 203 int t; 204 205 if(clist[c[k]].y < y) /*quick look for typical case*/ 206 return k+1; 207 i = 0; 208 j = k+1; 209 while((l=(i+j)/2) > i) { 210 t = clist[c[l]].y; 211 if(t > y) 212 j = l; 213 else if(t < y) 214 i = l; 215 else 216 return l; 217 } 218 return l+1; 219 } 220 221 static int 222 stone(int *a, int n, int *b, int *c) 223 { 224 int i, k,y; 225 int j, l; 226 int oldc, tc; 227 int oldl; 228 229 k = 0; 230 c[0] = newcand(0,0,0); 231 for(i=1; i<=n; i++) { 232 j = a[i]; 233 if(j==0) 234 continue; 235 y = -b[j]; 236 oldl = 0; 237 oldc = c[0]; 238 do { 239 if(y <= clist[oldc].y) 240 continue; 241 l = search(c, k, y); 242 if(l!=oldl+1) 243 oldc = c[l-1]; 244 if(l<=k) { 245 if(clist[c[l]].y <= y) 246 continue; 247 tc = c[l]; 248 c[l] = newcand(i,y,oldc); 249 oldc = tc; 250 oldl = l; 251 } else { 252 c[l] = newcand(i,y,oldc); 253 k++; 254 break; 255 } 256 } while((y=b[++j]) > 0); 257 } 258 return k; 259 } 260 261 static void 262 unravel(int p) 263 { 264 int i; 265 struct cand *q; 266 267 for(i=0; i<=len[0]; i++) { 268 if (i <= pref) 269 J[i] = i; 270 else if (i > len[0]-suff) 271 J[i] = i+len[1]-len[0]; 272 else 273 J[i] = 0; 274 } 275 for(q=clist+p;q->y!=0;q=clist+q->pred) 276 J[q->x+pref] = q->y+pref; 277 } 278 279 static void 280 output(void) 281 { 282 int m, i0, i1, j0, j1; 283 284 m = len[0]; 285 J[0] = 0; 286 J[m+1] = len[1]+1; 287 if (mode != 'e') { 288 for (i0 = 1; i0 <= m; i0 = i1+1) { 289 while (i0 <= m && J[i0] == J[i0-1]+1) 290 i0++; 291 j0 = J[i0-1]+1; 292 i1 = i0-1; 293 while (i1 < m && J[i1+1] == 0) 294 i1++; 295 j1 = J[i1+1]-1; 296 J[i1] = j1; 297 change(i0, i1, j0, j1); 298 } 299 } 300 else { 301 for (i0 = m; i0 >= 1; i0 = i1-1) { 302 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) 303 i0--; 304 j0 = J[i0+1]-1; 305 i1 = i0+1; 306 while (i1 > 1 && J[i1-1] == 0) 307 i1--; 308 j1 = J[i1-1]+1; 309 J[i1] = j1; 310 change(i1 , i0, j1, j0); 311 } 312 } 313 if (m == 0) 314 change(1, 0, 1, len[1]); 315 } 316 317 void 318 diffreg(char *f, char *t) 319 { 320 Biobuf *b0, *b1; 321 int k; 322 323 b0 = prepare(0, f); 324 if (!b0) 325 return; 326 b1 = prepare(1, t); 327 if (!b1) { 328 FREE(file[0]); 329 Bclose(b0); 330 return; 331 } 332 clen = 0; 333 prune(); 334 sort(sfile[0], slen[0]); 335 sort(sfile[1], slen[1]); 336 337 member = (int *)file[1]; 338 equiv(sfile[0], slen[0], sfile[1], slen[1], member); 339 member = REALLOC(member, int, slen[1]+2); 340 341 class = (int *)file[0]; 342 unsort(sfile[0], slen[0], class); 343 class = REALLOC(class, int, slen[0]+2); 344 345 klist = MALLOC(int, slen[0]+2); 346 clist = MALLOC(struct cand, 1); 347 k = stone(class, slen[0], member, klist); 348 FREE(member); 349 FREE(class); 350 351 J = MALLOC(int, len[0]+2); 352 unravel(klist[k]); 353 FREE(clist); 354 FREE(klist); 355 356 ixold = MALLOC(long, len[0]+2); 357 ixnew = MALLOC(long, len[1]+2); 358 Bseek(b0, 0, 0); Bseek(b1, 0, 0); 359 check(b0, b1); 360 output(); 361 FREE(J); FREE(ixold); FREE(ixnew); 362 Bclose(b0); Bclose(b1); /* ++++ */ 363 } 364