13e12c5d1SDavid du Colombier #include <u.h> 23e12c5d1SDavid du Colombier #include <libc.h> 33e12c5d1SDavid du Colombier #include <bio.h> 43e12c5d1SDavid du Colombier #include "diff.h" 53e12c5d1SDavid du Colombier 63e12c5d1SDavid du Colombier /* diff - differential file comparison 73e12c5d1SDavid du Colombier * 83e12c5d1SDavid du Colombier * Uses an algorithm due to Harold Stone, which finds 93e12c5d1SDavid du Colombier * a pair of longest identical subsequences in the two 103e12c5d1SDavid du Colombier * files. 113e12c5d1SDavid du Colombier * 123e12c5d1SDavid du Colombier * The major goal is to generate the match vector J. 133e12c5d1SDavid du Colombier * J[i] is the index of the line in file1 corresponding 143e12c5d1SDavid du Colombier * to line i file0. J[i] = 0 if there is no 153e12c5d1SDavid du Colombier * such line in file1. 163e12c5d1SDavid du Colombier * 173e12c5d1SDavid du Colombier * Lines are hashed so as to work in core. All potential 183e12c5d1SDavid du Colombier * matches are located by sorting the lines of each file 193e12c5d1SDavid du Colombier * on the hash (called value). In particular, this 203e12c5d1SDavid du Colombier * collects the equivalence classes in file1 together. 213e12c5d1SDavid du Colombier * Subroutine equiv replaces the value of each line in 223e12c5d1SDavid du Colombier * file0 by the index of the first element of its 233e12c5d1SDavid du Colombier * matching equivalence in (the reordered) file1. 243e12c5d1SDavid du Colombier * To save space equiv squeezes file1 into a single 253e12c5d1SDavid du Colombier * array member in which the equivalence classes 263e12c5d1SDavid du Colombier * are simply concatenated, except that their first 273e12c5d1SDavid du Colombier * members are flagged by changing sign. 283e12c5d1SDavid du Colombier * 293e12c5d1SDavid du Colombier * Next the indices that point into member are unsorted into 303e12c5d1SDavid du Colombier * array class according to the original order of file0. 313e12c5d1SDavid du Colombier * 323e12c5d1SDavid du Colombier * The cleverness lies in routine stone. This marches 333e12c5d1SDavid du Colombier * through the lines of file0, developing a vector klist 343e12c5d1SDavid du Colombier * of "k-candidates". At step i a k-candidate is a matched 353e12c5d1SDavid du Colombier * pair of lines x,y (x in file0 y in file1) such that 363e12c5d1SDavid du Colombier * there is a common subsequence of lenght k 373e12c5d1SDavid du Colombier * between the first i lines of file0 and the first y 383e12c5d1SDavid du Colombier * lines of file1, but there is no such subsequence for 393e12c5d1SDavid du Colombier * any smaller y. x is the earliest possible mate to y 403e12c5d1SDavid du Colombier * that occurs in such a subsequence. 413e12c5d1SDavid du Colombier * 423e12c5d1SDavid du Colombier * Whenever any of the members of the equivalence class of 433e12c5d1SDavid du Colombier * lines in file1 matable to a line in file0 has serial number 443e12c5d1SDavid du Colombier * less than the y of some k-candidate, that k-candidate 453e12c5d1SDavid du Colombier * with the smallest such y is replaced. The new 463e12c5d1SDavid du Colombier * k-candidate is chained (via pred) to the current 473e12c5d1SDavid du Colombier * k-1 candidate so that the actual subsequence can 483e12c5d1SDavid du Colombier * be recovered. When a member has serial number greater 493e12c5d1SDavid du Colombier * that the y of all k-candidates, the klist is extended. 503e12c5d1SDavid du Colombier * At the end, the longest subsequence is pulled out 513e12c5d1SDavid du Colombier * and placed in the array J by unravel. 523e12c5d1SDavid du Colombier * 533e12c5d1SDavid du Colombier * With J in hand, the matches there recorded are 543e12c5d1SDavid du Colombier * check'ed against reality to assure that no spurious 553e12c5d1SDavid du Colombier * matches have crept in due to hashing. If they have, 563e12c5d1SDavid du Colombier * they are broken, and "jackpot " is recorded--a harmless 573e12c5d1SDavid du Colombier * matter except that a true match for a spuriously 583e12c5d1SDavid du Colombier * mated line may now be unnecessarily reported as a change. 593e12c5d1SDavid du Colombier * 603e12c5d1SDavid du Colombier * Much of the complexity of the program comes simply 613e12c5d1SDavid du Colombier * from trying to minimize core utilization and 623e12c5d1SDavid du Colombier * maximize the range of doable problems by dynamically 633e12c5d1SDavid du Colombier * allocating what is needed and reusing what is not. 643e12c5d1SDavid du Colombier * The core requirements for problems larger than somewhat 653e12c5d1SDavid du Colombier * are (in words) 2*length(file0) + length(file1) + 663e12c5d1SDavid du Colombier * 3*(number of k-candidates installed), typically about 673e12c5d1SDavid du Colombier * 6n words for files of length n. 683e12c5d1SDavid du Colombier */ 693e12c5d1SDavid du Colombier /* TIDY THIS UP */ 703e12c5d1SDavid du Colombier struct cand { 713e12c5d1SDavid du Colombier int x; 723e12c5d1SDavid du Colombier int y; 733e12c5d1SDavid du Colombier int pred; 743e12c5d1SDavid du Colombier } cand; 753e12c5d1SDavid du Colombier struct line { 763e12c5d1SDavid du Colombier int serial; 773e12c5d1SDavid du Colombier int value; 783e12c5d1SDavid du Colombier } *file[2], line; 793e12c5d1SDavid du Colombier int len[2]; 803e12c5d1SDavid du Colombier struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ 813e12c5d1SDavid du Colombier int slen[2]; 823e12c5d1SDavid du Colombier int pref, suff; /*length of prefix and suffix*/ 833e12c5d1SDavid du Colombier int *class; /*will be overlaid on file[0]*/ 843e12c5d1SDavid du Colombier int *member; /*will be overlaid on file[1]*/ 853e12c5d1SDavid du Colombier int *klist; /*will be overlaid on file[0] after class*/ 863e12c5d1SDavid du Colombier struct cand *clist; /* merely a free storage pot for candidates */ 873e12c5d1SDavid du Colombier int clen; 883e12c5d1SDavid du Colombier int *J; /*will be overlaid on class*/ 893e12c5d1SDavid du Colombier long *ixold; /*will be overlaid on klist*/ 903e12c5d1SDavid du Colombier long *ixnew; /*will be overlaid on file[1]*/ 913e12c5d1SDavid du Colombier /* END OF SOME TIDYING */ 923e12c5d1SDavid du Colombier 933e12c5d1SDavid du Colombier static void 943e12c5d1SDavid du Colombier sort(struct line *a, int n) /*shellsort CACM #201*/ 953e12c5d1SDavid du Colombier { 963e12c5d1SDavid du Colombier int m; 973e12c5d1SDavid du Colombier struct line *ai, *aim, *j, *k; 983e12c5d1SDavid du Colombier struct line w; 993e12c5d1SDavid du Colombier int i; 1003e12c5d1SDavid du Colombier 1013e12c5d1SDavid du Colombier m = 0; 1023e12c5d1SDavid du Colombier for (i = 1; i <= n; i *= 2) 1033e12c5d1SDavid du Colombier m = 2*i - 1; 1043e12c5d1SDavid du Colombier for (m /= 2; m != 0; m /= 2) { 1053e12c5d1SDavid du Colombier k = a+(n-m); 1063e12c5d1SDavid du Colombier for (j = a+1; j <= k; j++) { 1073e12c5d1SDavid du Colombier ai = j; 1083e12c5d1SDavid du Colombier aim = ai+m; 1093e12c5d1SDavid du Colombier do { 1103e12c5d1SDavid du Colombier if (aim->value > ai->value || 1113e12c5d1SDavid du Colombier aim->value == ai->value && 1123e12c5d1SDavid du Colombier aim->serial > ai->serial) 1133e12c5d1SDavid du Colombier break; 1143e12c5d1SDavid du Colombier w = *ai; 1153e12c5d1SDavid du Colombier *ai = *aim; 1163e12c5d1SDavid du Colombier *aim = w; 1173e12c5d1SDavid du Colombier 1183e12c5d1SDavid du Colombier aim = ai; 1193e12c5d1SDavid du Colombier ai -= m; 1203e12c5d1SDavid du Colombier } while (ai > a && aim >= ai); 1213e12c5d1SDavid du Colombier } 1223e12c5d1SDavid du Colombier } 1233e12c5d1SDavid du Colombier } 1243e12c5d1SDavid du Colombier 1253e12c5d1SDavid du Colombier static void 1263e12c5d1SDavid du Colombier unsort(struct line *f, int l, int *b) 1273e12c5d1SDavid du Colombier { 1283e12c5d1SDavid du Colombier int *a; 1293e12c5d1SDavid du Colombier int i; 1303e12c5d1SDavid du Colombier 1313e12c5d1SDavid du Colombier a = MALLOC(int, (l+1)); 1323e12c5d1SDavid du Colombier for(i=1;i<=l;i++) 1333e12c5d1SDavid du Colombier a[f[i].serial] = f[i].value; 1343e12c5d1SDavid du Colombier for(i=1;i<=l;i++) 1353e12c5d1SDavid du Colombier b[i] = a[i]; 1363e12c5d1SDavid du Colombier FREE(a); 1373e12c5d1SDavid du Colombier } 1383e12c5d1SDavid du Colombier 1393e12c5d1SDavid du Colombier static void 1403e12c5d1SDavid du Colombier prune(void) 1413e12c5d1SDavid du Colombier { 1423e12c5d1SDavid du Colombier int i,j; 1433e12c5d1SDavid du Colombier 1443e12c5d1SDavid du Colombier for(pref=0;pref<len[0]&&pref<len[1]&& 1453e12c5d1SDavid du Colombier file[0][pref+1].value==file[1][pref+1].value; 1463e12c5d1SDavid du Colombier pref++ ) ; 1473e12c5d1SDavid du Colombier for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 1483e12c5d1SDavid du Colombier file[0][len[0]-suff].value==file[1][len[1]-suff].value; 1493e12c5d1SDavid du Colombier suff++) ; 1503e12c5d1SDavid du Colombier for(j=0;j<2;j++) { 1513e12c5d1SDavid du Colombier sfile[j] = file[j]+pref; 1523e12c5d1SDavid du Colombier slen[j] = len[j]-pref-suff; 1533e12c5d1SDavid du Colombier for(i=0;i<=slen[j];i++) 1543e12c5d1SDavid du Colombier sfile[j][i].serial = i; 1553e12c5d1SDavid du Colombier } 1563e12c5d1SDavid du Colombier } 1573e12c5d1SDavid du Colombier 1583e12c5d1SDavid du Colombier static void 1593e12c5d1SDavid du Colombier equiv(struct line *a, int n, struct line *b, int m, int *c) 1603e12c5d1SDavid du Colombier { 1613e12c5d1SDavid du Colombier int i, j; 1623e12c5d1SDavid du Colombier 1633e12c5d1SDavid du Colombier i = j = 1; 1643e12c5d1SDavid du Colombier while(i<=n && j<=m) { 1653e12c5d1SDavid du Colombier if(a[i].value < b[j].value) 1663e12c5d1SDavid du Colombier a[i++].value = 0; 1673e12c5d1SDavid du Colombier else if(a[i].value == b[j].value) 1683e12c5d1SDavid du Colombier a[i++].value = j; 1693e12c5d1SDavid du Colombier else 1703e12c5d1SDavid du Colombier j++; 1713e12c5d1SDavid du Colombier } 1723e12c5d1SDavid du Colombier while(i <= n) 1733e12c5d1SDavid du Colombier a[i++].value = 0; 1743e12c5d1SDavid du Colombier b[m+1].value = 0; 1753e12c5d1SDavid du Colombier j = 0; 1763e12c5d1SDavid du Colombier while(++j <= m) { 1773e12c5d1SDavid du Colombier c[j] = -b[j].serial; 1783e12c5d1SDavid du Colombier while(b[j+1].value == b[j].value) { 1793e12c5d1SDavid du Colombier j++; 1803e12c5d1SDavid du Colombier c[j] = b[j].serial; 1813e12c5d1SDavid du Colombier } 1823e12c5d1SDavid du Colombier } 1833e12c5d1SDavid du Colombier c[j] = -1; 1843e12c5d1SDavid du Colombier } 1853e12c5d1SDavid du Colombier 1863e12c5d1SDavid du Colombier static int 1873e12c5d1SDavid du Colombier newcand(int x, int y, int pred) 1883e12c5d1SDavid du Colombier { 1893e12c5d1SDavid du Colombier struct cand *q; 1903e12c5d1SDavid du Colombier 1913e12c5d1SDavid du Colombier clist = REALLOC(clist, struct cand, (clen+1)); 1923e12c5d1SDavid du Colombier q = clist + clen; 1933e12c5d1SDavid du Colombier q->x = x; 1943e12c5d1SDavid du Colombier q->y = y; 1953e12c5d1SDavid du Colombier q->pred = pred; 1963e12c5d1SDavid du Colombier return clen++; 1973e12c5d1SDavid du Colombier } 1983e12c5d1SDavid du Colombier 1993e12c5d1SDavid du Colombier static int 2003e12c5d1SDavid du Colombier search(int *c, int k, int y) 2013e12c5d1SDavid du Colombier { 2023e12c5d1SDavid du Colombier int i, j, l; 2033e12c5d1SDavid du Colombier int t; 2043e12c5d1SDavid du Colombier 2053e12c5d1SDavid du Colombier if(clist[c[k]].y < y) /*quick look for typical case*/ 2063e12c5d1SDavid du Colombier return k+1; 2073e12c5d1SDavid du Colombier i = 0; 2083e12c5d1SDavid du Colombier j = k+1; 2093e12c5d1SDavid du Colombier while((l=(i+j)/2) > i) { 2103e12c5d1SDavid du Colombier t = clist[c[l]].y; 2113e12c5d1SDavid du Colombier if(t > y) 2123e12c5d1SDavid du Colombier j = l; 2133e12c5d1SDavid du Colombier else if(t < y) 2143e12c5d1SDavid du Colombier i = l; 2153e12c5d1SDavid du Colombier else 2163e12c5d1SDavid du Colombier return l; 2173e12c5d1SDavid du Colombier } 2183e12c5d1SDavid du Colombier return l+1; 2193e12c5d1SDavid du Colombier } 2203e12c5d1SDavid du Colombier 2213e12c5d1SDavid du Colombier static int 2223e12c5d1SDavid du Colombier stone(int *a, int n, int *b, int *c) 2233e12c5d1SDavid du Colombier { 2243e12c5d1SDavid du Colombier int i, k,y; 2253e12c5d1SDavid du Colombier int j, l; 2263e12c5d1SDavid du Colombier int oldc, tc; 2273e12c5d1SDavid du Colombier int oldl; 2283e12c5d1SDavid du Colombier 2293e12c5d1SDavid du Colombier k = 0; 2303e12c5d1SDavid du Colombier c[0] = newcand(0,0,0); 2313e12c5d1SDavid du Colombier for(i=1; i<=n; i++) { 2323e12c5d1SDavid du Colombier j = a[i]; 2333e12c5d1SDavid du Colombier if(j==0) 2343e12c5d1SDavid du Colombier continue; 2353e12c5d1SDavid du Colombier y = -b[j]; 2363e12c5d1SDavid du Colombier oldl = 0; 2373e12c5d1SDavid du Colombier oldc = c[0]; 2383e12c5d1SDavid du Colombier do { 2393e12c5d1SDavid du Colombier if(y <= clist[oldc].y) 2403e12c5d1SDavid du Colombier continue; 2413e12c5d1SDavid du Colombier l = search(c, k, y); 2423e12c5d1SDavid du Colombier if(l!=oldl+1) 2433e12c5d1SDavid du Colombier oldc = c[l-1]; 2443e12c5d1SDavid du Colombier if(l<=k) { 2453e12c5d1SDavid du Colombier if(clist[c[l]].y <= y) 2463e12c5d1SDavid du Colombier continue; 2473e12c5d1SDavid du Colombier tc = c[l]; 2483e12c5d1SDavid du Colombier c[l] = newcand(i,y,oldc); 2493e12c5d1SDavid du Colombier oldc = tc; 2503e12c5d1SDavid du Colombier oldl = l; 2513e12c5d1SDavid du Colombier } else { 2523e12c5d1SDavid du Colombier c[l] = newcand(i,y,oldc); 2533e12c5d1SDavid du Colombier k++; 2543e12c5d1SDavid du Colombier break; 2553e12c5d1SDavid du Colombier } 2563e12c5d1SDavid du Colombier } while((y=b[++j]) > 0); 2573e12c5d1SDavid du Colombier } 2583e12c5d1SDavid du Colombier return k; 2593e12c5d1SDavid du Colombier } 2603e12c5d1SDavid du Colombier 2613e12c5d1SDavid du Colombier static void 2623e12c5d1SDavid du Colombier unravel(int p) 2633e12c5d1SDavid du Colombier { 2643e12c5d1SDavid du Colombier int i; 2653e12c5d1SDavid du Colombier struct cand *q; 2663e12c5d1SDavid du Colombier 2673e12c5d1SDavid du Colombier for(i=0; i<=len[0]; i++) { 2683e12c5d1SDavid du Colombier if (i <= pref) 2693e12c5d1SDavid du Colombier J[i] = i; 2703e12c5d1SDavid du Colombier else if (i > len[0]-suff) 2713e12c5d1SDavid du Colombier J[i] = i+len[1]-len[0]; 2723e12c5d1SDavid du Colombier else 2733e12c5d1SDavid du Colombier J[i] = 0; 2743e12c5d1SDavid du Colombier } 2753e12c5d1SDavid du Colombier for(q=clist+p;q->y!=0;q=clist+q->pred) 2763e12c5d1SDavid du Colombier J[q->x+pref] = q->y+pref; 2773e12c5d1SDavid du Colombier } 2783e12c5d1SDavid du Colombier 2793e12c5d1SDavid du Colombier static void 2803e12c5d1SDavid du Colombier output(void) 2813e12c5d1SDavid du Colombier { 2823e12c5d1SDavid du Colombier int m, i0, i1, j0, j1; 2833e12c5d1SDavid du Colombier 2843e12c5d1SDavid du Colombier m = len[0]; 2853e12c5d1SDavid du Colombier J[0] = 0; 2863e12c5d1SDavid du Colombier J[m+1] = len[1]+1; 2873e12c5d1SDavid du Colombier if (mode != 'e') { 2883e12c5d1SDavid du Colombier for (i0 = 1; i0 <= m; i0 = i1+1) { 2893e12c5d1SDavid du Colombier while (i0 <= m && J[i0] == J[i0-1]+1) 2903e12c5d1SDavid du Colombier i0++; 2913e12c5d1SDavid du Colombier j0 = J[i0-1]+1; 2923e12c5d1SDavid du Colombier i1 = i0-1; 2933e12c5d1SDavid du Colombier while (i1 < m && J[i1+1] == 0) 2943e12c5d1SDavid du Colombier i1++; 2953e12c5d1SDavid du Colombier j1 = J[i1+1]-1; 2963e12c5d1SDavid du Colombier J[i1] = j1; 2973e12c5d1SDavid du Colombier change(i0, i1, j0, j1); 2983e12c5d1SDavid du Colombier } 2993e12c5d1SDavid du Colombier } 3003e12c5d1SDavid du Colombier else { 3013e12c5d1SDavid du Colombier for (i0 = m; i0 >= 1; i0 = i1-1) { 3023e12c5d1SDavid du Colombier while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) 3033e12c5d1SDavid du Colombier i0--; 3043e12c5d1SDavid du Colombier j0 = J[i0+1]-1; 3053e12c5d1SDavid du Colombier i1 = i0+1; 3063e12c5d1SDavid du Colombier while (i1 > 1 && J[i1-1] == 0) 3073e12c5d1SDavid du Colombier i1--; 3083e12c5d1SDavid du Colombier j1 = J[i1-1]+1; 3093e12c5d1SDavid du Colombier J[i1] = j1; 3103e12c5d1SDavid du Colombier change(i1 , i0, j1, j0); 3113e12c5d1SDavid du Colombier } 3123e12c5d1SDavid du Colombier } 3133e12c5d1SDavid du Colombier if (m == 0) 3143e12c5d1SDavid du Colombier change(1, 0, 1, len[1]); 3153e12c5d1SDavid du Colombier } 3163e12c5d1SDavid du Colombier 3173e12c5d1SDavid du Colombier void 3183e12c5d1SDavid du Colombier diffreg(char *f, char *t) 3193e12c5d1SDavid du Colombier { 3203e12c5d1SDavid du Colombier Biobuf *b0, *b1; 3213e12c5d1SDavid du Colombier int k; 3223e12c5d1SDavid du Colombier 3233e12c5d1SDavid du Colombier b0 = prepare(0, f); 3243e12c5d1SDavid du Colombier if (!b0) 3253e12c5d1SDavid du Colombier return; 3263e12c5d1SDavid du Colombier b1 = prepare(1, t); 3273e12c5d1SDavid du Colombier if (!b1) { 3283e12c5d1SDavid du Colombier FREE(file[0]); 329*219b2ee8SDavid du Colombier Bterm(b0); 3303e12c5d1SDavid du Colombier return; 3313e12c5d1SDavid du Colombier } 3323e12c5d1SDavid du Colombier clen = 0; 3333e12c5d1SDavid du Colombier prune(); 3343e12c5d1SDavid du Colombier sort(sfile[0], slen[0]); 3353e12c5d1SDavid du Colombier sort(sfile[1], slen[1]); 3363e12c5d1SDavid du Colombier 3373e12c5d1SDavid du Colombier member = (int *)file[1]; 3383e12c5d1SDavid du Colombier equiv(sfile[0], slen[0], sfile[1], slen[1], member); 3393e12c5d1SDavid du Colombier member = REALLOC(member, int, slen[1]+2); 3403e12c5d1SDavid du Colombier 3413e12c5d1SDavid du Colombier class = (int *)file[0]; 3423e12c5d1SDavid du Colombier unsort(sfile[0], slen[0], class); 3433e12c5d1SDavid du Colombier class = REALLOC(class, int, slen[0]+2); 3443e12c5d1SDavid du Colombier 3453e12c5d1SDavid du Colombier klist = MALLOC(int, slen[0]+2); 3463e12c5d1SDavid du Colombier clist = MALLOC(struct cand, 1); 3473e12c5d1SDavid du Colombier k = stone(class, slen[0], member, klist); 3483e12c5d1SDavid du Colombier FREE(member); 3493e12c5d1SDavid du Colombier FREE(class); 3503e12c5d1SDavid du Colombier 3513e12c5d1SDavid du Colombier J = MALLOC(int, len[0]+2); 3523e12c5d1SDavid du Colombier unravel(klist[k]); 3533e12c5d1SDavid du Colombier FREE(clist); 3543e12c5d1SDavid du Colombier FREE(klist); 3553e12c5d1SDavid du Colombier 3563e12c5d1SDavid du Colombier ixold = MALLOC(long, len[0]+2); 3573e12c5d1SDavid du Colombier ixnew = MALLOC(long, len[1]+2); 3583e12c5d1SDavid du Colombier Bseek(b0, 0, 0); Bseek(b1, 0, 0); 3593e12c5d1SDavid du Colombier check(b0, b1); 3603e12c5d1SDavid du Colombier output(); 3613e12c5d1SDavid du Colombier FREE(J); FREE(ixold); FREE(ixnew); 362*219b2ee8SDavid du Colombier Bterm(b0); Bterm(b1); /* ++++ */ 3633e12c5d1SDavid du Colombier } 364