1*2286d8edStholo /* File I/O for GNU DIFF. 2*2286d8edStholo Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc. 3*2286d8edStholo 4*2286d8edStholo This file is part of GNU DIFF. 5*2286d8edStholo 6*2286d8edStholo GNU DIFF is free software; you can redistribute it and/or modify 7*2286d8edStholo it under the terms of the GNU General Public License as published by 8*2286d8edStholo the Free Software Foundation; either version 2, or (at your option) 9*2286d8edStholo any later version. 10*2286d8edStholo 11*2286d8edStholo GNU DIFF is distributed in the hope that it will be useful, 12*2286d8edStholo but WITHOUT ANY WARRANTY; without even the implied warranty of 13*2286d8edStholo MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14*2286d8edStholo GNU General Public License for more details. 15*2286d8edStholo 16*2286d8edStholo You should have received a copy of the GNU General Public License 17*2286d8edStholo along with GNU DIFF; see the file COPYING. If not, write to 18*2286d8edStholo the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ 19*2286d8edStholo 20*2286d8edStholo #include "diff.h" 21*2286d8edStholo 22*2286d8edStholo /* Rotate a value n bits to the left. */ 23*2286d8edStholo #define UINT_BIT (sizeof (unsigned) * CHAR_BIT) 24*2286d8edStholo #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n))) 25*2286d8edStholo 26*2286d8edStholo /* Given a hash value and a new character, return a new hash value. */ 27*2286d8edStholo #define HASH(h, c) ((c) + ROL (h, 7)) 28*2286d8edStholo 29*2286d8edStholo /* Guess remaining number of lines from number N of lines so far, 30*2286d8edStholo size S so far, and total size T. */ 31*2286d8edStholo #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5) 32*2286d8edStholo 33*2286d8edStholo /* Type used for fast prefix comparison in find_identical_ends. */ 34*2286d8edStholo #ifndef word 35*2286d8edStholo #define word int 36*2286d8edStholo #endif 37*2286d8edStholo 38*2286d8edStholo /* Lines are put into equivalence classes (of lines that match in line_cmp). 39*2286d8edStholo Each equivalence class is represented by one of these structures, 40*2286d8edStholo but only while the classes are being computed. 41*2286d8edStholo Afterward, each class is represented by a number. */ 42*2286d8edStholo struct equivclass 43*2286d8edStholo { 44*2286d8edStholo int next; /* Next item in this bucket. */ 45*2286d8edStholo unsigned hash; /* Hash of lines in this class. */ 46*2286d8edStholo char const *line; /* A line that fits this class. */ 47*2286d8edStholo size_t length; /* That line's length, not counting its newline. */ 48*2286d8edStholo }; 49*2286d8edStholo 50*2286d8edStholo /* Hash-table: array of buckets, each being a chain of equivalence classes. 51*2286d8edStholo buckets[-1] is reserved for incomplete lines. */ 52*2286d8edStholo static int *buckets; 53*2286d8edStholo 54*2286d8edStholo /* Number of buckets in the hash table array, not counting buckets[-1]. */ 55*2286d8edStholo static int nbuckets; 56*2286d8edStholo 57*2286d8edStholo /* Array in which the equivalence classes are allocated. 58*2286d8edStholo The bucket-chains go through the elements in this array. 59*2286d8edStholo The number of an equivalence class is its index in this array. */ 60*2286d8edStholo static struct equivclass *equivs; 61*2286d8edStholo 62*2286d8edStholo /* Index of first free element in the array `equivs'. */ 63*2286d8edStholo static int equivs_index; 64*2286d8edStholo 65*2286d8edStholo /* Number of elements allocated in the array `equivs'. */ 66*2286d8edStholo static int equivs_alloc; 67*2286d8edStholo 68*2286d8edStholo static void find_and_hash_each_line PARAMS((struct file_data *)); 69*2286d8edStholo static void find_identical_ends PARAMS((struct file_data[])); 70*2286d8edStholo static void prepare_text_end PARAMS((struct file_data *)); 71*2286d8edStholo 72*2286d8edStholo /* Check for binary files and compare them for exact identity. */ 73*2286d8edStholo 74*2286d8edStholo /* Return 1 if BUF contains a non text character. 75*2286d8edStholo SIZE is the number of characters in BUF. */ 76*2286d8edStholo 77*2286d8edStholo #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0) 78*2286d8edStholo 79*2286d8edStholo /* Get ready to read the current file. 80*2286d8edStholo Return nonzero if SKIP_TEST is zero, 81*2286d8edStholo and if it appears to be a binary file. */ 82*2286d8edStholo 83*2286d8edStholo int 84*2286d8edStholo sip (current, skip_test) 85*2286d8edStholo struct file_data *current; 86*2286d8edStholo int skip_test; 87*2286d8edStholo { 88*2286d8edStholo /* If we have a nonexistent file at this stage, treat it as empty. */ 89*2286d8edStholo if (current->desc < 0) 90*2286d8edStholo { 91*2286d8edStholo /* Leave room for a sentinel. */ 92*2286d8edStholo current->bufsize = sizeof (word); 93*2286d8edStholo current->buffer = xmalloc (current->bufsize); 94*2286d8edStholo } 95*2286d8edStholo else 96*2286d8edStholo { 97*2286d8edStholo current->bufsize = STAT_BLOCKSIZE (current->stat); 98*2286d8edStholo current->buffer = xmalloc (current->bufsize); 99*2286d8edStholo 100*2286d8edStholo if (! skip_test) 101*2286d8edStholo { 102*2286d8edStholo /* Check first part of file to see if it's a binary file. */ 103*2286d8edStholo #if HAVE_SETMODE 104*2286d8edStholo int oldmode = setmode (current->desc, O_BINARY); 105*2286d8edStholo #endif 106*2286d8edStholo size_t n = read (current->desc, current->buffer, current->bufsize); 107*2286d8edStholo if (n == -1) 108*2286d8edStholo pfatal_with_name (current->name); 109*2286d8edStholo current->buffered_chars = n; 110*2286d8edStholo #if HAVE_SETMODE 111*2286d8edStholo if (oldmode != O_BINARY) 112*2286d8edStholo { 113*2286d8edStholo if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1) 114*2286d8edStholo pfatal_with_name (current->name); 115*2286d8edStholo setmode (current->desc, oldmode); 116*2286d8edStholo current->buffered_chars = 0; 117*2286d8edStholo } 118*2286d8edStholo #endif 119*2286d8edStholo return binary_file_p (current->buffer, n); 120*2286d8edStholo } 121*2286d8edStholo } 122*2286d8edStholo 123*2286d8edStholo current->buffered_chars = 0; 124*2286d8edStholo return 0; 125*2286d8edStholo } 126*2286d8edStholo 127*2286d8edStholo /* Slurp the rest of the current file completely into memory. */ 128*2286d8edStholo 129*2286d8edStholo void 130*2286d8edStholo slurp (current) 131*2286d8edStholo struct file_data *current; 132*2286d8edStholo { 133*2286d8edStholo size_t cc; 134*2286d8edStholo 135*2286d8edStholo if (current->desc < 0) 136*2286d8edStholo /* The file is nonexistent. */ 137*2286d8edStholo ; 138*2286d8edStholo else if (S_ISREG (current->stat.st_mode)) 139*2286d8edStholo { 140*2286d8edStholo /* It's a regular file; slurp in the rest all at once. */ 141*2286d8edStholo 142*2286d8edStholo /* Get the size out of the stat block. 143*2286d8edStholo Allocate enough room for appended newline and sentinel. */ 144*2286d8edStholo cc = current->stat.st_size + 1 + sizeof (word); 145*2286d8edStholo if (current->bufsize < cc) 146*2286d8edStholo { 147*2286d8edStholo current->bufsize = cc; 148*2286d8edStholo current->buffer = xrealloc (current->buffer, cc); 149*2286d8edStholo } 150*2286d8edStholo 151*2286d8edStholo if (current->buffered_chars < current->stat.st_size) 152*2286d8edStholo { 153*2286d8edStholo cc = read (current->desc, 154*2286d8edStholo current->buffer + current->buffered_chars, 155*2286d8edStholo current->stat.st_size - current->buffered_chars); 156*2286d8edStholo if (cc == -1) 157*2286d8edStholo pfatal_with_name (current->name); 158*2286d8edStholo current->buffered_chars += cc; 159*2286d8edStholo } 160*2286d8edStholo } 161*2286d8edStholo /* It's not a regular file; read it, growing the buffer as needed. */ 162*2286d8edStholo else if (always_text_flag || current->buffered_chars != 0) 163*2286d8edStholo { 164*2286d8edStholo for (;;) 165*2286d8edStholo { 166*2286d8edStholo if (current->buffered_chars == current->bufsize) 167*2286d8edStholo { 168*2286d8edStholo current->bufsize = current->bufsize * 2; 169*2286d8edStholo current->buffer = xrealloc (current->buffer, current->bufsize); 170*2286d8edStholo } 171*2286d8edStholo cc = read (current->desc, 172*2286d8edStholo current->buffer + current->buffered_chars, 173*2286d8edStholo current->bufsize - current->buffered_chars); 174*2286d8edStholo if (cc == 0) 175*2286d8edStholo break; 176*2286d8edStholo if (cc == -1) 177*2286d8edStholo pfatal_with_name (current->name); 178*2286d8edStholo current->buffered_chars += cc; 179*2286d8edStholo } 180*2286d8edStholo /* Allocate just enough room for appended newline and sentinel. */ 181*2286d8edStholo current->bufsize = current->buffered_chars + 1 + sizeof (word); 182*2286d8edStholo current->buffer = xrealloc (current->buffer, current->bufsize); 183*2286d8edStholo } 184*2286d8edStholo } 185*2286d8edStholo 186*2286d8edStholo /* Split the file into lines, simultaneously computing the equivalence class for 187*2286d8edStholo each line. */ 188*2286d8edStholo 189*2286d8edStholo static void 190*2286d8edStholo find_and_hash_each_line (current) 191*2286d8edStholo struct file_data *current; 192*2286d8edStholo { 193*2286d8edStholo unsigned h; 194*2286d8edStholo unsigned char const *p = (unsigned char const *) current->prefix_end; 195*2286d8edStholo unsigned char c; 196*2286d8edStholo int i, *bucket; 197*2286d8edStholo size_t length; 198*2286d8edStholo 199*2286d8edStholo /* Cache often-used quantities in local variables to help the compiler. */ 200*2286d8edStholo char const **linbuf = current->linbuf; 201*2286d8edStholo int alloc_lines = current->alloc_lines; 202*2286d8edStholo int line = 0; 203*2286d8edStholo int linbuf_base = current->linbuf_base; 204*2286d8edStholo int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int)); 205*2286d8edStholo struct equivclass *eqs = equivs; 206*2286d8edStholo int eqs_index = equivs_index; 207*2286d8edStholo int eqs_alloc = equivs_alloc; 208*2286d8edStholo char const *suffix_begin = current->suffix_begin; 209*2286d8edStholo char const *bufend = current->buffer + current->buffered_chars; 210*2286d8edStholo int use_line_cmp = ignore_some_line_changes; 211*2286d8edStholo 212*2286d8edStholo while ((char const *) p < suffix_begin) 213*2286d8edStholo { 214*2286d8edStholo char const *ip = (char const *) p; 215*2286d8edStholo 216*2286d8edStholo /* Compute the equivalence class for this line. */ 217*2286d8edStholo 218*2286d8edStholo h = 0; 219*2286d8edStholo 220*2286d8edStholo /* Hash this line until we find a newline. */ 221*2286d8edStholo if (ignore_case_flag) 222*2286d8edStholo { 223*2286d8edStholo if (ignore_all_space_flag) 224*2286d8edStholo while ((c = *p++) != '\n') 225*2286d8edStholo { 226*2286d8edStholo if (! ISSPACE (c)) 227*2286d8edStholo h = HASH (h, ISUPPER (c) ? tolower (c) : c); 228*2286d8edStholo } 229*2286d8edStholo else if (ignore_space_change_flag) 230*2286d8edStholo while ((c = *p++) != '\n') 231*2286d8edStholo { 232*2286d8edStholo if (ISSPACE (c)) 233*2286d8edStholo { 234*2286d8edStholo for (;;) 235*2286d8edStholo { 236*2286d8edStholo c = *p++; 237*2286d8edStholo if (!ISSPACE (c)) 238*2286d8edStholo break; 239*2286d8edStholo if (c == '\n') 240*2286d8edStholo goto hashing_done; 241*2286d8edStholo } 242*2286d8edStholo h = HASH (h, ' '); 243*2286d8edStholo } 244*2286d8edStholo /* C is now the first non-space. */ 245*2286d8edStholo h = HASH (h, ISUPPER (c) ? tolower (c) : c); 246*2286d8edStholo } 247*2286d8edStholo else 248*2286d8edStholo while ((c = *p++) != '\n') 249*2286d8edStholo h = HASH (h, ISUPPER (c) ? tolower (c) : c); 250*2286d8edStholo } 251*2286d8edStholo else 252*2286d8edStholo { 253*2286d8edStholo if (ignore_all_space_flag) 254*2286d8edStholo while ((c = *p++) != '\n') 255*2286d8edStholo { 256*2286d8edStholo if (! ISSPACE (c)) 257*2286d8edStholo h = HASH (h, c); 258*2286d8edStholo } 259*2286d8edStholo else if (ignore_space_change_flag) 260*2286d8edStholo while ((c = *p++) != '\n') 261*2286d8edStholo { 262*2286d8edStholo if (ISSPACE (c)) 263*2286d8edStholo { 264*2286d8edStholo for (;;) 265*2286d8edStholo { 266*2286d8edStholo c = *p++; 267*2286d8edStholo if (!ISSPACE (c)) 268*2286d8edStholo break; 269*2286d8edStholo if (c == '\n') 270*2286d8edStholo goto hashing_done; 271*2286d8edStholo } 272*2286d8edStholo h = HASH (h, ' '); 273*2286d8edStholo } 274*2286d8edStholo /* C is now the first non-space. */ 275*2286d8edStholo h = HASH (h, c); 276*2286d8edStholo } 277*2286d8edStholo else 278*2286d8edStholo while ((c = *p++) != '\n') 279*2286d8edStholo h = HASH (h, c); 280*2286d8edStholo } 281*2286d8edStholo hashing_done:; 282*2286d8edStholo 283*2286d8edStholo bucket = &buckets[h % nbuckets]; 284*2286d8edStholo length = (char const *) p - ip - 1; 285*2286d8edStholo 286*2286d8edStholo if ((char const *) p == bufend 287*2286d8edStholo && current->missing_newline 288*2286d8edStholo && ROBUST_OUTPUT_STYLE (output_style)) 289*2286d8edStholo { 290*2286d8edStholo /* This line is incomplete. If this is significant, 291*2286d8edStholo put the line into bucket[-1]. */ 292*2286d8edStholo if (! (ignore_space_change_flag | ignore_all_space_flag)) 293*2286d8edStholo bucket = &buckets[-1]; 294*2286d8edStholo 295*2286d8edStholo /* Omit the inserted newline when computing linbuf later. */ 296*2286d8edStholo p--; 297*2286d8edStholo bufend = suffix_begin = (char const *) p; 298*2286d8edStholo } 299*2286d8edStholo 300*2286d8edStholo for (i = *bucket; ; i = eqs[i].next) 301*2286d8edStholo if (!i) 302*2286d8edStholo { 303*2286d8edStholo /* Create a new equivalence class in this bucket. */ 304*2286d8edStholo i = eqs_index++; 305*2286d8edStholo if (i == eqs_alloc) 306*2286d8edStholo eqs = (struct equivclass *) 307*2286d8edStholo xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs)); 308*2286d8edStholo eqs[i].next = *bucket; 309*2286d8edStholo eqs[i].hash = h; 310*2286d8edStholo eqs[i].line = ip; 311*2286d8edStholo eqs[i].length = length; 312*2286d8edStholo *bucket = i; 313*2286d8edStholo break; 314*2286d8edStholo } 315*2286d8edStholo else if (eqs[i].hash == h) 316*2286d8edStholo { 317*2286d8edStholo char const *eqline = eqs[i].line; 318*2286d8edStholo 319*2286d8edStholo /* Reuse existing equivalence class if the lines are identical. 320*2286d8edStholo This detects the common case of exact identity 321*2286d8edStholo faster than complete comparison would. */ 322*2286d8edStholo if (eqs[i].length == length && memcmp (eqline, ip, length) == 0) 323*2286d8edStholo break; 324*2286d8edStholo 325*2286d8edStholo /* Reuse existing class if line_cmp reports the lines equal. */ 326*2286d8edStholo if (use_line_cmp && line_cmp (eqline, ip) == 0) 327*2286d8edStholo break; 328*2286d8edStholo } 329*2286d8edStholo 330*2286d8edStholo /* Maybe increase the size of the line table. */ 331*2286d8edStholo if (line == alloc_lines) 332*2286d8edStholo { 333*2286d8edStholo /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 334*2286d8edStholo alloc_lines = 2 * alloc_lines - linbuf_base; 335*2286d8edStholo cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs)); 336*2286d8edStholo linbuf = (char const **) xrealloc (linbuf + linbuf_base, 337*2286d8edStholo (alloc_lines - linbuf_base) 338*2286d8edStholo * sizeof (*linbuf)) 339*2286d8edStholo - linbuf_base; 340*2286d8edStholo } 341*2286d8edStholo linbuf[line] = ip; 342*2286d8edStholo cureqs[line] = i; 343*2286d8edStholo ++line; 344*2286d8edStholo } 345*2286d8edStholo 346*2286d8edStholo current->buffered_lines = line; 347*2286d8edStholo 348*2286d8edStholo for (i = 0; ; i++) 349*2286d8edStholo { 350*2286d8edStholo /* Record the line start for lines in the suffix that we care about. 351*2286d8edStholo Record one more line start than lines, 352*2286d8edStholo so that we can compute the length of any buffered line. */ 353*2286d8edStholo if (line == alloc_lines) 354*2286d8edStholo { 355*2286d8edStholo /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 356*2286d8edStholo alloc_lines = 2 * alloc_lines - linbuf_base; 357*2286d8edStholo linbuf = (char const **) xrealloc (linbuf + linbuf_base, 358*2286d8edStholo (alloc_lines - linbuf_base) 359*2286d8edStholo * sizeof (*linbuf)) 360*2286d8edStholo - linbuf_base; 361*2286d8edStholo } 362*2286d8edStholo linbuf[line] = (char const *) p; 363*2286d8edStholo 364*2286d8edStholo if ((char const *) p == bufend) 365*2286d8edStholo break; 366*2286d8edStholo 367*2286d8edStholo if (context <= i && no_diff_means_no_output) 368*2286d8edStholo break; 369*2286d8edStholo 370*2286d8edStholo line++; 371*2286d8edStholo 372*2286d8edStholo while (*p++ != '\n') 373*2286d8edStholo ; 374*2286d8edStholo } 375*2286d8edStholo 376*2286d8edStholo /* Done with cache in local variables. */ 377*2286d8edStholo current->linbuf = linbuf; 378*2286d8edStholo current->valid_lines = line; 379*2286d8edStholo current->alloc_lines = alloc_lines; 380*2286d8edStholo current->equivs = cureqs; 381*2286d8edStholo equivs = eqs; 382*2286d8edStholo equivs_alloc = eqs_alloc; 383*2286d8edStholo equivs_index = eqs_index; 384*2286d8edStholo } 385*2286d8edStholo 386*2286d8edStholo /* Prepare the end of the text. Make sure it's initialized. 387*2286d8edStholo Make sure text ends in a newline, 388*2286d8edStholo but remember that we had to add one. */ 389*2286d8edStholo 390*2286d8edStholo static void 391*2286d8edStholo prepare_text_end (current) 392*2286d8edStholo struct file_data *current; 393*2286d8edStholo { 394*2286d8edStholo size_t buffered_chars = current->buffered_chars; 395*2286d8edStholo char *p = current->buffer; 396*2286d8edStholo 397*2286d8edStholo if (buffered_chars == 0 || p[buffered_chars - 1] == '\n') 398*2286d8edStholo current->missing_newline = 0; 399*2286d8edStholo else 400*2286d8edStholo { 401*2286d8edStholo p[buffered_chars++] = '\n'; 402*2286d8edStholo current->buffered_chars = buffered_chars; 403*2286d8edStholo current->missing_newline = 1; 404*2286d8edStholo } 405*2286d8edStholo 406*2286d8edStholo /* Don't use uninitialized storage when planting or using sentinels. */ 407*2286d8edStholo if (p) 408*2286d8edStholo bzero (p + buffered_chars, sizeof (word)); 409*2286d8edStholo } 410*2286d8edStholo 411*2286d8edStholo /* Given a vector of two file_data objects, find the identical 412*2286d8edStholo prefixes and suffixes of each object. */ 413*2286d8edStholo 414*2286d8edStholo static void 415*2286d8edStholo find_identical_ends (filevec) 416*2286d8edStholo struct file_data filevec[]; 417*2286d8edStholo { 418*2286d8edStholo word *w0, *w1; 419*2286d8edStholo char *p0, *p1, *buffer0, *buffer1; 420*2286d8edStholo char const *end0, *beg0; 421*2286d8edStholo char const **linbuf0, **linbuf1; 422*2286d8edStholo int i, lines; 423*2286d8edStholo size_t n0, n1, tem; 424*2286d8edStholo int alloc_lines0, alloc_lines1; 425*2286d8edStholo int buffered_prefix, prefix_count, prefix_mask; 426*2286d8edStholo 427*2286d8edStholo slurp (&filevec[0]); 428*2286d8edStholo if (filevec[0].desc != filevec[1].desc) 429*2286d8edStholo slurp (&filevec[1]); 430*2286d8edStholo else 431*2286d8edStholo { 432*2286d8edStholo filevec[1].buffer = filevec[0].buffer; 433*2286d8edStholo filevec[1].bufsize = filevec[0].bufsize; 434*2286d8edStholo filevec[1].buffered_chars = filevec[0].buffered_chars; 435*2286d8edStholo } 436*2286d8edStholo for (i = 0; i < 2; i++) 437*2286d8edStholo prepare_text_end (&filevec[i]); 438*2286d8edStholo 439*2286d8edStholo /* Find identical prefix. */ 440*2286d8edStholo 441*2286d8edStholo p0 = buffer0 = filevec[0].buffer; 442*2286d8edStholo p1 = buffer1 = filevec[1].buffer; 443*2286d8edStholo 444*2286d8edStholo n0 = filevec[0].buffered_chars; 445*2286d8edStholo n1 = filevec[1].buffered_chars; 446*2286d8edStholo 447*2286d8edStholo if (p0 == p1) 448*2286d8edStholo /* The buffers are the same; sentinels won't work. */ 449*2286d8edStholo p0 = p1 += n1; 450*2286d8edStholo else 451*2286d8edStholo { 452*2286d8edStholo /* Insert end sentinels, in this case characters that are guaranteed 453*2286d8edStholo to make the equality test false, and thus terminate the loop. */ 454*2286d8edStholo 455*2286d8edStholo if (n0 < n1) 456*2286d8edStholo p0[n0] = ~p1[n0]; 457*2286d8edStholo else 458*2286d8edStholo p1[n1] = ~p0[n1]; 459*2286d8edStholo 460*2286d8edStholo /* Loop until first mismatch, or to the sentinel characters. */ 461*2286d8edStholo 462*2286d8edStholo /* Compare a word at a time for speed. */ 463*2286d8edStholo w0 = (word *) p0; 464*2286d8edStholo w1 = (word *) p1; 465*2286d8edStholo while (*w0++ == *w1++) 466*2286d8edStholo ; 467*2286d8edStholo --w0, --w1; 468*2286d8edStholo 469*2286d8edStholo /* Do the last few bytes of comparison a byte at a time. */ 470*2286d8edStholo p0 = (char *) w0; 471*2286d8edStholo p1 = (char *) w1; 472*2286d8edStholo while (*p0++ == *p1++) 473*2286d8edStholo ; 474*2286d8edStholo --p0, --p1; 475*2286d8edStholo 476*2286d8edStholo /* Don't mistakenly count missing newline as part of prefix. */ 477*2286d8edStholo if (ROBUST_OUTPUT_STYLE (output_style) 478*2286d8edStholo && (buffer0 + n0 - filevec[0].missing_newline < p0) 479*2286d8edStholo != 480*2286d8edStholo (buffer1 + n1 - filevec[1].missing_newline < p1)) 481*2286d8edStholo --p0, --p1; 482*2286d8edStholo } 483*2286d8edStholo 484*2286d8edStholo /* Now P0 and P1 point at the first nonmatching characters. */ 485*2286d8edStholo 486*2286d8edStholo /* Skip back to last line-beginning in the prefix, 487*2286d8edStholo and then discard up to HORIZON_LINES lines from the prefix. */ 488*2286d8edStholo i = horizon_lines; 489*2286d8edStholo while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 490*2286d8edStholo --p0, --p1; 491*2286d8edStholo 492*2286d8edStholo /* Record the prefix. */ 493*2286d8edStholo filevec[0].prefix_end = p0; 494*2286d8edStholo filevec[1].prefix_end = p1; 495*2286d8edStholo 496*2286d8edStholo /* Find identical suffix. */ 497*2286d8edStholo 498*2286d8edStholo /* P0 and P1 point beyond the last chars not yet compared. */ 499*2286d8edStholo p0 = buffer0 + n0; 500*2286d8edStholo p1 = buffer1 + n1; 501*2286d8edStholo 502*2286d8edStholo if (! ROBUST_OUTPUT_STYLE (output_style) 503*2286d8edStholo || filevec[0].missing_newline == filevec[1].missing_newline) 504*2286d8edStholo { 505*2286d8edStholo end0 = p0; /* Addr of last char in file 0. */ 506*2286d8edStholo 507*2286d8edStholo /* Get value of P0 at which we should stop scanning backward: 508*2286d8edStholo this is when either P0 or P1 points just past the last char 509*2286d8edStholo of the identical prefix. */ 510*2286d8edStholo beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 511*2286d8edStholo 512*2286d8edStholo /* Scan back until chars don't match or we reach that point. */ 513*2286d8edStholo while (p0 != beg0) 514*2286d8edStholo if (*--p0 != *--p1) 515*2286d8edStholo { 516*2286d8edStholo /* Point at the first char of the matching suffix. */ 517*2286d8edStholo ++p0, ++p1; 518*2286d8edStholo beg0 = p0; 519*2286d8edStholo break; 520*2286d8edStholo } 521*2286d8edStholo 522*2286d8edStholo /* Are we at a line-beginning in both files? If not, add the rest of 523*2286d8edStholo this line to the main body. Discard up to HORIZON_LINES lines from 524*2286d8edStholo the identical suffix. Also, discard one extra line, 525*2286d8edStholo because shift_boundaries may need it. */ 526*2286d8edStholo i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 527*2286d8edStholo && 528*2286d8edStholo (buffer1 == p1 || p1[-1] == '\n')); 529*2286d8edStholo while (i-- && p0 != end0) 530*2286d8edStholo while (*p0++ != '\n') 531*2286d8edStholo ; 532*2286d8edStholo 533*2286d8edStholo p1 += p0 - beg0; 534*2286d8edStholo } 535*2286d8edStholo 536*2286d8edStholo /* Record the suffix. */ 537*2286d8edStholo filevec[0].suffix_begin = p0; 538*2286d8edStholo filevec[1].suffix_begin = p1; 539*2286d8edStholo 540*2286d8edStholo /* Calculate number of lines of prefix to save. 541*2286d8edStholo 542*2286d8edStholo prefix_count == 0 means save the whole prefix; 543*2286d8edStholo we need this with for options like -D that output the whole file. 544*2286d8edStholo We also need it for options like -F that output some preceding line; 545*2286d8edStholo at least we will need to find the last few lines, 546*2286d8edStholo but since we don't know how many, it's easiest to find them all. 547*2286d8edStholo 548*2286d8edStholo Otherwise, prefix_count != 0. Save just prefix_count lines at start 549*2286d8edStholo of the line buffer; they'll be moved to the proper location later. 550*2286d8edStholo Handle 1 more line than the context says (because we count 1 too many), 551*2286d8edStholo rounded up to the next power of 2 to speed index computation. */ 552*2286d8edStholo 553*2286d8edStholo if (no_diff_means_no_output && ! function_regexp_list) 554*2286d8edStholo { 555*2286d8edStholo for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2) 556*2286d8edStholo ; 557*2286d8edStholo prefix_mask = prefix_count - 1; 558*2286d8edStholo alloc_lines0 559*2286d8edStholo = prefix_count 560*2286d8edStholo + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end) 561*2286d8edStholo + context; 562*2286d8edStholo } 563*2286d8edStholo else 564*2286d8edStholo { 565*2286d8edStholo prefix_count = 0; 566*2286d8edStholo prefix_mask = ~0; 567*2286d8edStholo alloc_lines0 = GUESS_LINES (0, 0, n0); 568*2286d8edStholo } 569*2286d8edStholo 570*2286d8edStholo lines = 0; 571*2286d8edStholo linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0)); 572*2286d8edStholo 573*2286d8edStholo /* If the prefix is needed, find the prefix lines. */ 574*2286d8edStholo if (! (no_diff_means_no_output 575*2286d8edStholo && filevec[0].prefix_end == p0 576*2286d8edStholo && filevec[1].prefix_end == p1)) 577*2286d8edStholo { 578*2286d8edStholo p0 = buffer0; 579*2286d8edStholo end0 = filevec[0].prefix_end; 580*2286d8edStholo while (p0 != end0) 581*2286d8edStholo { 582*2286d8edStholo int l = lines++ & prefix_mask; 583*2286d8edStholo if (l == alloc_lines0) 584*2286d8edStholo linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2) 585*2286d8edStholo * sizeof(*linbuf0)); 586*2286d8edStholo linbuf0[l] = p0; 587*2286d8edStholo while (*p0++ != '\n') 588*2286d8edStholo ; 589*2286d8edStholo } 590*2286d8edStholo } 591*2286d8edStholo buffered_prefix = prefix_count && context < lines ? context : lines; 592*2286d8edStholo 593*2286d8edStholo /* Allocate line buffer 1. */ 594*2286d8edStholo tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1; 595*2286d8edStholo 596*2286d8edStholo alloc_lines1 597*2286d8edStholo = (buffered_prefix 598*2286d8edStholo + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem) 599*2286d8edStholo + context); 600*2286d8edStholo linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1)); 601*2286d8edStholo 602*2286d8edStholo if (buffered_prefix != lines) 603*2286d8edStholo { 604*2286d8edStholo /* Rotate prefix lines to proper location. */ 605*2286d8edStholo for (i = 0; i < buffered_prefix; i++) 606*2286d8edStholo linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 607*2286d8edStholo for (i = 0; i < buffered_prefix; i++) 608*2286d8edStholo linbuf0[i] = linbuf1[i]; 609*2286d8edStholo } 610*2286d8edStholo 611*2286d8edStholo /* Initialize line buffer 1 from line buffer 0. */ 612*2286d8edStholo for (i = 0; i < buffered_prefix; i++) 613*2286d8edStholo linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 614*2286d8edStholo 615*2286d8edStholo /* Record the line buffer, adjusted so that 616*2286d8edStholo linbuf*[0] points at the first differing line. */ 617*2286d8edStholo filevec[0].linbuf = linbuf0 + buffered_prefix; 618*2286d8edStholo filevec[1].linbuf = linbuf1 + buffered_prefix; 619*2286d8edStholo filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 620*2286d8edStholo filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 621*2286d8edStholo filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 622*2286d8edStholo filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 623*2286d8edStholo } 624*2286d8edStholo 625*2286d8edStholo /* Largest primes less than some power of two, for nbuckets. Values range 626*2286d8edStholo from useful to preposterous. If one of these numbers isn't prime 627*2286d8edStholo after all, don't blame it on me, blame it on primes (6) . . . */ 628*2286d8edStholo static int const primes[] = 629*2286d8edStholo { 630*2286d8edStholo 509, 631*2286d8edStholo 1021, 632*2286d8edStholo 2039, 633*2286d8edStholo 4093, 634*2286d8edStholo 8191, 635*2286d8edStholo 16381, 636*2286d8edStholo 32749, 637*2286d8edStholo #if 32767 < INT_MAX 638*2286d8edStholo 65521, 639*2286d8edStholo 131071, 640*2286d8edStholo 262139, 641*2286d8edStholo 524287, 642*2286d8edStholo 1048573, 643*2286d8edStholo 2097143, 644*2286d8edStholo 4194301, 645*2286d8edStholo 8388593, 646*2286d8edStholo 16777213, 647*2286d8edStholo 33554393, 648*2286d8edStholo 67108859, /* Preposterously large . . . */ 649*2286d8edStholo 134217689, 650*2286d8edStholo 268435399, 651*2286d8edStholo 536870909, 652*2286d8edStholo 1073741789, 653*2286d8edStholo 2147483647, 654*2286d8edStholo #endif 655*2286d8edStholo 0 656*2286d8edStholo }; 657*2286d8edStholo 658*2286d8edStholo /* Given a vector of two file_data objects, read the file associated 659*2286d8edStholo with each one, and build the table of equivalence classes. 660*2286d8edStholo Return 1 if either file appears to be a binary file. 661*2286d8edStholo If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 662*2286d8edStholo 663*2286d8edStholo int 664*2286d8edStholo read_files (filevec, pretend_binary) 665*2286d8edStholo struct file_data filevec[]; 666*2286d8edStholo int pretend_binary; 667*2286d8edStholo { 668*2286d8edStholo int i; 669*2286d8edStholo int skip_test = always_text_flag | pretend_binary; 670*2286d8edStholo int appears_binary = pretend_binary | sip (&filevec[0], skip_test); 671*2286d8edStholo 672*2286d8edStholo if (filevec[0].desc != filevec[1].desc) 673*2286d8edStholo appears_binary |= sip (&filevec[1], skip_test | appears_binary); 674*2286d8edStholo else 675*2286d8edStholo { 676*2286d8edStholo filevec[1].buffer = filevec[0].buffer; 677*2286d8edStholo filevec[1].bufsize = filevec[0].bufsize; 678*2286d8edStholo filevec[1].buffered_chars = filevec[0].buffered_chars; 679*2286d8edStholo } 680*2286d8edStholo if (appears_binary) 681*2286d8edStholo { 682*2286d8edStholo #if HAVE_SETMODE 683*2286d8edStholo setmode (filevec[0].desc, O_BINARY); 684*2286d8edStholo setmode (filevec[1].desc, O_BINARY); 685*2286d8edStholo #endif 686*2286d8edStholo return 1; 687*2286d8edStholo } 688*2286d8edStholo 689*2286d8edStholo find_identical_ends (filevec); 690*2286d8edStholo 691*2286d8edStholo equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 692*2286d8edStholo equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass)); 693*2286d8edStholo /* Equivalence class 0 is permanently safe for lines that were not 694*2286d8edStholo hashed. Real equivalence classes start at 1. */ 695*2286d8edStholo equivs_index = 1; 696*2286d8edStholo 697*2286d8edStholo for (i = 0; primes[i] < equivs_alloc / 3; i++) 698*2286d8edStholo if (! primes[i]) 699*2286d8edStholo abort (); 700*2286d8edStholo nbuckets = primes[i]; 701*2286d8edStholo 702*2286d8edStholo buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets)); 703*2286d8edStholo bzero (buckets++, (nbuckets + 1) * sizeof (*buckets)); 704*2286d8edStholo 705*2286d8edStholo for (i = 0; i < 2; i++) 706*2286d8edStholo find_and_hash_each_line (&filevec[i]); 707*2286d8edStholo 708*2286d8edStholo filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 709*2286d8edStholo 710*2286d8edStholo free (equivs); 711*2286d8edStholo free (buckets - 1); 712*2286d8edStholo 713*2286d8edStholo return 0; 714*2286d8edStholo } 715