xref: /dflybsd-src/contrib/diffutils/src/io.c (revision 99d38c703099701955c5b1808096ed712af780e9)
1 /* File I/O for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013
4    Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <binary-io.h>
23 #include <cmpbuf.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26 
27 /* Rotate an unsigned value to the left.  */
28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
29 
30 /* Given a hash value and a new character, return a new hash value.  */
31 #define HASH(h, c) ((c) + ROL (h, 7))
32 
33 /* The type of a hash value.  */
34 typedef size_t hash_value;
35 verify (! TYPE_SIGNED (hash_value));
36 
37 /* Lines are put into equivalence classes of lines that match in lines_differ.
38    Each equivalence class is represented by one of these structures,
39    but only while the classes are being computed.
40    Afterward, each class is represented by a number.  */
41 struct equivclass
42 {
43   lin next;		/* Next item in this bucket.  */
44   hash_value hash;	/* Hash of lines in this class.  */
45   char const *line;	/* A line that fits this class.  */
46   size_t length;	/* That line's length, not counting its newline.  */
47 };
48 
49 /* Hash-table: array of buckets, each being a chain of equivalence classes.
50    buckets[-1] is reserved for incomplete lines.  */
51 static lin *buckets;
52 
53 /* Number of buckets in the hash table array, not counting buckets[-1].  */
54 static size_t nbuckets;
55 
56 /* Array in which the equivalence classes are allocated.
57    The bucket-chains go through the elements in this array.
58    The number of an equivalence class is its index in this array.  */
59 static struct equivclass *equivs;
60 
61 /* Index of first free element in the array 'equivs'.  */
62 static lin equivs_index;
63 
64 /* Number of elements allocated in the array 'equivs'.  */
65 static lin equivs_alloc;
66 
67 /* Read a block of data into a file buffer, checking for EOF and error.  */
68 
69 void
70 file_block_read (struct file_data *current, size_t size)
71 {
72   if (size && ! current->eof)
73     {
74       size_t s = block_read (current->desc,
75 			     FILE_BUFFER (current) + current->buffered, size);
76       if (s == SIZE_MAX)
77 	pfatal_with_name (current->name);
78       current->buffered += s;
79       current->eof = s < size;
80     }
81 }
82 
83 /* Check for binary files and compare them for exact identity.  */
84 
85 /* Return 1 if BUF contains a non text character.
86    SIZE is the number of characters in BUF.  */
87 
88 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
89 
90 /* Get ready to read the current file.
91    Return nonzero if SKIP_TEST is zero,
92    and if it appears to be a binary file.  */
93 
94 static bool
95 sip (struct file_data *current, bool skip_test)
96 {
97   /* If we have a nonexistent file at this stage, treat it as empty.  */
98   if (current->desc < 0)
99     {
100       /* Leave room for a sentinel.  */
101       current->bufsize = sizeof (word);
102       current->buffer = xmalloc (current->bufsize);
103     }
104   else
105     {
106       current->bufsize = buffer_lcm (sizeof (word),
107 				     STAT_BLOCKSIZE (current->stat),
108 				     PTRDIFF_MAX - 2 * sizeof (word));
109       current->buffer = xmalloc (current->bufsize);
110 
111       if (! skip_test)
112 	{
113 	  /* Check first part of file to see if it's a binary file.  */
114 
115 	  int prev_mode = set_binary_mode (current->desc, O_BINARY);
116 	  off_t buffered;
117 	  file_block_read (current, current->bufsize);
118 	  buffered = current->buffered;
119 
120 	  if (prev_mode != O_BINARY)
121 	    {
122 	      /* Revert to text mode and seek back to the start to reread
123 		 the file.  Use relative seek, since file descriptors
124 		 like stdin might not start at offset zero.  */
125 	      if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
126 		pfatal_with_name (current->name);
127 	      set_binary_mode (current->desc, prev_mode);
128 	      current->buffered = 0;
129 	      current->eof = false;
130 	    }
131 
132 	  return binary_file_p (current->buffer, buffered);
133 	}
134     }
135 
136   current->buffered = 0;
137   current->eof = false;
138   return false;
139 }
140 
141 /* Slurp the rest of the current file completely into memory.  */
142 
143 static void
144 slurp (struct file_data *current)
145 {
146   size_t cc;
147 
148   if (current->desc < 0)
149     {
150       /* The file is nonexistent.  */
151       return;
152     }
153 
154   if (S_ISREG (current->stat.st_mode))
155     {
156       /* It's a regular file; slurp in the rest all at once.  */
157 
158       /* Get the size out of the stat block.
159 	 Allocate just enough room for appended newline plus word sentinel,
160 	 plus word-alignment since we want the buffer word-aligned.  */
161       size_t file_size = current->stat.st_size;
162       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
163       if (file_size != current->stat.st_size || cc < file_size
164 	  || PTRDIFF_MAX <= cc)
165 	xalloc_die ();
166 
167       if (current->bufsize < cc)
168 	{
169 	  current->bufsize = cc;
170 	  current->buffer = xrealloc (current->buffer, cc);
171 	}
172 
173       /* Try to read at least 1 more byte than the size indicates, to
174 	 detect whether the file is growing.  This is a nicety for
175 	 users who run 'diff' on files while they are changing.  */
176 
177       if (current->buffered <= file_size)
178 	{
179 	  file_block_read (current, file_size + 1 - current->buffered);
180 	  if (current->buffered <= file_size)
181 	    return;
182 	}
183     }
184 
185   /* It's not a regular file, or it's a growing regular file; read it,
186      growing the buffer as needed.  */
187 
188   file_block_read (current, current->bufsize - current->buffered);
189 
190   if (current->buffered)
191     {
192       while (current->buffered == current->bufsize)
193 	{
194 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
195 	    xalloc_die ();
196 	  current->bufsize *= 2;
197 	  current->buffer = xrealloc (current->buffer, current->bufsize);
198 	  file_block_read (current, current->bufsize - current->buffered);
199 	}
200 
201       /* Allocate just enough room for appended newline plus word
202 	 sentinel, plus word-alignment.  */
203       cc = current->buffered + 2 * sizeof (word);
204       current->bufsize = cc - cc % sizeof (word);
205       current->buffer = xrealloc (current->buffer, current->bufsize);
206     }
207 }
208 
209 /* Split the file into lines, simultaneously computing the equivalence
210    class for each line.  */
211 
212 static void
213 find_and_hash_each_line (struct file_data *current)
214 {
215   char const *p = current->prefix_end;
216   lin i, *bucket;
217   size_t length;
218 
219   /* Cache often-used quantities in local variables to help the compiler.  */
220   char const **linbuf = current->linbuf;
221   lin alloc_lines = current->alloc_lines;
222   lin line = 0;
223   lin linbuf_base = current->linbuf_base;
224   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
225   struct equivclass *eqs = equivs;
226   lin eqs_index = equivs_index;
227   lin eqs_alloc = equivs_alloc;
228   char const *suffix_begin = current->suffix_begin;
229   char const *bufend = FILE_BUFFER (current) + current->buffered;
230   bool ig_case = ignore_case;
231   enum DIFF_white_space ig_white_space = ignore_white_space;
232   bool diff_length_compare_anyway =
233     ig_white_space != IGNORE_NO_WHITE_SPACE;
234   bool same_length_diff_contents_compare_anyway =
235     diff_length_compare_anyway | ig_case;
236 
237   while (p < suffix_begin)
238     {
239       char const *ip = p;
240       hash_value h = 0;
241       unsigned char c;
242 
243       /* Hash this line until we find a newline.  */
244       switch (ig_white_space)
245 	{
246 	case IGNORE_ALL_SPACE:
247 	  while ((c = *p++) != '\n')
248 	    if (! isspace (c))
249 	      h = HASH (h, ig_case ? tolower (c) : c);
250 	  break;
251 
252 	case IGNORE_SPACE_CHANGE:
253 	  while ((c = *p++) != '\n')
254 	    {
255 	      if (isspace (c))
256 		{
257 		  do
258 		    if ((c = *p++) == '\n')
259 		      goto hashing_done;
260 		  while (isspace (c));
261 
262 		  h = HASH (h, ' ');
263 		}
264 
265 	      /* C is now the first non-space.  */
266 	      h = HASH (h, ig_case ? tolower (c) : c);
267 	    }
268 	  break;
269 
270 	case IGNORE_TAB_EXPANSION:
271 	case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
272 	case IGNORE_TRAILING_SPACE:
273 	  {
274 	    size_t column = 0;
275 	    while ((c = *p++) != '\n')
276 	      {
277 		if (ig_white_space & IGNORE_TRAILING_SPACE
278 		    && isspace (c))
279 		  {
280 		    char const *p1 = p;
281 		    unsigned char c1;
282 		    do
283 		      if ((c1 = *p1++) == '\n')
284 			{
285 			  p = p1;
286 			  goto hashing_done;
287 			}
288 		    while (isspace (c1));
289 		  }
290 
291 		size_t repetitions = 1;
292 
293 		if (ig_white_space & IGNORE_TAB_EXPANSION)
294 		  switch (c)
295 		    {
296 		    case '\b':
297 		      column -= 0 < column;
298 		      break;
299 
300 		    case '\t':
301 		      c = ' ';
302 		      repetitions = tabsize - column % tabsize;
303 		      column = (column + repetitions < column
304 				? 0
305 				: column + repetitions);
306 		      break;
307 
308 		    case '\r':
309 		      column = 0;
310 		      break;
311 
312 		    default:
313 		      column++;
314 		      break;
315 		    }
316 
317 		if (ig_case)
318 		  c = tolower (c);
319 
320 		do
321 		  h = HASH (h, c);
322 		while (--repetitions != 0);
323 	      }
324 	  }
325 	  break;
326 
327 	default:
328 	  if (ig_case)
329 	    while ((c = *p++) != '\n')
330 	      h = HASH (h, tolower (c));
331 	  else
332 	    while ((c = *p++) != '\n')
333 	      h = HASH (h, c);
334 	  break;
335 	}
336 
337    hashing_done:;
338 
339       bucket = &buckets[h % nbuckets];
340       length = p - ip - 1;
341 
342       if (p == bufend
343 	  && current->missing_newline
344 	  && ROBUST_OUTPUT_STYLE (output_style))
345 	{
346 	  /* The last line is incomplete and we do not silently
347 	     complete lines.  If the line cannot compare equal to any
348 	     complete line, put it into buckets[-1] so that it can
349 	     compare equal only to the other file's incomplete line
350 	     (if one exists).  */
351 	  if (ig_white_space < IGNORE_TRAILING_SPACE)
352 	    bucket = &buckets[-1];
353 	}
354 
355       for (i = *bucket;  ;  i = eqs[i].next)
356 	if (!i)
357 	  {
358 	    /* Create a new equivalence class in this bucket.  */
359 	    i = eqs_index++;
360 	    if (i == eqs_alloc)
361 	      {
362 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
363 		  xalloc_die ();
364 		eqs_alloc *= 2;
365 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
366 	      }
367 	    eqs[i].next = *bucket;
368 	    eqs[i].hash = h;
369 	    eqs[i].line = ip;
370 	    eqs[i].length = length;
371 	    *bucket = i;
372 	    break;
373 	  }
374 	else if (eqs[i].hash == h)
375 	  {
376 	    char const *eqline = eqs[i].line;
377 
378 	    /* Reuse existing class if lines_differ reports the lines
379                equal.  */
380 	    if (eqs[i].length == length)
381 	      {
382 		/* Reuse existing equivalence class if the lines are identical.
383 		   This detects the common case of exact identity
384 		   faster than lines_differ would.  */
385 		if (memcmp (eqline, ip, length) == 0)
386 		  break;
387 		if (!same_length_diff_contents_compare_anyway)
388 		  continue;
389 	      }
390 	    else if (!diff_length_compare_anyway)
391 	      continue;
392 
393 	    if (! lines_differ (eqline, ip))
394 	      break;
395 	  }
396 
397       /* Maybe increase the size of the line table.  */
398       if (line == alloc_lines)
399 	{
400 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
401 	  if (PTRDIFF_MAX / 3 <= alloc_lines
402 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
403 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
404 	    xalloc_die ();
405 	  alloc_lines = 2 * alloc_lines - linbuf_base;
406 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
407 	  linbuf += linbuf_base;
408 	  linbuf = xrealloc (linbuf,
409 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
410 	  linbuf -= linbuf_base;
411 	}
412       linbuf[line] = ip;
413       cureqs[line] = i;
414       ++line;
415     }
416 
417   current->buffered_lines = line;
418 
419   for (i = 0;  ;  i++)
420     {
421       /* Record the line start for lines in the suffix that we care about.
422 	 Record one more line start than lines,
423 	 so that we can compute the length of any buffered line.  */
424       if (line == alloc_lines)
425 	{
426 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
427 	  if (PTRDIFF_MAX / 3 <= alloc_lines
428 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
429 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
430 	    xalloc_die ();
431 	  alloc_lines = 2 * alloc_lines - linbuf_base;
432 	  linbuf += linbuf_base;
433 	  linbuf = xrealloc (linbuf,
434 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
435 	  linbuf -= linbuf_base;
436 	}
437       linbuf[line] = p;
438 
439       if (p == bufend)
440 	{
441 	  /* If the last line is incomplete and we do not silently
442 	     complete lines, don't count its appended newline.  */
443 	  if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
444 	    linbuf[line]--;
445 	  break;
446 	}
447 
448       if (context <= i && no_diff_means_no_output)
449 	break;
450 
451       line++;
452 
453       while (*p++ != '\n')
454 	continue;
455     }
456 
457   /* Done with cache in local variables.  */
458   current->linbuf = linbuf;
459   current->valid_lines = line;
460   current->alloc_lines = alloc_lines;
461   current->equivs = cureqs;
462   equivs = eqs;
463   equivs_alloc = eqs_alloc;
464   equivs_index = eqs_index;
465 }
466 
467 /* Prepare the text.  Make sure the text end is initialized.
468    Make sure text ends in a newline,
469    but remember that we had to add one.
470    Strip trailing CRs, if that was requested.  */
471 
472 static void
473 prepare_text (struct file_data *current)
474 {
475   size_t buffered = current->buffered;
476   char *p = FILE_BUFFER (current);
477   char *dst;
478 
479   if (buffered == 0 || p[buffered - 1] == '\n')
480     current->missing_newline = false;
481   else
482     {
483       p[buffered++] = '\n';
484       current->missing_newline = true;
485     }
486 
487   if (!p)
488     return;
489 
490   /* Don't use uninitialized storage when planting or using sentinels.  */
491   memset (p + buffered, 0, sizeof (word));
492 
493   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
494     {
495       char const *src = dst;
496       char const *srclim = p + buffered;
497 
498       do
499 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
500       while (src < srclim);
501 
502       buffered -= src - dst;
503     }
504 
505   current->buffered = buffered;
506 }
507 
508 /* We have found N lines in a buffer of size S; guess the
509    proportionate number of lines that will be found in a buffer of
510    size T.  However, do not guess a number of lines so large that the
511    resulting line table might cause overflow in size calculations.  */
512 static lin
513 guess_lines (lin n, size_t s, size_t t)
514 {
515   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
516   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
517   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
518 }
519 
520 /* Given a vector of two file_data objects, find the identical
521    prefixes and suffixes of each object.  */
522 
523 static void
524 find_identical_ends (struct file_data filevec[])
525 {
526   word *w0, *w1;
527   char *p0, *p1, *buffer0, *buffer1;
528   char const *end0, *beg0;
529   char const **linbuf0, **linbuf1;
530   lin i, lines;
531   size_t n0, n1;
532   lin alloc_lines0, alloc_lines1;
533   lin buffered_prefix, prefix_count, prefix_mask;
534   lin middle_guess, suffix_guess;
535 
536   slurp (&filevec[0]);
537   prepare_text (&filevec[0]);
538   if (filevec[0].desc != filevec[1].desc)
539     {
540       slurp (&filevec[1]);
541       prepare_text (&filevec[1]);
542     }
543   else
544     {
545       filevec[1].buffer = filevec[0].buffer;
546       filevec[1].bufsize = filevec[0].bufsize;
547       filevec[1].buffered = filevec[0].buffered;
548       filevec[1].missing_newline = filevec[0].missing_newline;
549     }
550 
551   /* Find identical prefix.  */
552 
553   w0 = filevec[0].buffer;
554   w1 = filevec[1].buffer;
555   p0 = buffer0 = (char *) w0;
556   p1 = buffer1 = (char *) w1;
557   n0 = filevec[0].buffered;
558   n1 = filevec[1].buffered;
559 
560   if (p0 == p1)
561     /* The buffers are the same; sentinels won't work.  */
562     p0 = p1 += n1;
563   else
564     {
565       /* Insert end sentinels, in this case characters that are guaranteed
566 	 to make the equality test false, and thus terminate the loop.  */
567 
568       if (n0 < n1)
569 	p0[n0] = ~p1[n0];
570       else
571 	p1[n1] = ~p0[n1];
572 
573       /* Loop until first mismatch, or to the sentinel characters.  */
574 
575       /* Compare a word at a time for speed.  */
576       while (*w0 == *w1)
577 	w0++, w1++;
578 
579       /* Do the last few bytes of comparison a byte at a time.  */
580       p0 = (char *) w0;
581       p1 = (char *) w1;
582       while (*p0 == *p1)
583 	p0++, p1++;
584 
585       /* Don't mistakenly count missing newline as part of prefix.  */
586       if (ROBUST_OUTPUT_STYLE (output_style)
587 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
588 	      !=
589 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
590 	p0--, p1--;
591     }
592 
593   /* Now P0 and P1 point at the first nonmatching characters.  */
594 
595   /* Skip back to last line-beginning in the prefix,
596      and then discard up to HORIZON_LINES lines from the prefix.  */
597   i = horizon_lines;
598   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
599     p0--, p1--;
600 
601   /* Record the prefix.  */
602   filevec[0].prefix_end = p0;
603   filevec[1].prefix_end = p1;
604 
605   /* Find identical suffix.  */
606 
607   /* P0 and P1 point beyond the last chars not yet compared.  */
608   p0 = buffer0 + n0;
609   p1 = buffer1 + n1;
610 
611   if (! ROBUST_OUTPUT_STYLE (output_style)
612       || filevec[0].missing_newline == filevec[1].missing_newline)
613     {
614       end0 = p0;	/* Addr of last char in file 0.  */
615 
616       /* Get value of P0 at which we should stop scanning backward:
617 	 this is when either P0 or P1 points just past the last char
618 	 of the identical prefix.  */
619       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
620 
621       /* Scan back until chars don't match or we reach that point.  */
622       while (p0 != beg0)
623 	if (*--p0 != *--p1)
624 	  {
625 	    /* Point at the first char of the matching suffix.  */
626 	    ++p0, ++p1;
627 	    beg0 = p0;
628 	    break;
629 	  }
630 
631       /* Are we at a line-beginning in both files?  If not, add the rest of
632 	 this line to the main body.  Discard up to HORIZON_LINES lines from
633 	 the identical suffix.  Also, discard one extra line,
634 	 because shift_boundaries may need it.  */
635       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
636 			    &&
637 			    (buffer1 == p1 || p1[-1] == '\n'));
638       while (i-- && p0 != end0)
639 	while (*p0++ != '\n')
640 	  continue;
641 
642       p1 += p0 - beg0;
643     }
644 
645   /* Record the suffix.  */
646   filevec[0].suffix_begin = p0;
647   filevec[1].suffix_begin = p1;
648 
649   /* Calculate number of lines of prefix to save.
650 
651      prefix_count == 0 means save the whole prefix;
652      we need this for options like -D that output the whole file,
653      or for enormous contexts (to avoid worrying about arithmetic overflow).
654      We also need it for options like -F that output some preceding line;
655      at least we will need to find the last few lines,
656      but since we don't know how many, it's easiest to find them all.
657 
658      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
659      of the line buffer; they'll be moved to the proper location later.
660      Handle 1 more line than the context says (because we count 1 too many),
661      rounded up to the next power of 2 to speed index computation.  */
662 
663   if (no_diff_means_no_output && ! function_regexp.fastmap
664       && context < LIN_MAX / 4 && context < n0)
665     {
666       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
667       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
668       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
669 	continue;
670       alloc_lines0 = (prefix_count + middle_guess
671 		      + MIN (context, suffix_guess));
672     }
673   else
674     {
675       prefix_count = 0;
676       alloc_lines0 = guess_lines (0, 0, n0);
677     }
678 
679   prefix_mask = prefix_count - 1;
680   lines = 0;
681   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
682   p0 = buffer0;
683 
684   /* If the prefix is needed, find the prefix lines.  */
685   if (! (no_diff_means_no_output
686 	 && filevec[0].prefix_end == p0
687 	 && filevec[1].prefix_end == p1))
688     {
689       end0 = filevec[0].prefix_end;
690       while (p0 != end0)
691 	{
692 	  lin l = lines++ & prefix_mask;
693 	  if (l == alloc_lines0)
694 	    {
695 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
696 		xalloc_die ();
697 	      alloc_lines0 *= 2;
698 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
699 	    }
700 	  linbuf0[l] = p0;
701 	  while (*p0++ != '\n')
702 	    continue;
703 	}
704     }
705   buffered_prefix = prefix_count && context < lines ? context : lines;
706 
707   /* Allocate line buffer 1.  */
708 
709   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
710   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
711   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
712   if (alloc_lines1 < buffered_prefix
713       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
714     xalloc_die ();
715   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
716 
717   if (buffered_prefix != lines)
718     {
719       /* Rotate prefix lines to proper location.  */
720       for (i = 0;  i < buffered_prefix;  i++)
721 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
722       for (i = 0;  i < buffered_prefix;  i++)
723 	linbuf0[i] = linbuf1[i];
724     }
725 
726   /* Initialize line buffer 1 from line buffer 0.  */
727   for (i = 0; i < buffered_prefix; i++)
728     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
729 
730   /* Record the line buffer, adjusted so that
731      linbuf[0] points at the first differing line.  */
732   filevec[0].linbuf = linbuf0 + buffered_prefix;
733   filevec[1].linbuf = linbuf1 + buffered_prefix;
734   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
735   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
736   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
737   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
738 }
739 
740 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
741    than 2**k.  This table is derived from Chris K. Caldwell's list
742    <http://www.utm.edu/research/primes/lists/2small/>.  */
743 
744 static unsigned char const prime_offset[] =
745 {
746   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
747   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
748   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
749   55, 93, 1, 57, 25
750 };
751 
752 /* Verify that this host's size_t is not too wide for the above table.  */
753 
754 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
755 
756 /* Given a vector of two file_data objects, read the file associated
757    with each one, and build the table of equivalence classes.
758    Return nonzero if either file appears to be a binary file.
759    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
760 
761 bool
762 read_files (struct file_data filevec[], bool pretend_binary)
763 {
764   int i;
765   bool skip_test = text | pretend_binary;
766   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
767 
768   if (filevec[0].desc != filevec[1].desc)
769     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
770   else
771     {
772       filevec[1].buffer = filevec[0].buffer;
773       filevec[1].bufsize = filevec[0].bufsize;
774       filevec[1].buffered = filevec[0].buffered;
775     }
776   if (appears_binary)
777     {
778       set_binary_mode (filevec[0].desc, O_BINARY);
779       set_binary_mode (filevec[1].desc, O_BINARY);
780       return true;
781     }
782 
783   find_identical_ends (filevec);
784 
785   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
786   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
787     xalloc_die ();
788   equivs = xmalloc (equivs_alloc * sizeof *equivs);
789   /* Equivalence class 0 is permanently safe for lines that were not
790      hashed.  Real equivalence classes start at 1.  */
791   equivs_index = 1;
792 
793   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
794      number between 1/3 and 2/3 of the value of equiv_allocs,
795      approximately.  */
796   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
797     continue;
798   nbuckets = ((size_t) 1 << i) - prime_offset[i];
799   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
800     xalloc_die ();
801   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
802   buckets++;
803 
804   for (i = 0; i < 2; i++)
805     find_and_hash_each_line (&filevec[i]);
806 
807   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
808 
809   free (equivs);
810   free (buckets - 1);
811 
812   return false;
813 }
814