xref: /netbsd-src/external/gpl2/diffutils/dist/src/io.c (revision ed75d7a867996c84cfa88e3b8906816277e957f7)
1 /*	$NetBSD: io.c,v 1.1.1.1 2016/01/13 03:15:30 christos Exp $	*/
2 
3 /* File I/O for GNU DIFF.
4 
5    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
6    Free Software Foundation, Inc.
7 
8    This file is part of GNU DIFF.
9 
10    GNU DIFF is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14 
15    GNU DIFF is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; see the file COPYING.
22    If not, write to the Free Software Foundation,
23    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
24 
25 #include "diff.h"
26 #include <cmpbuf.h>
27 #include <regex.h>
28 #include <setmode.h>
29 #include <xalloc.h>
30 
31 /* Rotate an unsigned value to the left.  */
32 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
33 
34 /* Given a hash value and a new character, return a new hash value.  */
35 #define HASH(h, c) ((c) + ROL (h, 7))
36 
37 /* The type of a hash value.  */
38 typedef size_t hash_value;
39 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
40 
41 /* Lines are put into equivalence classes of lines that match in lines_differ.
42    Each equivalence class is represented by one of these structures,
43    but only while the classes are being computed.
44    Afterward, each class is represented by a number.  */
45 struct equivclass
46 {
47   lin next;		/* Next item in this bucket.  */
48   hash_value hash;	/* Hash of lines in this class.  */
49   char const *line;	/* A line that fits this class.  */
50   size_t length;	/* That line's length, not counting its newline.  */
51 };
52 
53 /* Hash-table: array of buckets, each being a chain of equivalence classes.
54    buckets[-1] is reserved for incomplete lines.  */
55 static lin *buckets;
56 
57 /* Number of buckets in the hash table array, not counting buckets[-1].  */
58 static size_t nbuckets;
59 
60 /* Array in which the equivalence classes are allocated.
61    The bucket-chains go through the elements in this array.
62    The number of an equivalence class is its index in this array.  */
63 static struct equivclass *equivs;
64 
65 /* Index of first free element in the array `equivs'.  */
66 static lin equivs_index;
67 
68 /* Number of elements allocated in the array `equivs'.  */
69 static lin equivs_alloc;
70 
71 /* Read a block of data into a file buffer, checking for EOF and error.  */
72 
73 void
74 file_block_read (struct file_data *current, size_t size)
75 {
76   if (size && ! current->eof)
77     {
78       size_t s = block_read (current->desc,
79 			     FILE_BUFFER (current) + current->buffered, size);
80       if (s == SIZE_MAX)
81 	pfatal_with_name (current->name);
82       current->buffered += s;
83       current->eof = s < size;
84     }
85 }
86 
87 /* Check for binary files and compare them for exact identity.  */
88 
89 /* Return 1 if BUF contains a non text character.
90    SIZE is the number of characters in BUF.  */
91 
92 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
93 
94 /* Get ready to read the current file.
95    Return nonzero if SKIP_TEST is zero,
96    and if it appears to be a binary file.  */
97 
98 static bool
99 sip (struct file_data *current, bool skip_test)
100 {
101   /* If we have a nonexistent file at this stage, treat it as empty.  */
102   if (current->desc < 0)
103     {
104       /* Leave room for a sentinel.  */
105       current->bufsize = sizeof (word);
106       current->buffer = xmalloc (current->bufsize);
107     }
108   else
109     {
110       current->bufsize = buffer_lcm (sizeof (word),
111 				     STAT_BLOCKSIZE (current->stat),
112 				     PTRDIFF_MAX - 2 * sizeof (word));
113       current->buffer = xmalloc (current->bufsize);
114 
115       if (! skip_test)
116 	{
117 	  /* Check first part of file to see if it's a binary file.  */
118 
119 	  bool was_binary = set_binary_mode (current->desc, 1);
120 	  off_t buffered;
121 	  file_block_read (current, current->bufsize);
122 	  buffered = current->buffered;
123 
124 	  if (! was_binary)
125 	    {
126 	      /* Revert to text mode and seek back to the beginning to
127 		 reread the file.  Use relative seek, since file
128 		 descriptors like stdin might not start at offset
129 		 zero.  */
130 
131 	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
132 		pfatal_with_name (current->name);
133 	      set_binary_mode (current->desc, 0);
134 	      current->buffered = 0;
135 	      current->eof = 0;
136 	    }
137 
138 	  return binary_file_p (current->buffer, buffered);
139 	}
140     }
141 
142   current->buffered = 0;
143   current->eof = 0;
144   return 0;
145 }
146 
147 /* Slurp the rest of the current file completely into memory.  */
148 
149 static void
150 slurp (struct file_data *current)
151 {
152   size_t cc;
153 
154   if (current->desc < 0)
155     {
156       /* The file is nonexistent.  */
157       return;
158     }
159 
160   if (S_ISREG (current->stat.st_mode))
161     {
162       /* It's a regular file; slurp in the rest all at once.  */
163 
164       /* Get the size out of the stat block.
165 	 Allocate just enough room for appended newline plus word sentinel,
166 	 plus word-alignment since we want the buffer word-aligned.  */
167       size_t file_size = current->stat.st_size;
168       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
169       if (file_size != current->stat.st_size || cc < file_size
170 	  || PTRDIFF_MAX <= cc)
171 	xalloc_die ();
172 
173       if (current->bufsize < cc)
174 	{
175 	  current->bufsize = cc;
176 	  current->buffer = xrealloc (current->buffer, cc);
177 	}
178 
179       /* Try to read at least 1 more byte than the size indicates, to
180 	 detect whether the file is growing.  This is a nicety for
181 	 users who run 'diff' on files while they are changing.  */
182 
183       if (current->buffered <= file_size)
184 	{
185 	  file_block_read (current, file_size + 1 - current->buffered);
186 	  if (current->buffered <= file_size)
187 	    return;
188 	}
189     }
190 
191   /* It's not a regular file, or it's a growing regular file; read it,
192      growing the buffer as needed.  */
193 
194   file_block_read (current, current->bufsize - current->buffered);
195 
196   if (current->buffered)
197     {
198       while (current->buffered == current->bufsize)
199 	{
200 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
201 	    xalloc_die ();
202 	  current->bufsize *= 2;
203 	  current->buffer = xrealloc (current->buffer, current->bufsize);
204 	  file_block_read (current, current->bufsize - current->buffered);
205 	}
206 
207       /* Allocate just enough room for appended newline plus word
208 	 sentinel, plus word-alignment.  */
209       cc = current->buffered + 2 * sizeof (word);
210       current->bufsize = cc - cc % sizeof (word);
211       current->buffer = xrealloc (current->buffer, current->bufsize);
212     }
213 }
214 
215 /* Split the file into lines, simultaneously computing the equivalence
216    class for each line.  */
217 
218 static void
219 find_and_hash_each_line (struct file_data *current)
220 {
221   hash_value h;
222   unsigned char const *p = (unsigned char const *) current->prefix_end;
223   unsigned char c;
224   lin i, *bucket;
225   size_t length;
226 
227   /* Cache often-used quantities in local variables to help the compiler.  */
228   char const **linbuf = current->linbuf;
229   lin alloc_lines = current->alloc_lines;
230   lin line = 0;
231   lin linbuf_base = current->linbuf_base;
232   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
233   struct equivclass *eqs = equivs;
234   lin eqs_index = equivs_index;
235   lin eqs_alloc = equivs_alloc;
236   char const *suffix_begin = current->suffix_begin;
237   char const *bufend = FILE_BUFFER (current) + current->buffered;
238   bool diff_length_compare_anyway =
239     ignore_white_space != IGNORE_NO_WHITE_SPACE;
240   bool same_length_diff_contents_compare_anyway =
241     diff_length_compare_anyway | ignore_case;
242 
243   while ((char const *) p < suffix_begin)
244     {
245       char const *ip = (char const *) p;
246 
247       h = 0;
248 
249       /* Hash this line until we find a newline.  */
250       if (ignore_case)
251 	switch (ignore_white_space)
252 	  {
253 	  case IGNORE_ALL_SPACE:
254 	    while ((c = *p++) != '\n')
255 	      if (! ISSPACE (c))
256 		h = HASH (h, TOLOWER (c));
257 	    break;
258 
259 	  case IGNORE_SPACE_CHANGE:
260 	    while ((c = *p++) != '\n')
261 	      {
262 		if (ISSPACE (c))
263 		  {
264 		    do
265 		      if ((c = *p++) == '\n')
266 			goto hashing_done;
267 		    while (ISSPACE (c));
268 
269 		    h = HASH (h, ' ');
270 		  }
271 
272 		/* C is now the first non-space.  */
273 		h = HASH (h, TOLOWER (c));
274 	      }
275 	    break;
276 
277 	  case IGNORE_TAB_EXPANSION:
278 	    {
279 	      size_t column = 0;
280 	      while ((c = *p++) != '\n')
281 		{
282 		  int repetitions = 1;
283 
284 		  switch (c)
285 		    {
286 		    case '\b':
287 		      column -= 0 < column;
288 		      break;
289 
290 		    case '\t':
291 		      c = ' ';
292 		      repetitions = TAB_WIDTH - column % TAB_WIDTH;
293 		      column += repetitions;
294 		      break;
295 
296 		    case '\r':
297 		      column = 0;
298 		      break;
299 
300 		    default:
301 		      c = TOLOWER (c);
302 		      column++;
303 		      break;
304 		    }
305 
306 		  do
307 		    h = HASH (h, c);
308 		  while (--repetitions != 0);
309 		}
310 	    }
311 	    break;
312 
313 	  default:
314 	    while ((c = *p++) != '\n')
315 	      h = HASH (h, TOLOWER (c));
316 	    break;
317 	  }
318       else
319 	switch (ignore_white_space)
320 	  {
321 	  case IGNORE_ALL_SPACE:
322 	    while ((c = *p++) != '\n')
323 	      if (! ISSPACE (c))
324 		h = HASH (h, c);
325 	    break;
326 
327 	  case IGNORE_SPACE_CHANGE:
328 	    while ((c = *p++) != '\n')
329 	      {
330 		if (ISSPACE (c))
331 		  {
332 		    do
333 		      if ((c = *p++) == '\n')
334 			goto hashing_done;
335 		    while (ISSPACE (c));
336 
337 		    h = HASH (h, ' ');
338 		  }
339 
340 		/* C is now the first non-space.  */
341 		h = HASH (h, c);
342 	      }
343 	    break;
344 
345 	  case IGNORE_TAB_EXPANSION:
346 	    {
347 	      size_t column = 0;
348 	      while ((c = *p++) != '\n')
349 		{
350 		  int repetitions = 1;
351 
352 		  switch (c)
353 		    {
354 		    case '\b':
355 		      column -= 0 < column;
356 		      break;
357 
358 		    case '\t':
359 		      c = ' ';
360 		      repetitions = TAB_WIDTH - column % TAB_WIDTH;
361 		      column += repetitions;
362 		      break;
363 
364 		    case '\r':
365 		      column = 0;
366 		      break;
367 
368 		    default:
369 		      column++;
370 		      break;
371 		    }
372 
373 		  do
374 		    h = HASH (h, c);
375 		  while (--repetitions != 0);
376 		}
377 	    }
378 	    break;
379 
380 	  default:
381 	    while ((c = *p++) != '\n')
382 	      h = HASH (h, c);
383 	    break;
384 	  }
385 
386    hashing_done:;
387 
388       bucket = &buckets[h % nbuckets];
389       length = (char const *) p - ip - 1;
390 
391       if ((char const *) p == bufend
392 	  && current->missing_newline
393 	  && ROBUST_OUTPUT_STYLE (output_style))
394 	{
395 	  /* This line is incomplete.  If this is significant,
396 	     put the line into buckets[-1].  */
397 	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
398 	    bucket = &buckets[-1];
399 
400 	  /* Omit the inserted newline when computing linbuf later.  */
401 	  p--;
402 	  bufend = suffix_begin = (char const *) p;
403 	}
404 
405       for (i = *bucket;  ;  i = eqs[i].next)
406 	if (!i)
407 	  {
408 	    /* Create a new equivalence class in this bucket.  */
409 	    i = eqs_index++;
410 	    if (i == eqs_alloc)
411 	      {
412 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
413 		  xalloc_die ();
414 		eqs_alloc *= 2;
415 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
416 	      }
417 	    eqs[i].next = *bucket;
418 	    eqs[i].hash = h;
419 	    eqs[i].line = ip;
420 	    eqs[i].length = length;
421 	    *bucket = i;
422 	    break;
423 	  }
424 	else if (eqs[i].hash == h)
425 	  {
426 	    char const *eqline = eqs[i].line;
427 
428 	    /* Reuse existing class if lines_differ reports the lines
429                equal.  */
430 	    if (eqs[i].length == length)
431 	      {
432 		/* Reuse existing equivalence class if the lines are identical.
433 		   This detects the common case of exact identity
434 		   faster than lines_differ would.  */
435 		if (memcmp (eqline, ip, length) == 0)
436 		  break;
437 		if (!same_length_diff_contents_compare_anyway)
438 		  continue;
439 	      }
440 	    else if (!diff_length_compare_anyway)
441 	      continue;
442 
443 	    if (! lines_differ (eqline, ip))
444 	      break;
445 	  }
446 
447       /* Maybe increase the size of the line table.  */
448       if (line == alloc_lines)
449 	{
450 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
451 	  if (PTRDIFF_MAX / 3 <= alloc_lines
452 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
453 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
454 	    xalloc_die ();
455 	  alloc_lines = 2 * alloc_lines - linbuf_base;
456 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
457 	  linbuf += linbuf_base;
458 	  linbuf = xrealloc (linbuf,
459 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
460 	  linbuf -= linbuf_base;
461 	}
462       linbuf[line] = ip;
463       cureqs[line] = i;
464       ++line;
465     }
466 
467   current->buffered_lines = line;
468 
469   for (i = 0;  ;  i++)
470     {
471       /* Record the line start for lines in the suffix that we care about.
472 	 Record one more line start than lines,
473 	 so that we can compute the length of any buffered line.  */
474       if (line == alloc_lines)
475 	{
476 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
477 	  if (PTRDIFF_MAX / 3 <= alloc_lines
478 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
479 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
480 	    xalloc_die ();
481 	  alloc_lines = 2 * alloc_lines - linbuf_base;
482 	  linbuf += linbuf_base;
483 	  linbuf = xrealloc (linbuf,
484 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
485 	  linbuf -= linbuf_base;
486 	}
487       linbuf[line] = (char const *) p;
488 
489       if ((char const *) p == bufend)
490 	break;
491 
492       if (context <= i && no_diff_means_no_output)
493 	break;
494 
495       line++;
496 
497       while (*p++ != '\n')
498 	continue;
499     }
500 
501   /* Done with cache in local variables.  */
502   current->linbuf = linbuf;
503   current->valid_lines = line;
504   current->alloc_lines = alloc_lines;
505   current->equivs = cureqs;
506   equivs = eqs;
507   equivs_alloc = eqs_alloc;
508   equivs_index = eqs_index;
509 }
510 
511 /* Prepare the text.  Make sure the text end is initialized.
512    Make sure text ends in a newline,
513    but remember that we had to add one.
514    Strip trailing CRs, if that was requested.  */
515 
516 static void
517 prepare_text (struct file_data *current)
518 {
519   size_t buffered = current->buffered;
520   char *p = FILE_BUFFER (current);
521   char *dst;
522 
523   if (buffered == 0 || p[buffered - 1] == '\n')
524     current->missing_newline = 0;
525   else
526     {
527       p[buffered++] = '\n';
528       current->missing_newline = 1;
529     }
530 
531   if (!p)
532     return;
533 
534   /* Don't use uninitialized storage when planting or using sentinels.  */
535   memset (p + buffered, 0, sizeof (word));
536 
537   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
538     {
539       char const *src = dst;
540       char const *srclim = p + buffered;
541 
542       do
543 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
544       while (src < srclim);
545 
546       buffered -= src - dst;
547     }
548 
549   current->buffered = buffered;
550 }
551 
552 /* We have found N lines in a buffer of size S; guess the
553    proportionate number of lines that will be found in a buffer of
554    size T.  However, do not guess a number of lines so large that the
555    resulting line table might cause overflow in size calculations.  */
556 static lin
557 guess_lines (lin n, size_t s, size_t t)
558 {
559   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
560   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
561   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
562 }
563 
564 /* Given a vector of two file_data objects, find the identical
565    prefixes and suffixes of each object.  */
566 
567 static void
568 find_identical_ends (struct file_data filevec[])
569 {
570   word *w0, *w1;
571   char *p0, *p1, *buffer0, *buffer1;
572   char const *end0, *beg0;
573   char const **linbuf0, **linbuf1;
574   lin i, lines;
575   size_t n0, n1;
576   lin alloc_lines0, alloc_lines1;
577   lin buffered_prefix, prefix_count, prefix_mask;
578   lin middle_guess, suffix_guess;
579 
580   slurp (&filevec[0]);
581   prepare_text (&filevec[0]);
582   if (filevec[0].desc != filevec[1].desc)
583     {
584       slurp (&filevec[1]);
585       prepare_text (&filevec[1]);
586     }
587   else
588     {
589       filevec[1].buffer = filevec[0].buffer;
590       filevec[1].bufsize = filevec[0].bufsize;
591       filevec[1].buffered = filevec[0].buffered;
592       filevec[1].missing_newline = filevec[0].missing_newline;
593     }
594 
595   /* Find identical prefix.  */
596 
597   w0 = filevec[0].buffer;
598   w1 = filevec[1].buffer;
599   p0 = buffer0 = (char *) w0;
600   p1 = buffer1 = (char *) w1;
601   n0 = filevec[0].buffered;
602   n1 = filevec[1].buffered;
603 
604   if (p0 == p1)
605     /* The buffers are the same; sentinels won't work.  */
606     p0 = p1 += n1;
607   else
608     {
609       /* Insert end sentinels, in this case characters that are guaranteed
610 	 to make the equality test false, and thus terminate the loop.  */
611 
612       if (n0 < n1)
613 	p0[n0] = ~p1[n0];
614       else
615 	p1[n1] = ~p0[n1];
616 
617       /* Loop until first mismatch, or to the sentinel characters.  */
618 
619       /* Compare a word at a time for speed.  */
620       while (*w0 == *w1)
621 	w0++, w1++;
622 
623       /* Do the last few bytes of comparison a byte at a time.  */
624       p0 = (char *) w0;
625       p1 = (char *) w1;
626       while (*p0 == *p1)
627 	p0++, p1++;
628 
629       /* Don't mistakenly count missing newline as part of prefix.  */
630       if (ROBUST_OUTPUT_STYLE (output_style)
631 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
632 	      !=
633 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
634 	p0--, p1--;
635     }
636 
637   /* Now P0 and P1 point at the first nonmatching characters.  */
638 
639   /* Skip back to last line-beginning in the prefix,
640      and then discard up to HORIZON_LINES lines from the prefix.  */
641   i = horizon_lines;
642   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
643     p0--, p1--;
644 
645   /* Record the prefix.  */
646   filevec[0].prefix_end = p0;
647   filevec[1].prefix_end = p1;
648 
649   /* Find identical suffix.  */
650 
651   /* P0 and P1 point beyond the last chars not yet compared.  */
652   p0 = buffer0 + n0;
653   p1 = buffer1 + n1;
654 
655   if (! ROBUST_OUTPUT_STYLE (output_style)
656       || filevec[0].missing_newline == filevec[1].missing_newline)
657     {
658       end0 = p0;	/* Addr of last char in file 0.  */
659 
660       /* Get value of P0 at which we should stop scanning backward:
661 	 this is when either P0 or P1 points just past the last char
662 	 of the identical prefix.  */
663       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
664 
665       /* Scan back until chars don't match or we reach that point.  */
666       for (; p0 != beg0; p0--, p1--)
667 	if (*p0 != *p1)
668 	  {
669 	    /* Point at the first char of the matching suffix.  */
670 	    beg0 = p0;
671 	    break;
672 	  }
673 
674       /* Are we at a line-beginning in both files?  If not, add the rest of
675 	 this line to the main body.  Discard up to HORIZON_LINES lines from
676 	 the identical suffix.  Also, discard one extra line,
677 	 because shift_boundaries may need it.  */
678       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
679 			    &&
680 			    (buffer1 == p1 || p1[-1] == '\n'));
681       while (i-- && p0 != end0)
682 	while (*p0++ != '\n')
683 	  continue;
684 
685       p1 += p0 - beg0;
686     }
687 
688   /* Record the suffix.  */
689   filevec[0].suffix_begin = p0;
690   filevec[1].suffix_begin = p1;
691 
692   /* Calculate number of lines of prefix to save.
693 
694      prefix_count == 0 means save the whole prefix;
695      we need this for options like -D that output the whole file,
696      or for enormous contexts (to avoid worrying about arithmetic overflow).
697      We also need it for options like -F that output some preceding line;
698      at least we will need to find the last few lines,
699      but since we don't know how many, it's easiest to find them all.
700 
701      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
702      of the line buffer; they'll be moved to the proper location later.
703      Handle 1 more line than the context says (because we count 1 too many),
704      rounded up to the next power of 2 to speed index computation.  */
705 
706   if (no_diff_means_no_output && ! function_regexp.fastmap
707       && context < LIN_MAX / 4 && context < n0)
708     {
709       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
710       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
711       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
712 	continue;
713       alloc_lines0 = (prefix_count + middle_guess
714 		      + MIN (context, suffix_guess));
715     }
716   else
717     {
718       prefix_count = 0;
719       alloc_lines0 = guess_lines (0, 0, n0);
720     }
721 
722   prefix_mask = prefix_count - 1;
723   lines = 0;
724   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
725   p0 = buffer0;
726 
727   /* If the prefix is needed, find the prefix lines.  */
728   if (! (no_diff_means_no_output
729 	 && filevec[0].prefix_end == p0
730 	 && filevec[1].prefix_end == p1))
731     {
732       end0 = filevec[0].prefix_end;
733       while (p0 != end0)
734 	{
735 	  lin l = lines++ & prefix_mask;
736 	  if (l == alloc_lines0)
737 	    {
738 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
739 		xalloc_die ();
740 	      alloc_lines0 *= 2;
741 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
742 	    }
743 	  linbuf0[l] = p0;
744 	  while (*p0++ != '\n')
745 	    continue;
746 	}
747     }
748   buffered_prefix = prefix_count && context < lines ? context : lines;
749 
750   /* Allocate line buffer 1.  */
751 
752   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
753   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
754   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
755   if (alloc_lines1 < buffered_prefix
756       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
757     xalloc_die ();
758   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
759 
760   if (buffered_prefix != lines)
761     {
762       /* Rotate prefix lines to proper location.  */
763       for (i = 0;  i < buffered_prefix;  i++)
764 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
765       for (i = 0;  i < buffered_prefix;  i++)
766 	linbuf0[i] = linbuf1[i];
767     }
768 
769   /* Initialize line buffer 1 from line buffer 0.  */
770   for (i = 0; i < buffered_prefix; i++)
771     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
772 
773   /* Record the line buffer, adjusted so that
774      linbuf[0] points at the first differing line.  */
775   filevec[0].linbuf = linbuf0 + buffered_prefix;
776   filevec[1].linbuf = linbuf1 + buffered_prefix;
777   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
778   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
779   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
780   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
781 }
782 
783 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
784    than 2**k.  This table is derived from Chris K. Caldwell's list
785    <http://www.utm.edu/research/primes/lists/2small/>.  */
786 
787 static unsigned char const prime_offset[] =
788 {
789   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
790   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
791   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
792   55, 93, 1, 57, 25
793 };
794 
795 /* Verify that this host's size_t is not too wide for the above table.  */
796 
797 verify (enough_prime_offsets,
798 	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
799 
800 /* Given a vector of two file_data objects, read the file associated
801    with each one, and build the table of equivalence classes.
802    Return nonzero if either file appears to be a binary file.
803    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
804 
805 bool
806 read_files (struct file_data filevec[], bool pretend_binary)
807 {
808   int i;
809   bool skip_test = text | pretend_binary;
810   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
811 
812   if (filevec[0].desc != filevec[1].desc)
813     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
814   else
815     {
816       filevec[1].buffer = filevec[0].buffer;
817       filevec[1].bufsize = filevec[0].bufsize;
818       filevec[1].buffered = filevec[0].buffered;
819     }
820   if (appears_binary)
821     {
822       set_binary_mode (filevec[0].desc, 1);
823       set_binary_mode (filevec[1].desc, 1);
824       return 1;
825     }
826 
827   find_identical_ends (filevec);
828 
829   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
830   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
831     xalloc_die ();
832   equivs = xmalloc (equivs_alloc * sizeof *equivs);
833   /* Equivalence class 0 is permanently safe for lines that were not
834      hashed.  Real equivalence classes start at 1.  */
835   equivs_index = 1;
836 
837   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
838      number between 1/3 and 2/3 of the value of equiv_allocs,
839      approximately.  */
840   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
841     continue;
842   nbuckets = ((size_t) 1 << i) - prime_offset[i];
843   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
844     xalloc_die ();
845   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
846   buckets++;
847 
848   for (i = 0; i < 2; i++)
849     find_and_hash_each_line (&filevec[i]);
850 
851   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
852 
853   free (equivs);
854   free (buckets - 1);
855 
856   return 0;
857 }
858