xref: /netbsd-src/external/gpl2/xcvs/dist/diff/io.c (revision a7c918477dd5f12c1da816ba05caf44eab2d06d6)
1*a7c91847Schristos /* File I/O for GNU DIFF.
2*a7c91847Schristos    Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
3*a7c91847Schristos 
4*a7c91847Schristos This file is part of GNU DIFF.
5*a7c91847Schristos 
6*a7c91847Schristos GNU DIFF is free software; you can redistribute it and/or modify
7*a7c91847Schristos it under the terms of the GNU General Public License as published by
8*a7c91847Schristos the Free Software Foundation; either version 2, or (at your option)
9*a7c91847Schristos any later version.
10*a7c91847Schristos 
11*a7c91847Schristos GNU DIFF is distributed in the hope that it will be useful,
12*a7c91847Schristos but WITHOUT ANY WARRANTY; without even the implied warranty of
13*a7c91847Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14*a7c91847Schristos GNU General Public License for more details.
15*a7c91847Schristos 
16*a7c91847Schristos */
17*a7c91847Schristos 
18*a7c91847Schristos #include "diff.h"
19*a7c91847Schristos 
20*a7c91847Schristos /* Rotate a value n bits to the left. */
21*a7c91847Schristos #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
22*a7c91847Schristos #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
23*a7c91847Schristos 
24*a7c91847Schristos /* Given a hash value and a new character, return a new hash value. */
25*a7c91847Schristos #define HASH(h, c) ((c) + ROL (h, 7))
26*a7c91847Schristos 
27*a7c91847Schristos /* Guess remaining number of lines from number N of lines so far,
28*a7c91847Schristos    size S so far, and total size T.  */
29*a7c91847Schristos #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
30*a7c91847Schristos 
31*a7c91847Schristos /* Type used for fast prefix comparison in find_identical_ends.  */
32*a7c91847Schristos #ifndef word
33*a7c91847Schristos #define word int
34*a7c91847Schristos #endif
35*a7c91847Schristos 
36*a7c91847Schristos /* Lines are put into equivalence classes (of lines that match in line_cmp).
37*a7c91847Schristos    Each equivalence class is represented by one of these structures,
38*a7c91847Schristos    but only while the classes are being computed.
39*a7c91847Schristos    Afterward, each class is represented by a number.  */
40*a7c91847Schristos struct equivclass
41*a7c91847Schristos {
42*a7c91847Schristos   int next;	/* Next item in this bucket. */
43*a7c91847Schristos   unsigned hash;	/* Hash of lines in this class.  */
44*a7c91847Schristos   char const *line;	/* A line that fits this class. */
45*a7c91847Schristos   size_t length;	/* That line's length, not counting its newline.  */
46*a7c91847Schristos };
47*a7c91847Schristos 
48*a7c91847Schristos /* Hash-table: array of buckets, each being a chain of equivalence classes.
49*a7c91847Schristos    buckets[-1] is reserved for incomplete lines.  */
50*a7c91847Schristos static int *buckets;
51*a7c91847Schristos 
52*a7c91847Schristos /* Number of buckets in the hash table array, not counting buckets[-1]. */
53*a7c91847Schristos static int nbuckets;
54*a7c91847Schristos 
55*a7c91847Schristos /* Array in which the equivalence classes are allocated.
56*a7c91847Schristos    The bucket-chains go through the elements in this array.
57*a7c91847Schristos    The number of an equivalence class is its index in this array.  */
58*a7c91847Schristos static struct equivclass *equivs;
59*a7c91847Schristos 
60*a7c91847Schristos /* Index of first free element in the array `equivs'.  */
61*a7c91847Schristos static int equivs_index;
62*a7c91847Schristos 
63*a7c91847Schristos /* Number of elements allocated in the array `equivs'.  */
64*a7c91847Schristos static int equivs_alloc;
65*a7c91847Schristos 
66*a7c91847Schristos static void find_and_hash_each_line PARAMS((struct file_data *));
67*a7c91847Schristos static void find_identical_ends PARAMS((struct file_data[]));
68*a7c91847Schristos static void prepare_text_end PARAMS((struct file_data *));
69*a7c91847Schristos 
70*a7c91847Schristos /* Check for binary files and compare them for exact identity.  */
71*a7c91847Schristos 
72*a7c91847Schristos /* Return 1 if BUF contains a non text character.
73*a7c91847Schristos    SIZE is the number of characters in BUF.  */
74*a7c91847Schristos 
75*a7c91847Schristos #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
76*a7c91847Schristos 
77*a7c91847Schristos /* Get ready to read the current file.
78*a7c91847Schristos    Return nonzero if SKIP_TEST is zero,
79*a7c91847Schristos    and if it appears to be a binary file.  */
80*a7c91847Schristos 
81*a7c91847Schristos int
sip(current,skip_test)82*a7c91847Schristos sip (current, skip_test)
83*a7c91847Schristos      struct file_data *current;
84*a7c91847Schristos      int skip_test;
85*a7c91847Schristos {
86*a7c91847Schristos   /* If we have a nonexistent file at this stage, treat it as empty.  */
87*a7c91847Schristos   if (current->desc < 0)
88*a7c91847Schristos     {
89*a7c91847Schristos       /* Leave room for a sentinel.  */
90*a7c91847Schristos       current->bufsize = sizeof (word);
91*a7c91847Schristos       current->buffer = xmalloc (current->bufsize);
92*a7c91847Schristos     }
93*a7c91847Schristos   else
94*a7c91847Schristos     {
95*a7c91847Schristos       current->bufsize = STAT_BLOCKSIZE (current->stat);
96*a7c91847Schristos       current->buffer = xmalloc (current->bufsize);
97*a7c91847Schristos 
98*a7c91847Schristos       if (! skip_test)
99*a7c91847Schristos 	{
100*a7c91847Schristos 	  /* Check first part of file to see if it's a binary file.  */
101*a7c91847Schristos #if HAVE_SETMODE
102*a7c91847Schristos 	  int oldmode = setmode (current->desc, O_BINARY);
103*a7c91847Schristos #endif
104*a7c91847Schristos 	  ssize_t n = read (current->desc, current->buffer, current->bufsize);
105*a7c91847Schristos 	  if (n == -1)
106*a7c91847Schristos 	    pfatal_with_name (current->name);
107*a7c91847Schristos 	  current->buffered_chars = n;
108*a7c91847Schristos #if HAVE_SETMODE
109*a7c91847Schristos 	  if (oldmode != O_BINARY)
110*a7c91847Schristos 	    {
111*a7c91847Schristos 	      if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
112*a7c91847Schristos 		pfatal_with_name (current->name);
113*a7c91847Schristos 	      setmode (current->desc, oldmode);
114*a7c91847Schristos 	      current->buffered_chars = 0;
115*a7c91847Schristos 	    }
116*a7c91847Schristos #endif
117*a7c91847Schristos 	  return binary_file_p (current->buffer, n);
118*a7c91847Schristos 	}
119*a7c91847Schristos     }
120*a7c91847Schristos 
121*a7c91847Schristos   current->buffered_chars = 0;
122*a7c91847Schristos   return 0;
123*a7c91847Schristos }
124*a7c91847Schristos 
125*a7c91847Schristos /* Slurp the rest of the current file completely into memory.  */
126*a7c91847Schristos 
127*a7c91847Schristos void
slurp(current)128*a7c91847Schristos slurp (current)
129*a7c91847Schristos      struct file_data *current;
130*a7c91847Schristos {
131*a7c91847Schristos   ssize_t cc;
132*a7c91847Schristos 
133*a7c91847Schristos   if (current->desc < 0)
134*a7c91847Schristos     /* The file is nonexistent.  */
135*a7c91847Schristos     ;
136*a7c91847Schristos   else if (S_ISREG (current->stat.st_mode))
137*a7c91847Schristos     {
138*a7c91847Schristos       /* It's a regular file; slurp in the rest all at once.  */
139*a7c91847Schristos 
140*a7c91847Schristos       /* Get the size out of the stat block.
141*a7c91847Schristos 	 Allocate enough room for appended newline and sentinel.  */
142*a7c91847Schristos       cc = current->stat.st_size + 1 + sizeof (word);
143*a7c91847Schristos       if (current->bufsize < cc)
144*a7c91847Schristos 	{
145*a7c91847Schristos 	  current->bufsize = cc;
146*a7c91847Schristos 	  current->buffer = xrealloc (current->buffer, cc);
147*a7c91847Schristos 	}
148*a7c91847Schristos 
149*a7c91847Schristos       if (current->buffered_chars < current->stat.st_size)
150*a7c91847Schristos 	{
151*a7c91847Schristos 	  cc = read (current->desc,
152*a7c91847Schristos 		     current->buffer + current->buffered_chars,
153*a7c91847Schristos 		     current->stat.st_size - current->buffered_chars);
154*a7c91847Schristos 	  if (cc == -1)
155*a7c91847Schristos 	    pfatal_with_name (current->name);
156*a7c91847Schristos 	  current->buffered_chars += cc;
157*a7c91847Schristos 	}
158*a7c91847Schristos     }
159*a7c91847Schristos   /* It's not a regular file; read it, growing the buffer as needed.  */
160*a7c91847Schristos   else if (always_text_flag || current->buffered_chars != 0)
161*a7c91847Schristos     {
162*a7c91847Schristos       for (;;)
163*a7c91847Schristos 	{
164*a7c91847Schristos 	  if (current->buffered_chars == current->bufsize)
165*a7c91847Schristos 	    {
166*a7c91847Schristos 	      current->bufsize = current->bufsize * 2;
167*a7c91847Schristos 	      current->buffer = xrealloc (current->buffer, current->bufsize);
168*a7c91847Schristos 	    }
169*a7c91847Schristos 	  cc = read (current->desc,
170*a7c91847Schristos 		     current->buffer + current->buffered_chars,
171*a7c91847Schristos 		     current->bufsize - current->buffered_chars);
172*a7c91847Schristos 	  if (cc == 0)
173*a7c91847Schristos 	    break;
174*a7c91847Schristos 	  if (cc == -1)
175*a7c91847Schristos 	    pfatal_with_name (current->name);
176*a7c91847Schristos 	  current->buffered_chars += cc;
177*a7c91847Schristos 	}
178*a7c91847Schristos       /* Allocate just enough room for appended newline and sentinel.  */
179*a7c91847Schristos       current->bufsize = current->buffered_chars + 1 + sizeof (word);
180*a7c91847Schristos       current->buffer = xrealloc (current->buffer, current->bufsize);
181*a7c91847Schristos     }
182*a7c91847Schristos }
183*a7c91847Schristos 
184*a7c91847Schristos /* Split the file into lines, simultaneously computing the equivalence class for
185*a7c91847Schristos    each line. */
186*a7c91847Schristos 
187*a7c91847Schristos static void
find_and_hash_each_line(current)188*a7c91847Schristos find_and_hash_each_line (current)
189*a7c91847Schristos      struct file_data *current;
190*a7c91847Schristos {
191*a7c91847Schristos   unsigned h;
192*a7c91847Schristos   unsigned char const *p = (unsigned char const *) current->prefix_end;
193*a7c91847Schristos   unsigned char c;
194*a7c91847Schristos   int i, *bucket;
195*a7c91847Schristos   size_t length;
196*a7c91847Schristos 
197*a7c91847Schristos   /* Cache often-used quantities in local variables to help the compiler.  */
198*a7c91847Schristos   char const **linbuf = current->linbuf;
199*a7c91847Schristos   int alloc_lines = current->alloc_lines;
200*a7c91847Schristos   int line = 0;
201*a7c91847Schristos   int linbuf_base = current->linbuf_base;
202*a7c91847Schristos   int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
203*a7c91847Schristos   struct equivclass *eqs = equivs;
204*a7c91847Schristos   int eqs_index = equivs_index;
205*a7c91847Schristos   int eqs_alloc = equivs_alloc;
206*a7c91847Schristos   char const *suffix_begin = current->suffix_begin;
207*a7c91847Schristos   char const *bufend = current->buffer + current->buffered_chars;
208*a7c91847Schristos   int use_line_cmp = ignore_some_line_changes;
209*a7c91847Schristos 
210*a7c91847Schristos   while ((char const *) p < suffix_begin)
211*a7c91847Schristos     {
212*a7c91847Schristos       char const *ip = (char const *) p;
213*a7c91847Schristos 
214*a7c91847Schristos       /* Compute the equivalence class for this line.  */
215*a7c91847Schristos 
216*a7c91847Schristos       h = 0;
217*a7c91847Schristos 
218*a7c91847Schristos       /* Hash this line until we find a newline. */
219*a7c91847Schristos       if (ignore_case_flag)
220*a7c91847Schristos 	{
221*a7c91847Schristos 	  if (ignore_all_space_flag)
222*a7c91847Schristos 	    while ((c = *p++) != '\n')
223*a7c91847Schristos 	      {
224*a7c91847Schristos 		if (! ISSPACE (c))
225*a7c91847Schristos 		  h = HASH (h, ISUPPER (c) ? tolower (c) : c);
226*a7c91847Schristos 	      }
227*a7c91847Schristos 	  else if (ignore_space_change_flag)
228*a7c91847Schristos 	    while ((c = *p++) != '\n')
229*a7c91847Schristos 	      {
230*a7c91847Schristos 		if (ISSPACE (c))
231*a7c91847Schristos 		  {
232*a7c91847Schristos 		    for (;;)
233*a7c91847Schristos 		      {
234*a7c91847Schristos 			c = *p++;
235*a7c91847Schristos 			if (!ISSPACE (c))
236*a7c91847Schristos 			  break;
237*a7c91847Schristos 			if (c == '\n')
238*a7c91847Schristos 			  goto hashing_done;
239*a7c91847Schristos 		      }
240*a7c91847Schristos 		    h = HASH (h, ' ');
241*a7c91847Schristos 		  }
242*a7c91847Schristos 		/* C is now the first non-space.  */
243*a7c91847Schristos 		h = HASH (h, ISUPPER (c) ? tolower (c) : c);
244*a7c91847Schristos 	      }
245*a7c91847Schristos 	  else
246*a7c91847Schristos 	    while ((c = *p++) != '\n')
247*a7c91847Schristos 	      h = HASH (h, ISUPPER (c) ? tolower (c) : c);
248*a7c91847Schristos 	}
249*a7c91847Schristos       else
250*a7c91847Schristos 	{
251*a7c91847Schristos 	  if (ignore_all_space_flag)
252*a7c91847Schristos 	    while ((c = *p++) != '\n')
253*a7c91847Schristos 	      {
254*a7c91847Schristos 		if (! ISSPACE (c))
255*a7c91847Schristos 		  h = HASH (h, c);
256*a7c91847Schristos 	      }
257*a7c91847Schristos 	  else if (ignore_space_change_flag)
258*a7c91847Schristos 	    while ((c = *p++) != '\n')
259*a7c91847Schristos 	      {
260*a7c91847Schristos 		if (ISSPACE (c))
261*a7c91847Schristos 		  {
262*a7c91847Schristos 		    for (;;)
263*a7c91847Schristos 		      {
264*a7c91847Schristos 			c = *p++;
265*a7c91847Schristos 			if (!ISSPACE (c))
266*a7c91847Schristos 			  break;
267*a7c91847Schristos 			if (c == '\n')
268*a7c91847Schristos 			  goto hashing_done;
269*a7c91847Schristos 		      }
270*a7c91847Schristos 		    h = HASH (h, ' ');
271*a7c91847Schristos 		  }
272*a7c91847Schristos 		/* C is now the first non-space.  */
273*a7c91847Schristos 		h = HASH (h, c);
274*a7c91847Schristos 	      }
275*a7c91847Schristos 	  else
276*a7c91847Schristos 	    while ((c = *p++) != '\n')
277*a7c91847Schristos 	      h = HASH (h, c);
278*a7c91847Schristos 	}
279*a7c91847Schristos    hashing_done:;
280*a7c91847Schristos 
281*a7c91847Schristos       bucket = &buckets[h % nbuckets];
282*a7c91847Schristos       length = (char const *) p - ip - 1;
283*a7c91847Schristos 
284*a7c91847Schristos       if ((char const *) p == bufend
285*a7c91847Schristos 	  && current->missing_newline
286*a7c91847Schristos 	  && ROBUST_OUTPUT_STYLE (output_style))
287*a7c91847Schristos 	{
288*a7c91847Schristos 	  /* This line is incomplete.  If this is significant,
289*a7c91847Schristos 	     put the line into bucket[-1].  */
290*a7c91847Schristos 	  if (! (ignore_space_change_flag | ignore_all_space_flag))
291*a7c91847Schristos 	    bucket = &buckets[-1];
292*a7c91847Schristos 
293*a7c91847Schristos 	  /* Omit the inserted newline when computing linbuf later.  */
294*a7c91847Schristos 	  p--;
295*a7c91847Schristos 	  bufend = suffix_begin = (char const *) p;
296*a7c91847Schristos 	}
297*a7c91847Schristos 
298*a7c91847Schristos       for (i = *bucket;  ;  i = eqs[i].next)
299*a7c91847Schristos 	if (!i)
300*a7c91847Schristos 	  {
301*a7c91847Schristos 	    /* Create a new equivalence class in this bucket. */
302*a7c91847Schristos 	    i = eqs_index++;
303*a7c91847Schristos 	    if (i == eqs_alloc)
304*a7c91847Schristos 	      eqs = (struct equivclass *)
305*a7c91847Schristos 		      xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
306*a7c91847Schristos 	    eqs[i].next = *bucket;
307*a7c91847Schristos 	    eqs[i].hash = h;
308*a7c91847Schristos 	    eqs[i].line = ip;
309*a7c91847Schristos 	    eqs[i].length = length;
310*a7c91847Schristos 	    *bucket = i;
311*a7c91847Schristos 	    break;
312*a7c91847Schristos 	  }
313*a7c91847Schristos 	else if (eqs[i].hash == h)
314*a7c91847Schristos 	  {
315*a7c91847Schristos 	    char const *eqline = eqs[i].line;
316*a7c91847Schristos 
317*a7c91847Schristos 	    /* Reuse existing equivalence class if the lines are identical.
318*a7c91847Schristos 	       This detects the common case of exact identity
319*a7c91847Schristos 	       faster than complete comparison would.  */
320*a7c91847Schristos 	    if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
321*a7c91847Schristos 	      break;
322*a7c91847Schristos 
323*a7c91847Schristos 	    /* Reuse existing class if line_cmp reports the lines equal.  */
324*a7c91847Schristos 	    if (use_line_cmp && line_cmp (eqline, ip) == 0)
325*a7c91847Schristos 	      break;
326*a7c91847Schristos 	  }
327*a7c91847Schristos 
328*a7c91847Schristos       /* Maybe increase the size of the line table. */
329*a7c91847Schristos       if (line == alloc_lines)
330*a7c91847Schristos 	{
331*a7c91847Schristos 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
332*a7c91847Schristos 	  alloc_lines = 2 * alloc_lines - linbuf_base;
333*a7c91847Schristos 	  cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
334*a7c91847Schristos 	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
335*a7c91847Schristos 					     (alloc_lines - linbuf_base)
336*a7c91847Schristos 					     * sizeof (*linbuf))
337*a7c91847Schristos 		   - linbuf_base;
338*a7c91847Schristos 	}
339*a7c91847Schristos       linbuf[line] = ip;
340*a7c91847Schristos       cureqs[line] = i;
341*a7c91847Schristos       ++line;
342*a7c91847Schristos     }
343*a7c91847Schristos 
344*a7c91847Schristos   current->buffered_lines = line;
345*a7c91847Schristos 
346*a7c91847Schristos   for (i = 0;  ;  i++)
347*a7c91847Schristos     {
348*a7c91847Schristos       /* Record the line start for lines in the suffix that we care about.
349*a7c91847Schristos 	 Record one more line start than lines,
350*a7c91847Schristos 	 so that we can compute the length of any buffered line.  */
351*a7c91847Schristos       if (line == alloc_lines)
352*a7c91847Schristos 	{
353*a7c91847Schristos 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
354*a7c91847Schristos 	  alloc_lines = 2 * alloc_lines - linbuf_base;
355*a7c91847Schristos 	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
356*a7c91847Schristos 					     (alloc_lines - linbuf_base)
357*a7c91847Schristos 					     * sizeof (*linbuf))
358*a7c91847Schristos 		   - linbuf_base;
359*a7c91847Schristos 	}
360*a7c91847Schristos       linbuf[line] = (char const *) p;
361*a7c91847Schristos 
362*a7c91847Schristos       if ((char const *) p == bufend)
363*a7c91847Schristos 	break;
364*a7c91847Schristos 
365*a7c91847Schristos       if (context <= i && no_diff_means_no_output)
366*a7c91847Schristos 	break;
367*a7c91847Schristos 
368*a7c91847Schristos       line++;
369*a7c91847Schristos 
370*a7c91847Schristos       while (*p++ != '\n')
371*a7c91847Schristos 	;
372*a7c91847Schristos     }
373*a7c91847Schristos 
374*a7c91847Schristos   /* Done with cache in local variables.  */
375*a7c91847Schristos   current->linbuf = linbuf;
376*a7c91847Schristos   current->valid_lines = line;
377*a7c91847Schristos   current->alloc_lines = alloc_lines;
378*a7c91847Schristos   current->equivs = cureqs;
379*a7c91847Schristos   equivs = eqs;
380*a7c91847Schristos   equivs_alloc = eqs_alloc;
381*a7c91847Schristos   equivs_index = eqs_index;
382*a7c91847Schristos }
383*a7c91847Schristos 
384*a7c91847Schristos /* Prepare the end of the text.  Make sure it's initialized.
385*a7c91847Schristos    Make sure text ends in a newline,
386*a7c91847Schristos    but remember that we had to add one.  */
387*a7c91847Schristos 
388*a7c91847Schristos static void
prepare_text_end(current)389*a7c91847Schristos prepare_text_end (current)
390*a7c91847Schristos      struct file_data *current;
391*a7c91847Schristos {
392*a7c91847Schristos   size_t buffered_chars = current->buffered_chars;
393*a7c91847Schristos   char *p = current->buffer;
394*a7c91847Schristos 
395*a7c91847Schristos   if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
396*a7c91847Schristos     current->missing_newline = 0;
397*a7c91847Schristos   else
398*a7c91847Schristos     {
399*a7c91847Schristos       p[buffered_chars++] = '\n';
400*a7c91847Schristos       current->buffered_chars = buffered_chars;
401*a7c91847Schristos       current->missing_newline = 1;
402*a7c91847Schristos     }
403*a7c91847Schristos 
404*a7c91847Schristos   /* Don't use uninitialized storage when planting or using sentinels.  */
405*a7c91847Schristos   if (p)
406*a7c91847Schristos     bzero (p + buffered_chars, sizeof (word));
407*a7c91847Schristos }
408*a7c91847Schristos 
409*a7c91847Schristos /* Given a vector of two file_data objects, find the identical
410*a7c91847Schristos    prefixes and suffixes of each object. */
411*a7c91847Schristos 
412*a7c91847Schristos static void
find_identical_ends(filevec)413*a7c91847Schristos find_identical_ends (filevec)
414*a7c91847Schristos      struct file_data filevec[];
415*a7c91847Schristos {
416*a7c91847Schristos   word *w0, *w1;
417*a7c91847Schristos   char *p0, *p1, *buffer0, *buffer1;
418*a7c91847Schristos   char const *end0, *beg0;
419*a7c91847Schristos   char const **linbuf0, **linbuf1;
420*a7c91847Schristos   int i, lines;
421*a7c91847Schristos   size_t n0, n1, tem;
422*a7c91847Schristos   int alloc_lines0, alloc_lines1;
423*a7c91847Schristos   int buffered_prefix, prefix_count, prefix_mask;
424*a7c91847Schristos 
425*a7c91847Schristos   slurp (&filevec[0]);
426*a7c91847Schristos   if (filevec[0].desc != filevec[1].desc)
427*a7c91847Schristos     slurp (&filevec[1]);
428*a7c91847Schristos   else
429*a7c91847Schristos     {
430*a7c91847Schristos       filevec[1].buffer = filevec[0].buffer;
431*a7c91847Schristos       filevec[1].bufsize = filevec[0].bufsize;
432*a7c91847Schristos       filevec[1].buffered_chars = filevec[0].buffered_chars;
433*a7c91847Schristos     }
434*a7c91847Schristos   for (i = 0; i < 2; i++)
435*a7c91847Schristos     prepare_text_end (&filevec[i]);
436*a7c91847Schristos 
437*a7c91847Schristos   /* Find identical prefix.  */
438*a7c91847Schristos 
439*a7c91847Schristos   p0 = buffer0 = filevec[0].buffer;
440*a7c91847Schristos   p1 = buffer1 = filevec[1].buffer;
441*a7c91847Schristos 
442*a7c91847Schristos   n0 = filevec[0].buffered_chars;
443*a7c91847Schristos   n1 = filevec[1].buffered_chars;
444*a7c91847Schristos 
445*a7c91847Schristos   if (p0 == p1)
446*a7c91847Schristos     /* The buffers are the same; sentinels won't work.  */
447*a7c91847Schristos     p0 = p1 += n1;
448*a7c91847Schristos   else
449*a7c91847Schristos     {
450*a7c91847Schristos       /* Insert end sentinels, in this case characters that are guaranteed
451*a7c91847Schristos 	 to make the equality test false, and thus terminate the loop.  */
452*a7c91847Schristos 
453*a7c91847Schristos       if (n0 < n1)
454*a7c91847Schristos 	p0[n0] = ~p1[n0];
455*a7c91847Schristos       else
456*a7c91847Schristos 	p1[n1] = ~p0[n1];
457*a7c91847Schristos 
458*a7c91847Schristos       /* Loop until first mismatch, or to the sentinel characters.  */
459*a7c91847Schristos 
460*a7c91847Schristos       /* Compare a word at a time for speed.  */
461*a7c91847Schristos       w0 = (word *) p0;
462*a7c91847Schristos       w1 = (word *) p1;
463*a7c91847Schristos       while (*w0++ == *w1++)
464*a7c91847Schristos 	;
465*a7c91847Schristos       --w0, --w1;
466*a7c91847Schristos 
467*a7c91847Schristos       /* Do the last few bytes of comparison a byte at a time.  */
468*a7c91847Schristos       p0 = (char *) w0;
469*a7c91847Schristos       p1 = (char *) w1;
470*a7c91847Schristos       while (*p0++ == *p1++)
471*a7c91847Schristos 	;
472*a7c91847Schristos       --p0, --p1;
473*a7c91847Schristos 
474*a7c91847Schristos       /* Don't mistakenly count missing newline as part of prefix. */
475*a7c91847Schristos       if (ROBUST_OUTPUT_STYLE (output_style)
476*a7c91847Schristos 	  && (buffer0 + n0 - filevec[0].missing_newline < p0)
477*a7c91847Schristos 	     !=
478*a7c91847Schristos 	     (buffer1 + n1 - filevec[1].missing_newline < p1))
479*a7c91847Schristos 	--p0, --p1;
480*a7c91847Schristos     }
481*a7c91847Schristos 
482*a7c91847Schristos   /* Now P0 and P1 point at the first nonmatching characters.  */
483*a7c91847Schristos 
484*a7c91847Schristos   /* Skip back to last line-beginning in the prefix,
485*a7c91847Schristos      and then discard up to HORIZON_LINES lines from the prefix.  */
486*a7c91847Schristos   i = horizon_lines;
487*a7c91847Schristos   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
488*a7c91847Schristos     --p0, --p1;
489*a7c91847Schristos 
490*a7c91847Schristos   /* Record the prefix.  */
491*a7c91847Schristos   filevec[0].prefix_end = p0;
492*a7c91847Schristos   filevec[1].prefix_end = p1;
493*a7c91847Schristos 
494*a7c91847Schristos   /* Find identical suffix.  */
495*a7c91847Schristos 
496*a7c91847Schristos   /* P0 and P1 point beyond the last chars not yet compared.  */
497*a7c91847Schristos   p0 = buffer0 + n0;
498*a7c91847Schristos   p1 = buffer1 + n1;
499*a7c91847Schristos 
500*a7c91847Schristos   if (! ROBUST_OUTPUT_STYLE (output_style)
501*a7c91847Schristos       || filevec[0].missing_newline == filevec[1].missing_newline)
502*a7c91847Schristos     {
503*a7c91847Schristos       end0 = p0;	/* Addr of last char in file 0.  */
504*a7c91847Schristos 
505*a7c91847Schristos       /* Get value of P0 at which we should stop scanning backward:
506*a7c91847Schristos 	 this is when either P0 or P1 points just past the last char
507*a7c91847Schristos 	 of the identical prefix.  */
508*a7c91847Schristos       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
509*a7c91847Schristos 
510*a7c91847Schristos       /* Scan back until chars don't match or we reach that point.  */
511*a7c91847Schristos       for (; p0 != beg0; p0--, p1--)
512*a7c91847Schristos 	if (*p0 != *p1)
513*a7c91847Schristos 	  {
514*a7c91847Schristos 	    /* Point at the first char of the matching suffix.  */
515*a7c91847Schristos 	    beg0 = p0;
516*a7c91847Schristos 	    break;
517*a7c91847Schristos 	  }
518*a7c91847Schristos 
519*a7c91847Schristos       /* Are we at a line-beginning in both files?  If not, add the rest of
520*a7c91847Schristos 	 this line to the main body.  Discard up to HORIZON_LINES lines from
521*a7c91847Schristos 	 the identical suffix.  Also, discard one extra line,
522*a7c91847Schristos 	 because shift_boundaries may need it.  */
523*a7c91847Schristos       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
524*a7c91847Schristos 			    &&
525*a7c91847Schristos 			    (buffer1 == p1 || p1[-1] == '\n'));
526*a7c91847Schristos       while (i-- && p0 != end0)
527*a7c91847Schristos 	while (*p0++ != '\n')
528*a7c91847Schristos 	  ;
529*a7c91847Schristos 
530*a7c91847Schristos       p1 += p0 - beg0;
531*a7c91847Schristos     }
532*a7c91847Schristos 
533*a7c91847Schristos   /* Record the suffix.  */
534*a7c91847Schristos   filevec[0].suffix_begin = p0;
535*a7c91847Schristos   filevec[1].suffix_begin = p1;
536*a7c91847Schristos 
537*a7c91847Schristos   /* Calculate number of lines of prefix to save.
538*a7c91847Schristos 
539*a7c91847Schristos      prefix_count == 0 means save the whole prefix;
540*a7c91847Schristos      we need this with for options like -D that output the whole file.
541*a7c91847Schristos      We also need it for options like -F that output some preceding line;
542*a7c91847Schristos      at least we will need to find the last few lines,
543*a7c91847Schristos      but since we don't know how many, it's easiest to find them all.
544*a7c91847Schristos 
545*a7c91847Schristos      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
546*a7c91847Schristos      of the line buffer; they'll be moved to the proper location later.
547*a7c91847Schristos      Handle 1 more line than the context says (because we count 1 too many),
548*a7c91847Schristos      rounded up to the next power of 2 to speed index computation.  */
549*a7c91847Schristos 
550*a7c91847Schristos   if (no_diff_means_no_output && ! function_regexp_list)
551*a7c91847Schristos     {
552*a7c91847Schristos       for (prefix_count = 1;  prefix_count < context + 1;  prefix_count *= 2)
553*a7c91847Schristos 	;
554*a7c91847Schristos       prefix_mask = prefix_count - 1;
555*a7c91847Schristos       alloc_lines0
556*a7c91847Schristos 	= prefix_count
557*a7c91847Schristos 	  + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
558*a7c91847Schristos 	  + context;
559*a7c91847Schristos     }
560*a7c91847Schristos   else
561*a7c91847Schristos     {
562*a7c91847Schristos       prefix_count = 0;
563*a7c91847Schristos       prefix_mask = ~0;
564*a7c91847Schristos       alloc_lines0 = GUESS_LINES (0, 0, n0);
565*a7c91847Schristos     }
566*a7c91847Schristos 
567*a7c91847Schristos   lines = 0;
568*a7c91847Schristos   linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
569*a7c91847Schristos 
570*a7c91847Schristos   /* If the prefix is needed, find the prefix lines.  */
571*a7c91847Schristos   if (! (no_diff_means_no_output
572*a7c91847Schristos 	 && filevec[0].prefix_end == p0
573*a7c91847Schristos 	 && filevec[1].prefix_end == p1))
574*a7c91847Schristos     {
575*a7c91847Schristos       p0 = buffer0;
576*a7c91847Schristos       end0 = filevec[0].prefix_end;
577*a7c91847Schristos       while (p0 != end0)
578*a7c91847Schristos 	{
579*a7c91847Schristos 	  int l = lines++ & prefix_mask;
580*a7c91847Schristos 	  if (l == alloc_lines0)
581*a7c91847Schristos 	    linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
582*a7c91847Schristos 							 * sizeof(*linbuf0));
583*a7c91847Schristos 	  linbuf0[l] = p0;
584*a7c91847Schristos 	  while (*p0++ != '\n')
585*a7c91847Schristos 	    ;
586*a7c91847Schristos 	}
587*a7c91847Schristos     }
588*a7c91847Schristos   buffered_prefix = prefix_count && context < lines ? context : lines;
589*a7c91847Schristos 
590*a7c91847Schristos   /* Allocate line buffer 1.  */
591*a7c91847Schristos   tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
592*a7c91847Schristos 
593*a7c91847Schristos   alloc_lines1
594*a7c91847Schristos     = (buffered_prefix
595*a7c91847Schristos        + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
596*a7c91847Schristos        + context);
597*a7c91847Schristos   linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
598*a7c91847Schristos 
599*a7c91847Schristos   if (buffered_prefix != lines)
600*a7c91847Schristos     {
601*a7c91847Schristos       /* Rotate prefix lines to proper location.  */
602*a7c91847Schristos       for (i = 0;  i < buffered_prefix;  i++)
603*a7c91847Schristos 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
604*a7c91847Schristos       for (i = 0;  i < buffered_prefix;  i++)
605*a7c91847Schristos 	linbuf0[i] = linbuf1[i];
606*a7c91847Schristos     }
607*a7c91847Schristos 
608*a7c91847Schristos   /* Initialize line buffer 1 from line buffer 0.  */
609*a7c91847Schristos   for (i = 0; i < buffered_prefix; i++)
610*a7c91847Schristos     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
611*a7c91847Schristos 
612*a7c91847Schristos   /* Record the line buffer, adjusted so that
613*a7c91847Schristos      linbuf*[0] points at the first differing line.  */
614*a7c91847Schristos   filevec[0].linbuf = linbuf0 + buffered_prefix;
615*a7c91847Schristos   filevec[1].linbuf = linbuf1 + buffered_prefix;
616*a7c91847Schristos   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
617*a7c91847Schristos   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
618*a7c91847Schristos   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
619*a7c91847Schristos   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
620*a7c91847Schristos }
621*a7c91847Schristos 
622*a7c91847Schristos /* Largest primes less than some power of two, for nbuckets.  Values range
623*a7c91847Schristos    from useful to preposterous.  If one of these numbers isn't prime
624*a7c91847Schristos    after all, don't blame it on me, blame it on primes (6) . . . */
625*a7c91847Schristos static int const primes[] =
626*a7c91847Schristos {
627*a7c91847Schristos   509,
628*a7c91847Schristos   1021,
629*a7c91847Schristos   2039,
630*a7c91847Schristos   4093,
631*a7c91847Schristos   8191,
632*a7c91847Schristos   16381,
633*a7c91847Schristos   32749,
634*a7c91847Schristos #if 32767 < INT_MAX
635*a7c91847Schristos   65521,
636*a7c91847Schristos   131071,
637*a7c91847Schristos   262139,
638*a7c91847Schristos   524287,
639*a7c91847Schristos   1048573,
640*a7c91847Schristos   2097143,
641*a7c91847Schristos   4194301,
642*a7c91847Schristos   8388593,
643*a7c91847Schristos   16777213,
644*a7c91847Schristos   33554393,
645*a7c91847Schristos   67108859,			/* Preposterously large . . . */
646*a7c91847Schristos   134217689,
647*a7c91847Schristos   268435399,
648*a7c91847Schristos   536870909,
649*a7c91847Schristos   1073741789,
650*a7c91847Schristos   2147483647,
651*a7c91847Schristos #endif
652*a7c91847Schristos   0
653*a7c91847Schristos };
654*a7c91847Schristos 
655*a7c91847Schristos /* Given a vector of two file_data objects, read the file associated
656*a7c91847Schristos    with each one, and build the table of equivalence classes.
657*a7c91847Schristos    Return 1 if either file appears to be a binary file.
658*a7c91847Schristos    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
659*a7c91847Schristos 
660*a7c91847Schristos int
read_files(filevec,pretend_binary)661*a7c91847Schristos read_files (filevec, pretend_binary)
662*a7c91847Schristos      struct file_data filevec[];
663*a7c91847Schristos      int pretend_binary;
664*a7c91847Schristos {
665*a7c91847Schristos   int i;
666*a7c91847Schristos   int skip_test = always_text_flag | pretend_binary;
667*a7c91847Schristos   int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
668*a7c91847Schristos 
669*a7c91847Schristos   if (filevec[0].desc != filevec[1].desc)
670*a7c91847Schristos     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
671*a7c91847Schristos   else
672*a7c91847Schristos     {
673*a7c91847Schristos       filevec[1].buffer = filevec[0].buffer;
674*a7c91847Schristos       filevec[1].bufsize = filevec[0].bufsize;
675*a7c91847Schristos       filevec[1].buffered_chars = filevec[0].buffered_chars;
676*a7c91847Schristos     }
677*a7c91847Schristos   if (appears_binary)
678*a7c91847Schristos     {
679*a7c91847Schristos #if HAVE_SETMODE
680*a7c91847Schristos       setmode (filevec[0].desc, O_BINARY);
681*a7c91847Schristos       setmode (filevec[1].desc, O_BINARY);
682*a7c91847Schristos #endif
683*a7c91847Schristos       return 1;
684*a7c91847Schristos     }
685*a7c91847Schristos 
686*a7c91847Schristos   find_identical_ends (filevec);
687*a7c91847Schristos 
688*a7c91847Schristos   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
689*a7c91847Schristos   equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
690*a7c91847Schristos   /* Equivalence class 0 is permanently safe for lines that were not
691*a7c91847Schristos      hashed.  Real equivalence classes start at 1. */
692*a7c91847Schristos   equivs_index = 1;
693*a7c91847Schristos 
694*a7c91847Schristos   for (i = 0;  primes[i] < equivs_alloc / 3;  i++)
695*a7c91847Schristos     if (! primes[i])
696*a7c91847Schristos       abort ();
697*a7c91847Schristos   nbuckets = primes[i];
698*a7c91847Schristos 
699*a7c91847Schristos   buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
700*a7c91847Schristos   bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
701*a7c91847Schristos 
702*a7c91847Schristos   for (i = 0; i < 2; i++)
703*a7c91847Schristos     find_and_hash_each_line (&filevec[i]);
704*a7c91847Schristos 
705*a7c91847Schristos   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
706*a7c91847Schristos 
707*a7c91847Schristos   free (equivs);
708*a7c91847Schristos   free (buckets - 1);
709*a7c91847Schristos 
710*a7c91847Schristos   return 0;
711*a7c91847Schristos }
712