1*a7c91847Schristos /* File I/O for GNU DIFF.
2*a7c91847Schristos Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
3*a7c91847Schristos
4*a7c91847Schristos This file is part of GNU DIFF.
5*a7c91847Schristos
6*a7c91847Schristos GNU DIFF is free software; you can redistribute it and/or modify
7*a7c91847Schristos it under the terms of the GNU General Public License as published by
8*a7c91847Schristos the Free Software Foundation; either version 2, or (at your option)
9*a7c91847Schristos any later version.
10*a7c91847Schristos
11*a7c91847Schristos GNU DIFF is distributed in the hope that it will be useful,
12*a7c91847Schristos but WITHOUT ANY WARRANTY; without even the implied warranty of
13*a7c91847Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14*a7c91847Schristos GNU General Public License for more details.
15*a7c91847Schristos
16*a7c91847Schristos */
17*a7c91847Schristos
18*a7c91847Schristos #include "diff.h"
19*a7c91847Schristos
20*a7c91847Schristos /* Rotate a value n bits to the left. */
21*a7c91847Schristos #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
22*a7c91847Schristos #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
23*a7c91847Schristos
24*a7c91847Schristos /* Given a hash value and a new character, return a new hash value. */
25*a7c91847Schristos #define HASH(h, c) ((c) + ROL (h, 7))
26*a7c91847Schristos
27*a7c91847Schristos /* Guess remaining number of lines from number N of lines so far,
28*a7c91847Schristos size S so far, and total size T. */
29*a7c91847Schristos #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
30*a7c91847Schristos
31*a7c91847Schristos /* Type used for fast prefix comparison in find_identical_ends. */
32*a7c91847Schristos #ifndef word
33*a7c91847Schristos #define word int
34*a7c91847Schristos #endif
35*a7c91847Schristos
36*a7c91847Schristos /* Lines are put into equivalence classes (of lines that match in line_cmp).
37*a7c91847Schristos Each equivalence class is represented by one of these structures,
38*a7c91847Schristos but only while the classes are being computed.
39*a7c91847Schristos Afterward, each class is represented by a number. */
40*a7c91847Schristos struct equivclass
41*a7c91847Schristos {
42*a7c91847Schristos int next; /* Next item in this bucket. */
43*a7c91847Schristos unsigned hash; /* Hash of lines in this class. */
44*a7c91847Schristos char const *line; /* A line that fits this class. */
45*a7c91847Schristos size_t length; /* That line's length, not counting its newline. */
46*a7c91847Schristos };
47*a7c91847Schristos
48*a7c91847Schristos /* Hash-table: array of buckets, each being a chain of equivalence classes.
49*a7c91847Schristos buckets[-1] is reserved for incomplete lines. */
50*a7c91847Schristos static int *buckets;
51*a7c91847Schristos
52*a7c91847Schristos /* Number of buckets in the hash table array, not counting buckets[-1]. */
53*a7c91847Schristos static int nbuckets;
54*a7c91847Schristos
55*a7c91847Schristos /* Array in which the equivalence classes are allocated.
56*a7c91847Schristos The bucket-chains go through the elements in this array.
57*a7c91847Schristos The number of an equivalence class is its index in this array. */
58*a7c91847Schristos static struct equivclass *equivs;
59*a7c91847Schristos
60*a7c91847Schristos /* Index of first free element in the array `equivs'. */
61*a7c91847Schristos static int equivs_index;
62*a7c91847Schristos
63*a7c91847Schristos /* Number of elements allocated in the array `equivs'. */
64*a7c91847Schristos static int equivs_alloc;
65*a7c91847Schristos
66*a7c91847Schristos static void find_and_hash_each_line PARAMS((struct file_data *));
67*a7c91847Schristos static void find_identical_ends PARAMS((struct file_data[]));
68*a7c91847Schristos static void prepare_text_end PARAMS((struct file_data *));
69*a7c91847Schristos
70*a7c91847Schristos /* Check for binary files and compare them for exact identity. */
71*a7c91847Schristos
72*a7c91847Schristos /* Return 1 if BUF contains a non text character.
73*a7c91847Schristos SIZE is the number of characters in BUF. */
74*a7c91847Schristos
75*a7c91847Schristos #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
76*a7c91847Schristos
77*a7c91847Schristos /* Get ready to read the current file.
78*a7c91847Schristos Return nonzero if SKIP_TEST is zero,
79*a7c91847Schristos and if it appears to be a binary file. */
80*a7c91847Schristos
81*a7c91847Schristos int
sip(current,skip_test)82*a7c91847Schristos sip (current, skip_test)
83*a7c91847Schristos struct file_data *current;
84*a7c91847Schristos int skip_test;
85*a7c91847Schristos {
86*a7c91847Schristos /* If we have a nonexistent file at this stage, treat it as empty. */
87*a7c91847Schristos if (current->desc < 0)
88*a7c91847Schristos {
89*a7c91847Schristos /* Leave room for a sentinel. */
90*a7c91847Schristos current->bufsize = sizeof (word);
91*a7c91847Schristos current->buffer = xmalloc (current->bufsize);
92*a7c91847Schristos }
93*a7c91847Schristos else
94*a7c91847Schristos {
95*a7c91847Schristos current->bufsize = STAT_BLOCKSIZE (current->stat);
96*a7c91847Schristos current->buffer = xmalloc (current->bufsize);
97*a7c91847Schristos
98*a7c91847Schristos if (! skip_test)
99*a7c91847Schristos {
100*a7c91847Schristos /* Check first part of file to see if it's a binary file. */
101*a7c91847Schristos #if HAVE_SETMODE
102*a7c91847Schristos int oldmode = setmode (current->desc, O_BINARY);
103*a7c91847Schristos #endif
104*a7c91847Schristos ssize_t n = read (current->desc, current->buffer, current->bufsize);
105*a7c91847Schristos if (n == -1)
106*a7c91847Schristos pfatal_with_name (current->name);
107*a7c91847Schristos current->buffered_chars = n;
108*a7c91847Schristos #if HAVE_SETMODE
109*a7c91847Schristos if (oldmode != O_BINARY)
110*a7c91847Schristos {
111*a7c91847Schristos if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
112*a7c91847Schristos pfatal_with_name (current->name);
113*a7c91847Schristos setmode (current->desc, oldmode);
114*a7c91847Schristos current->buffered_chars = 0;
115*a7c91847Schristos }
116*a7c91847Schristos #endif
117*a7c91847Schristos return binary_file_p (current->buffer, n);
118*a7c91847Schristos }
119*a7c91847Schristos }
120*a7c91847Schristos
121*a7c91847Schristos current->buffered_chars = 0;
122*a7c91847Schristos return 0;
123*a7c91847Schristos }
124*a7c91847Schristos
125*a7c91847Schristos /* Slurp the rest of the current file completely into memory. */
126*a7c91847Schristos
127*a7c91847Schristos void
slurp(current)128*a7c91847Schristos slurp (current)
129*a7c91847Schristos struct file_data *current;
130*a7c91847Schristos {
131*a7c91847Schristos ssize_t cc;
132*a7c91847Schristos
133*a7c91847Schristos if (current->desc < 0)
134*a7c91847Schristos /* The file is nonexistent. */
135*a7c91847Schristos ;
136*a7c91847Schristos else if (S_ISREG (current->stat.st_mode))
137*a7c91847Schristos {
138*a7c91847Schristos /* It's a regular file; slurp in the rest all at once. */
139*a7c91847Schristos
140*a7c91847Schristos /* Get the size out of the stat block.
141*a7c91847Schristos Allocate enough room for appended newline and sentinel. */
142*a7c91847Schristos cc = current->stat.st_size + 1 + sizeof (word);
143*a7c91847Schristos if (current->bufsize < cc)
144*a7c91847Schristos {
145*a7c91847Schristos current->bufsize = cc;
146*a7c91847Schristos current->buffer = xrealloc (current->buffer, cc);
147*a7c91847Schristos }
148*a7c91847Schristos
149*a7c91847Schristos if (current->buffered_chars < current->stat.st_size)
150*a7c91847Schristos {
151*a7c91847Schristos cc = read (current->desc,
152*a7c91847Schristos current->buffer + current->buffered_chars,
153*a7c91847Schristos current->stat.st_size - current->buffered_chars);
154*a7c91847Schristos if (cc == -1)
155*a7c91847Schristos pfatal_with_name (current->name);
156*a7c91847Schristos current->buffered_chars += cc;
157*a7c91847Schristos }
158*a7c91847Schristos }
159*a7c91847Schristos /* It's not a regular file; read it, growing the buffer as needed. */
160*a7c91847Schristos else if (always_text_flag || current->buffered_chars != 0)
161*a7c91847Schristos {
162*a7c91847Schristos for (;;)
163*a7c91847Schristos {
164*a7c91847Schristos if (current->buffered_chars == current->bufsize)
165*a7c91847Schristos {
166*a7c91847Schristos current->bufsize = current->bufsize * 2;
167*a7c91847Schristos current->buffer = xrealloc (current->buffer, current->bufsize);
168*a7c91847Schristos }
169*a7c91847Schristos cc = read (current->desc,
170*a7c91847Schristos current->buffer + current->buffered_chars,
171*a7c91847Schristos current->bufsize - current->buffered_chars);
172*a7c91847Schristos if (cc == 0)
173*a7c91847Schristos break;
174*a7c91847Schristos if (cc == -1)
175*a7c91847Schristos pfatal_with_name (current->name);
176*a7c91847Schristos current->buffered_chars += cc;
177*a7c91847Schristos }
178*a7c91847Schristos /* Allocate just enough room for appended newline and sentinel. */
179*a7c91847Schristos current->bufsize = current->buffered_chars + 1 + sizeof (word);
180*a7c91847Schristos current->buffer = xrealloc (current->buffer, current->bufsize);
181*a7c91847Schristos }
182*a7c91847Schristos }
183*a7c91847Schristos
184*a7c91847Schristos /* Split the file into lines, simultaneously computing the equivalence class for
185*a7c91847Schristos each line. */
186*a7c91847Schristos
187*a7c91847Schristos static void
find_and_hash_each_line(current)188*a7c91847Schristos find_and_hash_each_line (current)
189*a7c91847Schristos struct file_data *current;
190*a7c91847Schristos {
191*a7c91847Schristos unsigned h;
192*a7c91847Schristos unsigned char const *p = (unsigned char const *) current->prefix_end;
193*a7c91847Schristos unsigned char c;
194*a7c91847Schristos int i, *bucket;
195*a7c91847Schristos size_t length;
196*a7c91847Schristos
197*a7c91847Schristos /* Cache often-used quantities in local variables to help the compiler. */
198*a7c91847Schristos char const **linbuf = current->linbuf;
199*a7c91847Schristos int alloc_lines = current->alloc_lines;
200*a7c91847Schristos int line = 0;
201*a7c91847Schristos int linbuf_base = current->linbuf_base;
202*a7c91847Schristos int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
203*a7c91847Schristos struct equivclass *eqs = equivs;
204*a7c91847Schristos int eqs_index = equivs_index;
205*a7c91847Schristos int eqs_alloc = equivs_alloc;
206*a7c91847Schristos char const *suffix_begin = current->suffix_begin;
207*a7c91847Schristos char const *bufend = current->buffer + current->buffered_chars;
208*a7c91847Schristos int use_line_cmp = ignore_some_line_changes;
209*a7c91847Schristos
210*a7c91847Schristos while ((char const *) p < suffix_begin)
211*a7c91847Schristos {
212*a7c91847Schristos char const *ip = (char const *) p;
213*a7c91847Schristos
214*a7c91847Schristos /* Compute the equivalence class for this line. */
215*a7c91847Schristos
216*a7c91847Schristos h = 0;
217*a7c91847Schristos
218*a7c91847Schristos /* Hash this line until we find a newline. */
219*a7c91847Schristos if (ignore_case_flag)
220*a7c91847Schristos {
221*a7c91847Schristos if (ignore_all_space_flag)
222*a7c91847Schristos while ((c = *p++) != '\n')
223*a7c91847Schristos {
224*a7c91847Schristos if (! ISSPACE (c))
225*a7c91847Schristos h = HASH (h, ISUPPER (c) ? tolower (c) : c);
226*a7c91847Schristos }
227*a7c91847Schristos else if (ignore_space_change_flag)
228*a7c91847Schristos while ((c = *p++) != '\n')
229*a7c91847Schristos {
230*a7c91847Schristos if (ISSPACE (c))
231*a7c91847Schristos {
232*a7c91847Schristos for (;;)
233*a7c91847Schristos {
234*a7c91847Schristos c = *p++;
235*a7c91847Schristos if (!ISSPACE (c))
236*a7c91847Schristos break;
237*a7c91847Schristos if (c == '\n')
238*a7c91847Schristos goto hashing_done;
239*a7c91847Schristos }
240*a7c91847Schristos h = HASH (h, ' ');
241*a7c91847Schristos }
242*a7c91847Schristos /* C is now the first non-space. */
243*a7c91847Schristos h = HASH (h, ISUPPER (c) ? tolower (c) : c);
244*a7c91847Schristos }
245*a7c91847Schristos else
246*a7c91847Schristos while ((c = *p++) != '\n')
247*a7c91847Schristos h = HASH (h, ISUPPER (c) ? tolower (c) : c);
248*a7c91847Schristos }
249*a7c91847Schristos else
250*a7c91847Schristos {
251*a7c91847Schristos if (ignore_all_space_flag)
252*a7c91847Schristos while ((c = *p++) != '\n')
253*a7c91847Schristos {
254*a7c91847Schristos if (! ISSPACE (c))
255*a7c91847Schristos h = HASH (h, c);
256*a7c91847Schristos }
257*a7c91847Schristos else if (ignore_space_change_flag)
258*a7c91847Schristos while ((c = *p++) != '\n')
259*a7c91847Schristos {
260*a7c91847Schristos if (ISSPACE (c))
261*a7c91847Schristos {
262*a7c91847Schristos for (;;)
263*a7c91847Schristos {
264*a7c91847Schristos c = *p++;
265*a7c91847Schristos if (!ISSPACE (c))
266*a7c91847Schristos break;
267*a7c91847Schristos if (c == '\n')
268*a7c91847Schristos goto hashing_done;
269*a7c91847Schristos }
270*a7c91847Schristos h = HASH (h, ' ');
271*a7c91847Schristos }
272*a7c91847Schristos /* C is now the first non-space. */
273*a7c91847Schristos h = HASH (h, c);
274*a7c91847Schristos }
275*a7c91847Schristos else
276*a7c91847Schristos while ((c = *p++) != '\n')
277*a7c91847Schristos h = HASH (h, c);
278*a7c91847Schristos }
279*a7c91847Schristos hashing_done:;
280*a7c91847Schristos
281*a7c91847Schristos bucket = &buckets[h % nbuckets];
282*a7c91847Schristos length = (char const *) p - ip - 1;
283*a7c91847Schristos
284*a7c91847Schristos if ((char const *) p == bufend
285*a7c91847Schristos && current->missing_newline
286*a7c91847Schristos && ROBUST_OUTPUT_STYLE (output_style))
287*a7c91847Schristos {
288*a7c91847Schristos /* This line is incomplete. If this is significant,
289*a7c91847Schristos put the line into bucket[-1]. */
290*a7c91847Schristos if (! (ignore_space_change_flag | ignore_all_space_flag))
291*a7c91847Schristos bucket = &buckets[-1];
292*a7c91847Schristos
293*a7c91847Schristos /* Omit the inserted newline when computing linbuf later. */
294*a7c91847Schristos p--;
295*a7c91847Schristos bufend = suffix_begin = (char const *) p;
296*a7c91847Schristos }
297*a7c91847Schristos
298*a7c91847Schristos for (i = *bucket; ; i = eqs[i].next)
299*a7c91847Schristos if (!i)
300*a7c91847Schristos {
301*a7c91847Schristos /* Create a new equivalence class in this bucket. */
302*a7c91847Schristos i = eqs_index++;
303*a7c91847Schristos if (i == eqs_alloc)
304*a7c91847Schristos eqs = (struct equivclass *)
305*a7c91847Schristos xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
306*a7c91847Schristos eqs[i].next = *bucket;
307*a7c91847Schristos eqs[i].hash = h;
308*a7c91847Schristos eqs[i].line = ip;
309*a7c91847Schristos eqs[i].length = length;
310*a7c91847Schristos *bucket = i;
311*a7c91847Schristos break;
312*a7c91847Schristos }
313*a7c91847Schristos else if (eqs[i].hash == h)
314*a7c91847Schristos {
315*a7c91847Schristos char const *eqline = eqs[i].line;
316*a7c91847Schristos
317*a7c91847Schristos /* Reuse existing equivalence class if the lines are identical.
318*a7c91847Schristos This detects the common case of exact identity
319*a7c91847Schristos faster than complete comparison would. */
320*a7c91847Schristos if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
321*a7c91847Schristos break;
322*a7c91847Schristos
323*a7c91847Schristos /* Reuse existing class if line_cmp reports the lines equal. */
324*a7c91847Schristos if (use_line_cmp && line_cmp (eqline, ip) == 0)
325*a7c91847Schristos break;
326*a7c91847Schristos }
327*a7c91847Schristos
328*a7c91847Schristos /* Maybe increase the size of the line table. */
329*a7c91847Schristos if (line == alloc_lines)
330*a7c91847Schristos {
331*a7c91847Schristos /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
332*a7c91847Schristos alloc_lines = 2 * alloc_lines - linbuf_base;
333*a7c91847Schristos cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
334*a7c91847Schristos linbuf = (char const **) xrealloc (linbuf + linbuf_base,
335*a7c91847Schristos (alloc_lines - linbuf_base)
336*a7c91847Schristos * sizeof (*linbuf))
337*a7c91847Schristos - linbuf_base;
338*a7c91847Schristos }
339*a7c91847Schristos linbuf[line] = ip;
340*a7c91847Schristos cureqs[line] = i;
341*a7c91847Schristos ++line;
342*a7c91847Schristos }
343*a7c91847Schristos
344*a7c91847Schristos current->buffered_lines = line;
345*a7c91847Schristos
346*a7c91847Schristos for (i = 0; ; i++)
347*a7c91847Schristos {
348*a7c91847Schristos /* Record the line start for lines in the suffix that we care about.
349*a7c91847Schristos Record one more line start than lines,
350*a7c91847Schristos so that we can compute the length of any buffered line. */
351*a7c91847Schristos if (line == alloc_lines)
352*a7c91847Schristos {
353*a7c91847Schristos /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
354*a7c91847Schristos alloc_lines = 2 * alloc_lines - linbuf_base;
355*a7c91847Schristos linbuf = (char const **) xrealloc (linbuf + linbuf_base,
356*a7c91847Schristos (alloc_lines - linbuf_base)
357*a7c91847Schristos * sizeof (*linbuf))
358*a7c91847Schristos - linbuf_base;
359*a7c91847Schristos }
360*a7c91847Schristos linbuf[line] = (char const *) p;
361*a7c91847Schristos
362*a7c91847Schristos if ((char const *) p == bufend)
363*a7c91847Schristos break;
364*a7c91847Schristos
365*a7c91847Schristos if (context <= i && no_diff_means_no_output)
366*a7c91847Schristos break;
367*a7c91847Schristos
368*a7c91847Schristos line++;
369*a7c91847Schristos
370*a7c91847Schristos while (*p++ != '\n')
371*a7c91847Schristos ;
372*a7c91847Schristos }
373*a7c91847Schristos
374*a7c91847Schristos /* Done with cache in local variables. */
375*a7c91847Schristos current->linbuf = linbuf;
376*a7c91847Schristos current->valid_lines = line;
377*a7c91847Schristos current->alloc_lines = alloc_lines;
378*a7c91847Schristos current->equivs = cureqs;
379*a7c91847Schristos equivs = eqs;
380*a7c91847Schristos equivs_alloc = eqs_alloc;
381*a7c91847Schristos equivs_index = eqs_index;
382*a7c91847Schristos }
383*a7c91847Schristos
384*a7c91847Schristos /* Prepare the end of the text. Make sure it's initialized.
385*a7c91847Schristos Make sure text ends in a newline,
386*a7c91847Schristos but remember that we had to add one. */
387*a7c91847Schristos
388*a7c91847Schristos static void
prepare_text_end(current)389*a7c91847Schristos prepare_text_end (current)
390*a7c91847Schristos struct file_data *current;
391*a7c91847Schristos {
392*a7c91847Schristos size_t buffered_chars = current->buffered_chars;
393*a7c91847Schristos char *p = current->buffer;
394*a7c91847Schristos
395*a7c91847Schristos if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
396*a7c91847Schristos current->missing_newline = 0;
397*a7c91847Schristos else
398*a7c91847Schristos {
399*a7c91847Schristos p[buffered_chars++] = '\n';
400*a7c91847Schristos current->buffered_chars = buffered_chars;
401*a7c91847Schristos current->missing_newline = 1;
402*a7c91847Schristos }
403*a7c91847Schristos
404*a7c91847Schristos /* Don't use uninitialized storage when planting or using sentinels. */
405*a7c91847Schristos if (p)
406*a7c91847Schristos bzero (p + buffered_chars, sizeof (word));
407*a7c91847Schristos }
408*a7c91847Schristos
409*a7c91847Schristos /* Given a vector of two file_data objects, find the identical
410*a7c91847Schristos prefixes and suffixes of each object. */
411*a7c91847Schristos
412*a7c91847Schristos static void
find_identical_ends(filevec)413*a7c91847Schristos find_identical_ends (filevec)
414*a7c91847Schristos struct file_data filevec[];
415*a7c91847Schristos {
416*a7c91847Schristos word *w0, *w1;
417*a7c91847Schristos char *p0, *p1, *buffer0, *buffer1;
418*a7c91847Schristos char const *end0, *beg0;
419*a7c91847Schristos char const **linbuf0, **linbuf1;
420*a7c91847Schristos int i, lines;
421*a7c91847Schristos size_t n0, n1, tem;
422*a7c91847Schristos int alloc_lines0, alloc_lines1;
423*a7c91847Schristos int buffered_prefix, prefix_count, prefix_mask;
424*a7c91847Schristos
425*a7c91847Schristos slurp (&filevec[0]);
426*a7c91847Schristos if (filevec[0].desc != filevec[1].desc)
427*a7c91847Schristos slurp (&filevec[1]);
428*a7c91847Schristos else
429*a7c91847Schristos {
430*a7c91847Schristos filevec[1].buffer = filevec[0].buffer;
431*a7c91847Schristos filevec[1].bufsize = filevec[0].bufsize;
432*a7c91847Schristos filevec[1].buffered_chars = filevec[0].buffered_chars;
433*a7c91847Schristos }
434*a7c91847Schristos for (i = 0; i < 2; i++)
435*a7c91847Schristos prepare_text_end (&filevec[i]);
436*a7c91847Schristos
437*a7c91847Schristos /* Find identical prefix. */
438*a7c91847Schristos
439*a7c91847Schristos p0 = buffer0 = filevec[0].buffer;
440*a7c91847Schristos p1 = buffer1 = filevec[1].buffer;
441*a7c91847Schristos
442*a7c91847Schristos n0 = filevec[0].buffered_chars;
443*a7c91847Schristos n1 = filevec[1].buffered_chars;
444*a7c91847Schristos
445*a7c91847Schristos if (p0 == p1)
446*a7c91847Schristos /* The buffers are the same; sentinels won't work. */
447*a7c91847Schristos p0 = p1 += n1;
448*a7c91847Schristos else
449*a7c91847Schristos {
450*a7c91847Schristos /* Insert end sentinels, in this case characters that are guaranteed
451*a7c91847Schristos to make the equality test false, and thus terminate the loop. */
452*a7c91847Schristos
453*a7c91847Schristos if (n0 < n1)
454*a7c91847Schristos p0[n0] = ~p1[n0];
455*a7c91847Schristos else
456*a7c91847Schristos p1[n1] = ~p0[n1];
457*a7c91847Schristos
458*a7c91847Schristos /* Loop until first mismatch, or to the sentinel characters. */
459*a7c91847Schristos
460*a7c91847Schristos /* Compare a word at a time for speed. */
461*a7c91847Schristos w0 = (word *) p0;
462*a7c91847Schristos w1 = (word *) p1;
463*a7c91847Schristos while (*w0++ == *w1++)
464*a7c91847Schristos ;
465*a7c91847Schristos --w0, --w1;
466*a7c91847Schristos
467*a7c91847Schristos /* Do the last few bytes of comparison a byte at a time. */
468*a7c91847Schristos p0 = (char *) w0;
469*a7c91847Schristos p1 = (char *) w1;
470*a7c91847Schristos while (*p0++ == *p1++)
471*a7c91847Schristos ;
472*a7c91847Schristos --p0, --p1;
473*a7c91847Schristos
474*a7c91847Schristos /* Don't mistakenly count missing newline as part of prefix. */
475*a7c91847Schristos if (ROBUST_OUTPUT_STYLE (output_style)
476*a7c91847Schristos && (buffer0 + n0 - filevec[0].missing_newline < p0)
477*a7c91847Schristos !=
478*a7c91847Schristos (buffer1 + n1 - filevec[1].missing_newline < p1))
479*a7c91847Schristos --p0, --p1;
480*a7c91847Schristos }
481*a7c91847Schristos
482*a7c91847Schristos /* Now P0 and P1 point at the first nonmatching characters. */
483*a7c91847Schristos
484*a7c91847Schristos /* Skip back to last line-beginning in the prefix,
485*a7c91847Schristos and then discard up to HORIZON_LINES lines from the prefix. */
486*a7c91847Schristos i = horizon_lines;
487*a7c91847Schristos while (p0 != buffer0 && (p0[-1] != '\n' || i--))
488*a7c91847Schristos --p0, --p1;
489*a7c91847Schristos
490*a7c91847Schristos /* Record the prefix. */
491*a7c91847Schristos filevec[0].prefix_end = p0;
492*a7c91847Schristos filevec[1].prefix_end = p1;
493*a7c91847Schristos
494*a7c91847Schristos /* Find identical suffix. */
495*a7c91847Schristos
496*a7c91847Schristos /* P0 and P1 point beyond the last chars not yet compared. */
497*a7c91847Schristos p0 = buffer0 + n0;
498*a7c91847Schristos p1 = buffer1 + n1;
499*a7c91847Schristos
500*a7c91847Schristos if (! ROBUST_OUTPUT_STYLE (output_style)
501*a7c91847Schristos || filevec[0].missing_newline == filevec[1].missing_newline)
502*a7c91847Schristos {
503*a7c91847Schristos end0 = p0; /* Addr of last char in file 0. */
504*a7c91847Schristos
505*a7c91847Schristos /* Get value of P0 at which we should stop scanning backward:
506*a7c91847Schristos this is when either P0 or P1 points just past the last char
507*a7c91847Schristos of the identical prefix. */
508*a7c91847Schristos beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
509*a7c91847Schristos
510*a7c91847Schristos /* Scan back until chars don't match or we reach that point. */
511*a7c91847Schristos for (; p0 != beg0; p0--, p1--)
512*a7c91847Schristos if (*p0 != *p1)
513*a7c91847Schristos {
514*a7c91847Schristos /* Point at the first char of the matching suffix. */
515*a7c91847Schristos beg0 = p0;
516*a7c91847Schristos break;
517*a7c91847Schristos }
518*a7c91847Schristos
519*a7c91847Schristos /* Are we at a line-beginning in both files? If not, add the rest of
520*a7c91847Schristos this line to the main body. Discard up to HORIZON_LINES lines from
521*a7c91847Schristos the identical suffix. Also, discard one extra line,
522*a7c91847Schristos because shift_boundaries may need it. */
523*a7c91847Schristos i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
524*a7c91847Schristos &&
525*a7c91847Schristos (buffer1 == p1 || p1[-1] == '\n'));
526*a7c91847Schristos while (i-- && p0 != end0)
527*a7c91847Schristos while (*p0++ != '\n')
528*a7c91847Schristos ;
529*a7c91847Schristos
530*a7c91847Schristos p1 += p0 - beg0;
531*a7c91847Schristos }
532*a7c91847Schristos
533*a7c91847Schristos /* Record the suffix. */
534*a7c91847Schristos filevec[0].suffix_begin = p0;
535*a7c91847Schristos filevec[1].suffix_begin = p1;
536*a7c91847Schristos
537*a7c91847Schristos /* Calculate number of lines of prefix to save.
538*a7c91847Schristos
539*a7c91847Schristos prefix_count == 0 means save the whole prefix;
540*a7c91847Schristos we need this with for options like -D that output the whole file.
541*a7c91847Schristos We also need it for options like -F that output some preceding line;
542*a7c91847Schristos at least we will need to find the last few lines,
543*a7c91847Schristos but since we don't know how many, it's easiest to find them all.
544*a7c91847Schristos
545*a7c91847Schristos Otherwise, prefix_count != 0. Save just prefix_count lines at start
546*a7c91847Schristos of the line buffer; they'll be moved to the proper location later.
547*a7c91847Schristos Handle 1 more line than the context says (because we count 1 too many),
548*a7c91847Schristos rounded up to the next power of 2 to speed index computation. */
549*a7c91847Schristos
550*a7c91847Schristos if (no_diff_means_no_output && ! function_regexp_list)
551*a7c91847Schristos {
552*a7c91847Schristos for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2)
553*a7c91847Schristos ;
554*a7c91847Schristos prefix_mask = prefix_count - 1;
555*a7c91847Schristos alloc_lines0
556*a7c91847Schristos = prefix_count
557*a7c91847Schristos + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
558*a7c91847Schristos + context;
559*a7c91847Schristos }
560*a7c91847Schristos else
561*a7c91847Schristos {
562*a7c91847Schristos prefix_count = 0;
563*a7c91847Schristos prefix_mask = ~0;
564*a7c91847Schristos alloc_lines0 = GUESS_LINES (0, 0, n0);
565*a7c91847Schristos }
566*a7c91847Schristos
567*a7c91847Schristos lines = 0;
568*a7c91847Schristos linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
569*a7c91847Schristos
570*a7c91847Schristos /* If the prefix is needed, find the prefix lines. */
571*a7c91847Schristos if (! (no_diff_means_no_output
572*a7c91847Schristos && filevec[0].prefix_end == p0
573*a7c91847Schristos && filevec[1].prefix_end == p1))
574*a7c91847Schristos {
575*a7c91847Schristos p0 = buffer0;
576*a7c91847Schristos end0 = filevec[0].prefix_end;
577*a7c91847Schristos while (p0 != end0)
578*a7c91847Schristos {
579*a7c91847Schristos int l = lines++ & prefix_mask;
580*a7c91847Schristos if (l == alloc_lines0)
581*a7c91847Schristos linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
582*a7c91847Schristos * sizeof(*linbuf0));
583*a7c91847Schristos linbuf0[l] = p0;
584*a7c91847Schristos while (*p0++ != '\n')
585*a7c91847Schristos ;
586*a7c91847Schristos }
587*a7c91847Schristos }
588*a7c91847Schristos buffered_prefix = prefix_count && context < lines ? context : lines;
589*a7c91847Schristos
590*a7c91847Schristos /* Allocate line buffer 1. */
591*a7c91847Schristos tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
592*a7c91847Schristos
593*a7c91847Schristos alloc_lines1
594*a7c91847Schristos = (buffered_prefix
595*a7c91847Schristos + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
596*a7c91847Schristos + context);
597*a7c91847Schristos linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
598*a7c91847Schristos
599*a7c91847Schristos if (buffered_prefix != lines)
600*a7c91847Schristos {
601*a7c91847Schristos /* Rotate prefix lines to proper location. */
602*a7c91847Schristos for (i = 0; i < buffered_prefix; i++)
603*a7c91847Schristos linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
604*a7c91847Schristos for (i = 0; i < buffered_prefix; i++)
605*a7c91847Schristos linbuf0[i] = linbuf1[i];
606*a7c91847Schristos }
607*a7c91847Schristos
608*a7c91847Schristos /* Initialize line buffer 1 from line buffer 0. */
609*a7c91847Schristos for (i = 0; i < buffered_prefix; i++)
610*a7c91847Schristos linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
611*a7c91847Schristos
612*a7c91847Schristos /* Record the line buffer, adjusted so that
613*a7c91847Schristos linbuf*[0] points at the first differing line. */
614*a7c91847Schristos filevec[0].linbuf = linbuf0 + buffered_prefix;
615*a7c91847Schristos filevec[1].linbuf = linbuf1 + buffered_prefix;
616*a7c91847Schristos filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
617*a7c91847Schristos filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
618*a7c91847Schristos filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
619*a7c91847Schristos filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
620*a7c91847Schristos }
621*a7c91847Schristos
622*a7c91847Schristos /* Largest primes less than some power of two, for nbuckets. Values range
623*a7c91847Schristos from useful to preposterous. If one of these numbers isn't prime
624*a7c91847Schristos after all, don't blame it on me, blame it on primes (6) . . . */
625*a7c91847Schristos static int const primes[] =
626*a7c91847Schristos {
627*a7c91847Schristos 509,
628*a7c91847Schristos 1021,
629*a7c91847Schristos 2039,
630*a7c91847Schristos 4093,
631*a7c91847Schristos 8191,
632*a7c91847Schristos 16381,
633*a7c91847Schristos 32749,
634*a7c91847Schristos #if 32767 < INT_MAX
635*a7c91847Schristos 65521,
636*a7c91847Schristos 131071,
637*a7c91847Schristos 262139,
638*a7c91847Schristos 524287,
639*a7c91847Schristos 1048573,
640*a7c91847Schristos 2097143,
641*a7c91847Schristos 4194301,
642*a7c91847Schristos 8388593,
643*a7c91847Schristos 16777213,
644*a7c91847Schristos 33554393,
645*a7c91847Schristos 67108859, /* Preposterously large . . . */
646*a7c91847Schristos 134217689,
647*a7c91847Schristos 268435399,
648*a7c91847Schristos 536870909,
649*a7c91847Schristos 1073741789,
650*a7c91847Schristos 2147483647,
651*a7c91847Schristos #endif
652*a7c91847Schristos 0
653*a7c91847Schristos };
654*a7c91847Schristos
655*a7c91847Schristos /* Given a vector of two file_data objects, read the file associated
656*a7c91847Schristos with each one, and build the table of equivalence classes.
657*a7c91847Schristos Return 1 if either file appears to be a binary file.
658*a7c91847Schristos If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
659*a7c91847Schristos
660*a7c91847Schristos int
read_files(filevec,pretend_binary)661*a7c91847Schristos read_files (filevec, pretend_binary)
662*a7c91847Schristos struct file_data filevec[];
663*a7c91847Schristos int pretend_binary;
664*a7c91847Schristos {
665*a7c91847Schristos int i;
666*a7c91847Schristos int skip_test = always_text_flag | pretend_binary;
667*a7c91847Schristos int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
668*a7c91847Schristos
669*a7c91847Schristos if (filevec[0].desc != filevec[1].desc)
670*a7c91847Schristos appears_binary |= sip (&filevec[1], skip_test | appears_binary);
671*a7c91847Schristos else
672*a7c91847Schristos {
673*a7c91847Schristos filevec[1].buffer = filevec[0].buffer;
674*a7c91847Schristos filevec[1].bufsize = filevec[0].bufsize;
675*a7c91847Schristos filevec[1].buffered_chars = filevec[0].buffered_chars;
676*a7c91847Schristos }
677*a7c91847Schristos if (appears_binary)
678*a7c91847Schristos {
679*a7c91847Schristos #if HAVE_SETMODE
680*a7c91847Schristos setmode (filevec[0].desc, O_BINARY);
681*a7c91847Schristos setmode (filevec[1].desc, O_BINARY);
682*a7c91847Schristos #endif
683*a7c91847Schristos return 1;
684*a7c91847Schristos }
685*a7c91847Schristos
686*a7c91847Schristos find_identical_ends (filevec);
687*a7c91847Schristos
688*a7c91847Schristos equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
689*a7c91847Schristos equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
690*a7c91847Schristos /* Equivalence class 0 is permanently safe for lines that were not
691*a7c91847Schristos hashed. Real equivalence classes start at 1. */
692*a7c91847Schristos equivs_index = 1;
693*a7c91847Schristos
694*a7c91847Schristos for (i = 0; primes[i] < equivs_alloc / 3; i++)
695*a7c91847Schristos if (! primes[i])
696*a7c91847Schristos abort ();
697*a7c91847Schristos nbuckets = primes[i];
698*a7c91847Schristos
699*a7c91847Schristos buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
700*a7c91847Schristos bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
701*a7c91847Schristos
702*a7c91847Schristos for (i = 0; i < 2; i++)
703*a7c91847Schristos find_and_hash_each_line (&filevec[i]);
704*a7c91847Schristos
705*a7c91847Schristos filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
706*a7c91847Schristos
707*a7c91847Schristos free (equivs);
708*a7c91847Schristos free (buckets - 1);
709*a7c91847Schristos
710*a7c91847Schristos return 0;
711*a7c91847Schristos }
712