1*86d7f5d3SJohn Marino /* File I/O for GNU DIFF.
2*86d7f5d3SJohn Marino Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
3*86d7f5d3SJohn Marino
4*86d7f5d3SJohn Marino This file is part of GNU DIFF.
5*86d7f5d3SJohn Marino
6*86d7f5d3SJohn Marino GNU DIFF is free software; you can redistribute it and/or modify
7*86d7f5d3SJohn Marino it under the terms of the GNU General Public License as published by
8*86d7f5d3SJohn Marino the Free Software Foundation; either version 2, or (at your option)
9*86d7f5d3SJohn Marino any later version.
10*86d7f5d3SJohn Marino
11*86d7f5d3SJohn Marino GNU DIFF is distributed in the hope that it will be useful,
12*86d7f5d3SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
13*86d7f5d3SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14*86d7f5d3SJohn Marino GNU General Public License for more details.
15*86d7f5d3SJohn Marino
16*86d7f5d3SJohn Marino */
17*86d7f5d3SJohn Marino
18*86d7f5d3SJohn Marino #include "diff.h"
19*86d7f5d3SJohn Marino
20*86d7f5d3SJohn Marino /* Rotate a value n bits to the left. */
21*86d7f5d3SJohn Marino #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
22*86d7f5d3SJohn Marino #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
23*86d7f5d3SJohn Marino
24*86d7f5d3SJohn Marino /* Given a hash value and a new character, return a new hash value. */
25*86d7f5d3SJohn Marino #define HASH(h, c) ((c) + ROL (h, 7))
26*86d7f5d3SJohn Marino
27*86d7f5d3SJohn Marino /* Guess remaining number of lines from number N of lines so far,
28*86d7f5d3SJohn Marino size S so far, and total size T. */
29*86d7f5d3SJohn Marino #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
30*86d7f5d3SJohn Marino
31*86d7f5d3SJohn Marino /* Type used for fast prefix comparison in find_identical_ends. */
32*86d7f5d3SJohn Marino #ifndef word
33*86d7f5d3SJohn Marino #define word int
34*86d7f5d3SJohn Marino #endif
35*86d7f5d3SJohn Marino
36*86d7f5d3SJohn Marino /* Lines are put into equivalence classes (of lines that match in line_cmp).
37*86d7f5d3SJohn Marino Each equivalence class is represented by one of these structures,
38*86d7f5d3SJohn Marino but only while the classes are being computed.
39*86d7f5d3SJohn Marino Afterward, each class is represented by a number. */
40*86d7f5d3SJohn Marino struct equivclass
41*86d7f5d3SJohn Marino {
42*86d7f5d3SJohn Marino int next; /* Next item in this bucket. */
43*86d7f5d3SJohn Marino unsigned hash; /* Hash of lines in this class. */
44*86d7f5d3SJohn Marino char const *line; /* A line that fits this class. */
45*86d7f5d3SJohn Marino size_t length; /* That line's length, not counting its newline. */
46*86d7f5d3SJohn Marino };
47*86d7f5d3SJohn Marino
48*86d7f5d3SJohn Marino /* Hash-table: array of buckets, each being a chain of equivalence classes.
49*86d7f5d3SJohn Marino buckets[-1] is reserved for incomplete lines. */
50*86d7f5d3SJohn Marino static int *buckets;
51*86d7f5d3SJohn Marino
52*86d7f5d3SJohn Marino /* Number of buckets in the hash table array, not counting buckets[-1]. */
53*86d7f5d3SJohn Marino static int nbuckets;
54*86d7f5d3SJohn Marino
55*86d7f5d3SJohn Marino /* Array in which the equivalence classes are allocated.
56*86d7f5d3SJohn Marino The bucket-chains go through the elements in this array.
57*86d7f5d3SJohn Marino The number of an equivalence class is its index in this array. */
58*86d7f5d3SJohn Marino static struct equivclass *equivs;
59*86d7f5d3SJohn Marino
60*86d7f5d3SJohn Marino /* Index of first free element in the array `equivs'. */
61*86d7f5d3SJohn Marino static int equivs_index;
62*86d7f5d3SJohn Marino
63*86d7f5d3SJohn Marino /* Number of elements allocated in the array `equivs'. */
64*86d7f5d3SJohn Marino static int equivs_alloc;
65*86d7f5d3SJohn Marino
66*86d7f5d3SJohn Marino static void find_and_hash_each_line PARAMS((struct file_data *));
67*86d7f5d3SJohn Marino static void find_identical_ends PARAMS((struct file_data[]));
68*86d7f5d3SJohn Marino static void prepare_text_end PARAMS((struct file_data *));
69*86d7f5d3SJohn Marino
70*86d7f5d3SJohn Marino /* Check for binary files and compare them for exact identity. */
71*86d7f5d3SJohn Marino
72*86d7f5d3SJohn Marino /* Return 1 if BUF contains a non text character.
73*86d7f5d3SJohn Marino SIZE is the number of characters in BUF. */
74*86d7f5d3SJohn Marino
75*86d7f5d3SJohn Marino #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
76*86d7f5d3SJohn Marino
77*86d7f5d3SJohn Marino /* Get ready to read the current file.
78*86d7f5d3SJohn Marino Return nonzero if SKIP_TEST is zero,
79*86d7f5d3SJohn Marino and if it appears to be a binary file. */
80*86d7f5d3SJohn Marino
81*86d7f5d3SJohn Marino int
sip(current,skip_test)82*86d7f5d3SJohn Marino sip (current, skip_test)
83*86d7f5d3SJohn Marino struct file_data *current;
84*86d7f5d3SJohn Marino int skip_test;
85*86d7f5d3SJohn Marino {
86*86d7f5d3SJohn Marino /* If we have a nonexistent file at this stage, treat it as empty. */
87*86d7f5d3SJohn Marino if (current->desc < 0)
88*86d7f5d3SJohn Marino {
89*86d7f5d3SJohn Marino /* Leave room for a sentinel. */
90*86d7f5d3SJohn Marino current->bufsize = sizeof (word);
91*86d7f5d3SJohn Marino current->buffer = xmalloc (current->bufsize);
92*86d7f5d3SJohn Marino }
93*86d7f5d3SJohn Marino else
94*86d7f5d3SJohn Marino {
95*86d7f5d3SJohn Marino current->bufsize = STAT_BLOCKSIZE (current->stat);
96*86d7f5d3SJohn Marino current->buffer = xmalloc (current->bufsize);
97*86d7f5d3SJohn Marino
98*86d7f5d3SJohn Marino if (! skip_test)
99*86d7f5d3SJohn Marino {
100*86d7f5d3SJohn Marino /* Check first part of file to see if it's a binary file. */
101*86d7f5d3SJohn Marino #if HAVE_SETMODE
102*86d7f5d3SJohn Marino int oldmode = setmode (current->desc, O_BINARY);
103*86d7f5d3SJohn Marino #endif
104*86d7f5d3SJohn Marino ssize_t n = read (current->desc, current->buffer, current->bufsize);
105*86d7f5d3SJohn Marino if (n == -1)
106*86d7f5d3SJohn Marino pfatal_with_name (current->name);
107*86d7f5d3SJohn Marino current->buffered_chars = n;
108*86d7f5d3SJohn Marino #if HAVE_SETMODE
109*86d7f5d3SJohn Marino if (oldmode != O_BINARY)
110*86d7f5d3SJohn Marino {
111*86d7f5d3SJohn Marino if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
112*86d7f5d3SJohn Marino pfatal_with_name (current->name);
113*86d7f5d3SJohn Marino setmode (current->desc, oldmode);
114*86d7f5d3SJohn Marino current->buffered_chars = 0;
115*86d7f5d3SJohn Marino }
116*86d7f5d3SJohn Marino #endif
117*86d7f5d3SJohn Marino return binary_file_p (current->buffer, n);
118*86d7f5d3SJohn Marino }
119*86d7f5d3SJohn Marino }
120*86d7f5d3SJohn Marino
121*86d7f5d3SJohn Marino current->buffered_chars = 0;
122*86d7f5d3SJohn Marino return 0;
123*86d7f5d3SJohn Marino }
124*86d7f5d3SJohn Marino
125*86d7f5d3SJohn Marino /* Slurp the rest of the current file completely into memory. */
126*86d7f5d3SJohn Marino
127*86d7f5d3SJohn Marino void
slurp(current)128*86d7f5d3SJohn Marino slurp (current)
129*86d7f5d3SJohn Marino struct file_data *current;
130*86d7f5d3SJohn Marino {
131*86d7f5d3SJohn Marino ssize_t cc;
132*86d7f5d3SJohn Marino
133*86d7f5d3SJohn Marino if (current->desc < 0)
134*86d7f5d3SJohn Marino /* The file is nonexistent. */
135*86d7f5d3SJohn Marino ;
136*86d7f5d3SJohn Marino else if (S_ISREG (current->stat.st_mode))
137*86d7f5d3SJohn Marino {
138*86d7f5d3SJohn Marino /* It's a regular file; slurp in the rest all at once. */
139*86d7f5d3SJohn Marino
140*86d7f5d3SJohn Marino /* Get the size out of the stat block.
141*86d7f5d3SJohn Marino Allocate enough room for appended newline and sentinel. */
142*86d7f5d3SJohn Marino cc = current->stat.st_size + 1 + sizeof (word);
143*86d7f5d3SJohn Marino if (current->bufsize < cc)
144*86d7f5d3SJohn Marino {
145*86d7f5d3SJohn Marino current->bufsize = cc;
146*86d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, cc);
147*86d7f5d3SJohn Marino }
148*86d7f5d3SJohn Marino
149*86d7f5d3SJohn Marino if (current->buffered_chars < current->stat.st_size)
150*86d7f5d3SJohn Marino {
151*86d7f5d3SJohn Marino cc = read (current->desc,
152*86d7f5d3SJohn Marino current->buffer + current->buffered_chars,
153*86d7f5d3SJohn Marino current->stat.st_size - current->buffered_chars);
154*86d7f5d3SJohn Marino if (cc == -1)
155*86d7f5d3SJohn Marino pfatal_with_name (current->name);
156*86d7f5d3SJohn Marino current->buffered_chars += cc;
157*86d7f5d3SJohn Marino }
158*86d7f5d3SJohn Marino }
159*86d7f5d3SJohn Marino /* It's not a regular file; read it, growing the buffer as needed. */
160*86d7f5d3SJohn Marino else if (always_text_flag || current->buffered_chars != 0)
161*86d7f5d3SJohn Marino {
162*86d7f5d3SJohn Marino for (;;)
163*86d7f5d3SJohn Marino {
164*86d7f5d3SJohn Marino if (current->buffered_chars == current->bufsize)
165*86d7f5d3SJohn Marino {
166*86d7f5d3SJohn Marino current->bufsize = current->bufsize * 2;
167*86d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, current->bufsize);
168*86d7f5d3SJohn Marino }
169*86d7f5d3SJohn Marino cc = read (current->desc,
170*86d7f5d3SJohn Marino current->buffer + current->buffered_chars,
171*86d7f5d3SJohn Marino current->bufsize - current->buffered_chars);
172*86d7f5d3SJohn Marino if (cc == 0)
173*86d7f5d3SJohn Marino break;
174*86d7f5d3SJohn Marino if (cc == -1)
175*86d7f5d3SJohn Marino pfatal_with_name (current->name);
176*86d7f5d3SJohn Marino current->buffered_chars += cc;
177*86d7f5d3SJohn Marino }
178*86d7f5d3SJohn Marino /* Allocate just enough room for appended newline and sentinel. */
179*86d7f5d3SJohn Marino current->bufsize = current->buffered_chars + 1 + sizeof (word);
180*86d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, current->bufsize);
181*86d7f5d3SJohn Marino }
182*86d7f5d3SJohn Marino }
183*86d7f5d3SJohn Marino
184*86d7f5d3SJohn Marino /* Split the file into lines, simultaneously computing the equivalence class for
185*86d7f5d3SJohn Marino each line. */
186*86d7f5d3SJohn Marino
187*86d7f5d3SJohn Marino static void
find_and_hash_each_line(current)188*86d7f5d3SJohn Marino find_and_hash_each_line (current)
189*86d7f5d3SJohn Marino struct file_data *current;
190*86d7f5d3SJohn Marino {
191*86d7f5d3SJohn Marino unsigned h;
192*86d7f5d3SJohn Marino unsigned char const *p = (unsigned char const *) current->prefix_end;
193*86d7f5d3SJohn Marino unsigned char c;
194*86d7f5d3SJohn Marino int i, *bucket;
195*86d7f5d3SJohn Marino size_t length;
196*86d7f5d3SJohn Marino
197*86d7f5d3SJohn Marino /* Cache often-used quantities in local variables to help the compiler. */
198*86d7f5d3SJohn Marino char const **linbuf = current->linbuf;
199*86d7f5d3SJohn Marino int alloc_lines = current->alloc_lines;
200*86d7f5d3SJohn Marino int line = 0;
201*86d7f5d3SJohn Marino int linbuf_base = current->linbuf_base;
202*86d7f5d3SJohn Marino int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
203*86d7f5d3SJohn Marino struct equivclass *eqs = equivs;
204*86d7f5d3SJohn Marino int eqs_index = equivs_index;
205*86d7f5d3SJohn Marino int eqs_alloc = equivs_alloc;
206*86d7f5d3SJohn Marino char const *suffix_begin = current->suffix_begin;
207*86d7f5d3SJohn Marino char const *bufend = current->buffer + current->buffered_chars;
208*86d7f5d3SJohn Marino int use_line_cmp = ignore_some_line_changes;
209*86d7f5d3SJohn Marino
210*86d7f5d3SJohn Marino while ((char const *) p < suffix_begin)
211*86d7f5d3SJohn Marino {
212*86d7f5d3SJohn Marino char const *ip = (char const *) p;
213*86d7f5d3SJohn Marino
214*86d7f5d3SJohn Marino /* Compute the equivalence class for this line. */
215*86d7f5d3SJohn Marino
216*86d7f5d3SJohn Marino h = 0;
217*86d7f5d3SJohn Marino
218*86d7f5d3SJohn Marino /* Hash this line until we find a newline. */
219*86d7f5d3SJohn Marino if (ignore_case_flag)
220*86d7f5d3SJohn Marino {
221*86d7f5d3SJohn Marino if (ignore_all_space_flag)
222*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
223*86d7f5d3SJohn Marino {
224*86d7f5d3SJohn Marino if (! ISSPACE (c))
225*86d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
226*86d7f5d3SJohn Marino }
227*86d7f5d3SJohn Marino else if (ignore_space_change_flag)
228*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
229*86d7f5d3SJohn Marino {
230*86d7f5d3SJohn Marino if (ISSPACE (c))
231*86d7f5d3SJohn Marino {
232*86d7f5d3SJohn Marino for (;;)
233*86d7f5d3SJohn Marino {
234*86d7f5d3SJohn Marino c = *p++;
235*86d7f5d3SJohn Marino if (!ISSPACE (c))
236*86d7f5d3SJohn Marino break;
237*86d7f5d3SJohn Marino if (c == '\n')
238*86d7f5d3SJohn Marino goto hashing_done;
239*86d7f5d3SJohn Marino }
240*86d7f5d3SJohn Marino h = HASH (h, ' ');
241*86d7f5d3SJohn Marino }
242*86d7f5d3SJohn Marino /* C is now the first non-space. */
243*86d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
244*86d7f5d3SJohn Marino }
245*86d7f5d3SJohn Marino else
246*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
247*86d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
248*86d7f5d3SJohn Marino }
249*86d7f5d3SJohn Marino else
250*86d7f5d3SJohn Marino {
251*86d7f5d3SJohn Marino if (ignore_all_space_flag)
252*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
253*86d7f5d3SJohn Marino {
254*86d7f5d3SJohn Marino if (! ISSPACE (c))
255*86d7f5d3SJohn Marino h = HASH (h, c);
256*86d7f5d3SJohn Marino }
257*86d7f5d3SJohn Marino else if (ignore_space_change_flag)
258*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
259*86d7f5d3SJohn Marino {
260*86d7f5d3SJohn Marino if (ISSPACE (c))
261*86d7f5d3SJohn Marino {
262*86d7f5d3SJohn Marino for (;;)
263*86d7f5d3SJohn Marino {
264*86d7f5d3SJohn Marino c = *p++;
265*86d7f5d3SJohn Marino if (!ISSPACE (c))
266*86d7f5d3SJohn Marino break;
267*86d7f5d3SJohn Marino if (c == '\n')
268*86d7f5d3SJohn Marino goto hashing_done;
269*86d7f5d3SJohn Marino }
270*86d7f5d3SJohn Marino h = HASH (h, ' ');
271*86d7f5d3SJohn Marino }
272*86d7f5d3SJohn Marino /* C is now the first non-space. */
273*86d7f5d3SJohn Marino h = HASH (h, c);
274*86d7f5d3SJohn Marino }
275*86d7f5d3SJohn Marino else
276*86d7f5d3SJohn Marino while ((c = *p++) != '\n')
277*86d7f5d3SJohn Marino h = HASH (h, c);
278*86d7f5d3SJohn Marino }
279*86d7f5d3SJohn Marino hashing_done:;
280*86d7f5d3SJohn Marino
281*86d7f5d3SJohn Marino bucket = &buckets[h % nbuckets];
282*86d7f5d3SJohn Marino length = (char const *) p - ip - 1;
283*86d7f5d3SJohn Marino
284*86d7f5d3SJohn Marino if ((char const *) p == bufend
285*86d7f5d3SJohn Marino && current->missing_newline
286*86d7f5d3SJohn Marino && ROBUST_OUTPUT_STYLE (output_style))
287*86d7f5d3SJohn Marino {
288*86d7f5d3SJohn Marino /* This line is incomplete. If this is significant,
289*86d7f5d3SJohn Marino put the line into bucket[-1]. */
290*86d7f5d3SJohn Marino if (! (ignore_space_change_flag | ignore_all_space_flag))
291*86d7f5d3SJohn Marino bucket = &buckets[-1];
292*86d7f5d3SJohn Marino
293*86d7f5d3SJohn Marino /* Omit the inserted newline when computing linbuf later. */
294*86d7f5d3SJohn Marino p--;
295*86d7f5d3SJohn Marino bufend = suffix_begin = (char const *) p;
296*86d7f5d3SJohn Marino }
297*86d7f5d3SJohn Marino
298*86d7f5d3SJohn Marino for (i = *bucket; ; i = eqs[i].next)
299*86d7f5d3SJohn Marino if (!i)
300*86d7f5d3SJohn Marino {
301*86d7f5d3SJohn Marino /* Create a new equivalence class in this bucket. */
302*86d7f5d3SJohn Marino i = eqs_index++;
303*86d7f5d3SJohn Marino if (i == eqs_alloc)
304*86d7f5d3SJohn Marino eqs = (struct equivclass *)
305*86d7f5d3SJohn Marino xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
306*86d7f5d3SJohn Marino eqs[i].next = *bucket;
307*86d7f5d3SJohn Marino eqs[i].hash = h;
308*86d7f5d3SJohn Marino eqs[i].line = ip;
309*86d7f5d3SJohn Marino eqs[i].length = length;
310*86d7f5d3SJohn Marino *bucket = i;
311*86d7f5d3SJohn Marino break;
312*86d7f5d3SJohn Marino }
313*86d7f5d3SJohn Marino else if (eqs[i].hash == h)
314*86d7f5d3SJohn Marino {
315*86d7f5d3SJohn Marino char const *eqline = eqs[i].line;
316*86d7f5d3SJohn Marino
317*86d7f5d3SJohn Marino /* Reuse existing equivalence class if the lines are identical.
318*86d7f5d3SJohn Marino This detects the common case of exact identity
319*86d7f5d3SJohn Marino faster than complete comparison would. */
320*86d7f5d3SJohn Marino if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
321*86d7f5d3SJohn Marino break;
322*86d7f5d3SJohn Marino
323*86d7f5d3SJohn Marino /* Reuse existing class if line_cmp reports the lines equal. */
324*86d7f5d3SJohn Marino if (use_line_cmp && line_cmp (eqline, ip) == 0)
325*86d7f5d3SJohn Marino break;
326*86d7f5d3SJohn Marino }
327*86d7f5d3SJohn Marino
328*86d7f5d3SJohn Marino /* Maybe increase the size of the line table. */
329*86d7f5d3SJohn Marino if (line == alloc_lines)
330*86d7f5d3SJohn Marino {
331*86d7f5d3SJohn Marino /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
332*86d7f5d3SJohn Marino alloc_lines = 2 * alloc_lines - linbuf_base;
333*86d7f5d3SJohn Marino cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
334*86d7f5d3SJohn Marino linbuf = (char const **) xrealloc (linbuf + linbuf_base,
335*86d7f5d3SJohn Marino (alloc_lines - linbuf_base)
336*86d7f5d3SJohn Marino * sizeof (*linbuf))
337*86d7f5d3SJohn Marino - linbuf_base;
338*86d7f5d3SJohn Marino }
339*86d7f5d3SJohn Marino linbuf[line] = ip;
340*86d7f5d3SJohn Marino cureqs[line] = i;
341*86d7f5d3SJohn Marino ++line;
342*86d7f5d3SJohn Marino }
343*86d7f5d3SJohn Marino
344*86d7f5d3SJohn Marino current->buffered_lines = line;
345*86d7f5d3SJohn Marino
346*86d7f5d3SJohn Marino for (i = 0; ; i++)
347*86d7f5d3SJohn Marino {
348*86d7f5d3SJohn Marino /* Record the line start for lines in the suffix that we care about.
349*86d7f5d3SJohn Marino Record one more line start than lines,
350*86d7f5d3SJohn Marino so that we can compute the length of any buffered line. */
351*86d7f5d3SJohn Marino if (line == alloc_lines)
352*86d7f5d3SJohn Marino {
353*86d7f5d3SJohn Marino /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
354*86d7f5d3SJohn Marino alloc_lines = 2 * alloc_lines - linbuf_base;
355*86d7f5d3SJohn Marino linbuf = (char const **) xrealloc (linbuf + linbuf_base,
356*86d7f5d3SJohn Marino (alloc_lines - linbuf_base)
357*86d7f5d3SJohn Marino * sizeof (*linbuf))
358*86d7f5d3SJohn Marino - linbuf_base;
359*86d7f5d3SJohn Marino }
360*86d7f5d3SJohn Marino linbuf[line] = (char const *) p;
361*86d7f5d3SJohn Marino
362*86d7f5d3SJohn Marino if ((char const *) p == bufend)
363*86d7f5d3SJohn Marino break;
364*86d7f5d3SJohn Marino
365*86d7f5d3SJohn Marino if (context <= i && no_diff_means_no_output)
366*86d7f5d3SJohn Marino break;
367*86d7f5d3SJohn Marino
368*86d7f5d3SJohn Marino line++;
369*86d7f5d3SJohn Marino
370*86d7f5d3SJohn Marino while (*p++ != '\n')
371*86d7f5d3SJohn Marino ;
372*86d7f5d3SJohn Marino }
373*86d7f5d3SJohn Marino
374*86d7f5d3SJohn Marino /* Done with cache in local variables. */
375*86d7f5d3SJohn Marino current->linbuf = linbuf;
376*86d7f5d3SJohn Marino current->valid_lines = line;
377*86d7f5d3SJohn Marino current->alloc_lines = alloc_lines;
378*86d7f5d3SJohn Marino current->equivs = cureqs;
379*86d7f5d3SJohn Marino equivs = eqs;
380*86d7f5d3SJohn Marino equivs_alloc = eqs_alloc;
381*86d7f5d3SJohn Marino equivs_index = eqs_index;
382*86d7f5d3SJohn Marino }
383*86d7f5d3SJohn Marino
384*86d7f5d3SJohn Marino /* Prepare the end of the text. Make sure it's initialized.
385*86d7f5d3SJohn Marino Make sure text ends in a newline,
386*86d7f5d3SJohn Marino but remember that we had to add one. */
387*86d7f5d3SJohn Marino
388*86d7f5d3SJohn Marino static void
prepare_text_end(current)389*86d7f5d3SJohn Marino prepare_text_end (current)
390*86d7f5d3SJohn Marino struct file_data *current;
391*86d7f5d3SJohn Marino {
392*86d7f5d3SJohn Marino size_t buffered_chars = current->buffered_chars;
393*86d7f5d3SJohn Marino char *p = current->buffer;
394*86d7f5d3SJohn Marino
395*86d7f5d3SJohn Marino if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
396*86d7f5d3SJohn Marino current->missing_newline = 0;
397*86d7f5d3SJohn Marino else
398*86d7f5d3SJohn Marino {
399*86d7f5d3SJohn Marino p[buffered_chars++] = '\n';
400*86d7f5d3SJohn Marino current->buffered_chars = buffered_chars;
401*86d7f5d3SJohn Marino current->missing_newline = 1;
402*86d7f5d3SJohn Marino }
403*86d7f5d3SJohn Marino
404*86d7f5d3SJohn Marino /* Don't use uninitialized storage when planting or using sentinels. */
405*86d7f5d3SJohn Marino if (p)
406*86d7f5d3SJohn Marino bzero (p + buffered_chars, sizeof (word));
407*86d7f5d3SJohn Marino }
408*86d7f5d3SJohn Marino
409*86d7f5d3SJohn Marino /* Given a vector of two file_data objects, find the identical
410*86d7f5d3SJohn Marino prefixes and suffixes of each object. */
411*86d7f5d3SJohn Marino
412*86d7f5d3SJohn Marino static void
find_identical_ends(filevec)413*86d7f5d3SJohn Marino find_identical_ends (filevec)
414*86d7f5d3SJohn Marino struct file_data filevec[];
415*86d7f5d3SJohn Marino {
416*86d7f5d3SJohn Marino word *w0, *w1;
417*86d7f5d3SJohn Marino char *p0, *p1, *buffer0, *buffer1;
418*86d7f5d3SJohn Marino char const *end0, *beg0;
419*86d7f5d3SJohn Marino char const **linbuf0, **linbuf1;
420*86d7f5d3SJohn Marino int i, lines;
421*86d7f5d3SJohn Marino size_t n0, n1, tem;
422*86d7f5d3SJohn Marino int alloc_lines0, alloc_lines1;
423*86d7f5d3SJohn Marino int buffered_prefix, prefix_count, prefix_mask;
424*86d7f5d3SJohn Marino
425*86d7f5d3SJohn Marino slurp (&filevec[0]);
426*86d7f5d3SJohn Marino if (filevec[0].desc != filevec[1].desc)
427*86d7f5d3SJohn Marino slurp (&filevec[1]);
428*86d7f5d3SJohn Marino else
429*86d7f5d3SJohn Marino {
430*86d7f5d3SJohn Marino filevec[1].buffer = filevec[0].buffer;
431*86d7f5d3SJohn Marino filevec[1].bufsize = filevec[0].bufsize;
432*86d7f5d3SJohn Marino filevec[1].buffered_chars = filevec[0].buffered_chars;
433*86d7f5d3SJohn Marino }
434*86d7f5d3SJohn Marino for (i = 0; i < 2; i++)
435*86d7f5d3SJohn Marino prepare_text_end (&filevec[i]);
436*86d7f5d3SJohn Marino
437*86d7f5d3SJohn Marino /* Find identical prefix. */
438*86d7f5d3SJohn Marino
439*86d7f5d3SJohn Marino p0 = buffer0 = filevec[0].buffer;
440*86d7f5d3SJohn Marino p1 = buffer1 = filevec[1].buffer;
441*86d7f5d3SJohn Marino
442*86d7f5d3SJohn Marino n0 = filevec[0].buffered_chars;
443*86d7f5d3SJohn Marino n1 = filevec[1].buffered_chars;
444*86d7f5d3SJohn Marino
445*86d7f5d3SJohn Marino if (p0 == p1)
446*86d7f5d3SJohn Marino /* The buffers are the same; sentinels won't work. */
447*86d7f5d3SJohn Marino p0 = p1 += n1;
448*86d7f5d3SJohn Marino else
449*86d7f5d3SJohn Marino {
450*86d7f5d3SJohn Marino /* Insert end sentinels, in this case characters that are guaranteed
451*86d7f5d3SJohn Marino to make the equality test false, and thus terminate the loop. */
452*86d7f5d3SJohn Marino
453*86d7f5d3SJohn Marino if (n0 < n1)
454*86d7f5d3SJohn Marino p0[n0] = ~p1[n0];
455*86d7f5d3SJohn Marino else
456*86d7f5d3SJohn Marino p1[n1] = ~p0[n1];
457*86d7f5d3SJohn Marino
458*86d7f5d3SJohn Marino /* Loop until first mismatch, or to the sentinel characters. */
459*86d7f5d3SJohn Marino
460*86d7f5d3SJohn Marino /* Compare a word at a time for speed. */
461*86d7f5d3SJohn Marino w0 = (word *) p0;
462*86d7f5d3SJohn Marino w1 = (word *) p1;
463*86d7f5d3SJohn Marino while (*w0++ == *w1++)
464*86d7f5d3SJohn Marino ;
465*86d7f5d3SJohn Marino --w0, --w1;
466*86d7f5d3SJohn Marino
467*86d7f5d3SJohn Marino /* Do the last few bytes of comparison a byte at a time. */
468*86d7f5d3SJohn Marino p0 = (char *) w0;
469*86d7f5d3SJohn Marino p1 = (char *) w1;
470*86d7f5d3SJohn Marino while (*p0++ == *p1++)
471*86d7f5d3SJohn Marino ;
472*86d7f5d3SJohn Marino --p0, --p1;
473*86d7f5d3SJohn Marino
474*86d7f5d3SJohn Marino /* Don't mistakenly count missing newline as part of prefix. */
475*86d7f5d3SJohn Marino if (ROBUST_OUTPUT_STYLE (output_style)
476*86d7f5d3SJohn Marino && (buffer0 + n0 - filevec[0].missing_newline < p0)
477*86d7f5d3SJohn Marino !=
478*86d7f5d3SJohn Marino (buffer1 + n1 - filevec[1].missing_newline < p1))
479*86d7f5d3SJohn Marino --p0, --p1;
480*86d7f5d3SJohn Marino }
481*86d7f5d3SJohn Marino
482*86d7f5d3SJohn Marino /* Now P0 and P1 point at the first nonmatching characters. */
483*86d7f5d3SJohn Marino
484*86d7f5d3SJohn Marino /* Skip back to last line-beginning in the prefix,
485*86d7f5d3SJohn Marino and then discard up to HORIZON_LINES lines from the prefix. */
486*86d7f5d3SJohn Marino i = horizon_lines;
487*86d7f5d3SJohn Marino while (p0 != buffer0 && (p0[-1] != '\n' || i--))
488*86d7f5d3SJohn Marino --p0, --p1;
489*86d7f5d3SJohn Marino
490*86d7f5d3SJohn Marino /* Record the prefix. */
491*86d7f5d3SJohn Marino filevec[0].prefix_end = p0;
492*86d7f5d3SJohn Marino filevec[1].prefix_end = p1;
493*86d7f5d3SJohn Marino
494*86d7f5d3SJohn Marino /* Find identical suffix. */
495*86d7f5d3SJohn Marino
496*86d7f5d3SJohn Marino /* P0 and P1 point beyond the last chars not yet compared. */
497*86d7f5d3SJohn Marino p0 = buffer0 + n0;
498*86d7f5d3SJohn Marino p1 = buffer1 + n1;
499*86d7f5d3SJohn Marino
500*86d7f5d3SJohn Marino if (! ROBUST_OUTPUT_STYLE (output_style)
501*86d7f5d3SJohn Marino || filevec[0].missing_newline == filevec[1].missing_newline)
502*86d7f5d3SJohn Marino {
503*86d7f5d3SJohn Marino end0 = p0; /* Addr of last char in file 0. */
504*86d7f5d3SJohn Marino
505*86d7f5d3SJohn Marino /* Get value of P0 at which we should stop scanning backward:
506*86d7f5d3SJohn Marino this is when either P0 or P1 points just past the last char
507*86d7f5d3SJohn Marino of the identical prefix. */
508*86d7f5d3SJohn Marino beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
509*86d7f5d3SJohn Marino
510*86d7f5d3SJohn Marino /* Scan back until chars don't match or we reach that point. */
511*86d7f5d3SJohn Marino for (; p0 != beg0; p0--, p1--)
512*86d7f5d3SJohn Marino if (*p0 != *p1)
513*86d7f5d3SJohn Marino {
514*86d7f5d3SJohn Marino /* Point at the first char of the matching suffix. */
515*86d7f5d3SJohn Marino beg0 = p0;
516*86d7f5d3SJohn Marino break;
517*86d7f5d3SJohn Marino }
518*86d7f5d3SJohn Marino
519*86d7f5d3SJohn Marino /* Are we at a line-beginning in both files? If not, add the rest of
520*86d7f5d3SJohn Marino this line to the main body. Discard up to HORIZON_LINES lines from
521*86d7f5d3SJohn Marino the identical suffix. Also, discard one extra line,
522*86d7f5d3SJohn Marino because shift_boundaries may need it. */
523*86d7f5d3SJohn Marino i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
524*86d7f5d3SJohn Marino &&
525*86d7f5d3SJohn Marino (buffer1 == p1 || p1[-1] == '\n'));
526*86d7f5d3SJohn Marino while (i-- && p0 != end0)
527*86d7f5d3SJohn Marino while (*p0++ != '\n')
528*86d7f5d3SJohn Marino ;
529*86d7f5d3SJohn Marino
530*86d7f5d3SJohn Marino p1 += p0 - beg0;
531*86d7f5d3SJohn Marino }
532*86d7f5d3SJohn Marino
533*86d7f5d3SJohn Marino /* Record the suffix. */
534*86d7f5d3SJohn Marino filevec[0].suffix_begin = p0;
535*86d7f5d3SJohn Marino filevec[1].suffix_begin = p1;
536*86d7f5d3SJohn Marino
537*86d7f5d3SJohn Marino /* Calculate number of lines of prefix to save.
538*86d7f5d3SJohn Marino
539*86d7f5d3SJohn Marino prefix_count == 0 means save the whole prefix;
540*86d7f5d3SJohn Marino we need this with for options like -D that output the whole file.
541*86d7f5d3SJohn Marino We also need it for options like -F that output some preceding line;
542*86d7f5d3SJohn Marino at least we will need to find the last few lines,
543*86d7f5d3SJohn Marino but since we don't know how many, it's easiest to find them all.
544*86d7f5d3SJohn Marino
545*86d7f5d3SJohn Marino Otherwise, prefix_count != 0. Save just prefix_count lines at start
546*86d7f5d3SJohn Marino of the line buffer; they'll be moved to the proper location later.
547*86d7f5d3SJohn Marino Handle 1 more line than the context says (because we count 1 too many),
548*86d7f5d3SJohn Marino rounded up to the next power of 2 to speed index computation. */
549*86d7f5d3SJohn Marino
550*86d7f5d3SJohn Marino if (no_diff_means_no_output && ! function_regexp_list)
551*86d7f5d3SJohn Marino {
552*86d7f5d3SJohn Marino for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2)
553*86d7f5d3SJohn Marino ;
554*86d7f5d3SJohn Marino prefix_mask = prefix_count - 1;
555*86d7f5d3SJohn Marino alloc_lines0
556*86d7f5d3SJohn Marino = prefix_count
557*86d7f5d3SJohn Marino + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
558*86d7f5d3SJohn Marino + context;
559*86d7f5d3SJohn Marino }
560*86d7f5d3SJohn Marino else
561*86d7f5d3SJohn Marino {
562*86d7f5d3SJohn Marino prefix_count = 0;
563*86d7f5d3SJohn Marino prefix_mask = ~0;
564*86d7f5d3SJohn Marino alloc_lines0 = GUESS_LINES (0, 0, n0);
565*86d7f5d3SJohn Marino }
566*86d7f5d3SJohn Marino
567*86d7f5d3SJohn Marino lines = 0;
568*86d7f5d3SJohn Marino linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
569*86d7f5d3SJohn Marino
570*86d7f5d3SJohn Marino /* If the prefix is needed, find the prefix lines. */
571*86d7f5d3SJohn Marino if (! (no_diff_means_no_output
572*86d7f5d3SJohn Marino && filevec[0].prefix_end == p0
573*86d7f5d3SJohn Marino && filevec[1].prefix_end == p1))
574*86d7f5d3SJohn Marino {
575*86d7f5d3SJohn Marino p0 = buffer0;
576*86d7f5d3SJohn Marino end0 = filevec[0].prefix_end;
577*86d7f5d3SJohn Marino while (p0 != end0)
578*86d7f5d3SJohn Marino {
579*86d7f5d3SJohn Marino int l = lines++ & prefix_mask;
580*86d7f5d3SJohn Marino if (l == alloc_lines0)
581*86d7f5d3SJohn Marino linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
582*86d7f5d3SJohn Marino * sizeof(*linbuf0));
583*86d7f5d3SJohn Marino linbuf0[l] = p0;
584*86d7f5d3SJohn Marino while (*p0++ != '\n')
585*86d7f5d3SJohn Marino ;
586*86d7f5d3SJohn Marino }
587*86d7f5d3SJohn Marino }
588*86d7f5d3SJohn Marino buffered_prefix = prefix_count && context < lines ? context : lines;
589*86d7f5d3SJohn Marino
590*86d7f5d3SJohn Marino /* Allocate line buffer 1. */
591*86d7f5d3SJohn Marino tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
592*86d7f5d3SJohn Marino
593*86d7f5d3SJohn Marino alloc_lines1
594*86d7f5d3SJohn Marino = (buffered_prefix
595*86d7f5d3SJohn Marino + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
596*86d7f5d3SJohn Marino + context);
597*86d7f5d3SJohn Marino linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
598*86d7f5d3SJohn Marino
599*86d7f5d3SJohn Marino if (buffered_prefix != lines)
600*86d7f5d3SJohn Marino {
601*86d7f5d3SJohn Marino /* Rotate prefix lines to proper location. */
602*86d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
603*86d7f5d3SJohn Marino linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
604*86d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
605*86d7f5d3SJohn Marino linbuf0[i] = linbuf1[i];
606*86d7f5d3SJohn Marino }
607*86d7f5d3SJohn Marino
608*86d7f5d3SJohn Marino /* Initialize line buffer 1 from line buffer 0. */
609*86d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
610*86d7f5d3SJohn Marino linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
611*86d7f5d3SJohn Marino
612*86d7f5d3SJohn Marino /* Record the line buffer, adjusted so that
613*86d7f5d3SJohn Marino linbuf*[0] points at the first differing line. */
614*86d7f5d3SJohn Marino filevec[0].linbuf = linbuf0 + buffered_prefix;
615*86d7f5d3SJohn Marino filevec[1].linbuf = linbuf1 + buffered_prefix;
616*86d7f5d3SJohn Marino filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
617*86d7f5d3SJohn Marino filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
618*86d7f5d3SJohn Marino filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
619*86d7f5d3SJohn Marino filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
620*86d7f5d3SJohn Marino }
621*86d7f5d3SJohn Marino
622*86d7f5d3SJohn Marino /* Largest primes less than some power of two, for nbuckets. Values range
623*86d7f5d3SJohn Marino from useful to preposterous. If one of these numbers isn't prime
624*86d7f5d3SJohn Marino after all, don't blame it on me, blame it on primes (6) . . . */
625*86d7f5d3SJohn Marino static int const primes[] =
626*86d7f5d3SJohn Marino {
627*86d7f5d3SJohn Marino 509,
628*86d7f5d3SJohn Marino 1021,
629*86d7f5d3SJohn Marino 2039,
630*86d7f5d3SJohn Marino 4093,
631*86d7f5d3SJohn Marino 8191,
632*86d7f5d3SJohn Marino 16381,
633*86d7f5d3SJohn Marino 32749,
634*86d7f5d3SJohn Marino #if 32767 < INT_MAX
635*86d7f5d3SJohn Marino 65521,
636*86d7f5d3SJohn Marino 131071,
637*86d7f5d3SJohn Marino 262139,
638*86d7f5d3SJohn Marino 524287,
639*86d7f5d3SJohn Marino 1048573,
640*86d7f5d3SJohn Marino 2097143,
641*86d7f5d3SJohn Marino 4194301,
642*86d7f5d3SJohn Marino 8388593,
643*86d7f5d3SJohn Marino 16777213,
644*86d7f5d3SJohn Marino 33554393,
645*86d7f5d3SJohn Marino 67108859, /* Preposterously large . . . */
646*86d7f5d3SJohn Marino 134217689,
647*86d7f5d3SJohn Marino 268435399,
648*86d7f5d3SJohn Marino 536870909,
649*86d7f5d3SJohn Marino 1073741789,
650*86d7f5d3SJohn Marino 2147483647,
651*86d7f5d3SJohn Marino #endif
652*86d7f5d3SJohn Marino 0
653*86d7f5d3SJohn Marino };
654*86d7f5d3SJohn Marino
655*86d7f5d3SJohn Marino /* Given a vector of two file_data objects, read the file associated
656*86d7f5d3SJohn Marino with each one, and build the table of equivalence classes.
657*86d7f5d3SJohn Marino Return 1 if either file appears to be a binary file.
658*86d7f5d3SJohn Marino If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
659*86d7f5d3SJohn Marino
660*86d7f5d3SJohn Marino int
read_files(filevec,pretend_binary)661*86d7f5d3SJohn Marino read_files (filevec, pretend_binary)
662*86d7f5d3SJohn Marino struct file_data filevec[];
663*86d7f5d3SJohn Marino int pretend_binary;
664*86d7f5d3SJohn Marino {
665*86d7f5d3SJohn Marino int i;
666*86d7f5d3SJohn Marino int skip_test = always_text_flag | pretend_binary;
667*86d7f5d3SJohn Marino int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
668*86d7f5d3SJohn Marino
669*86d7f5d3SJohn Marino if (filevec[0].desc != filevec[1].desc)
670*86d7f5d3SJohn Marino appears_binary |= sip (&filevec[1], skip_test | appears_binary);
671*86d7f5d3SJohn Marino else
672*86d7f5d3SJohn Marino {
673*86d7f5d3SJohn Marino filevec[1].buffer = filevec[0].buffer;
674*86d7f5d3SJohn Marino filevec[1].bufsize = filevec[0].bufsize;
675*86d7f5d3SJohn Marino filevec[1].buffered_chars = filevec[0].buffered_chars;
676*86d7f5d3SJohn Marino }
677*86d7f5d3SJohn Marino if (appears_binary)
678*86d7f5d3SJohn Marino {
679*86d7f5d3SJohn Marino #if HAVE_SETMODE
680*86d7f5d3SJohn Marino setmode (filevec[0].desc, O_BINARY);
681*86d7f5d3SJohn Marino setmode (filevec[1].desc, O_BINARY);
682*86d7f5d3SJohn Marino #endif
683*86d7f5d3SJohn Marino return 1;
684*86d7f5d3SJohn Marino }
685*86d7f5d3SJohn Marino
686*86d7f5d3SJohn Marino find_identical_ends (filevec);
687*86d7f5d3SJohn Marino
688*86d7f5d3SJohn Marino equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
689*86d7f5d3SJohn Marino equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
690*86d7f5d3SJohn Marino /* Equivalence class 0 is permanently safe for lines that were not
691*86d7f5d3SJohn Marino hashed. Real equivalence classes start at 1. */
692*86d7f5d3SJohn Marino equivs_index = 1;
693*86d7f5d3SJohn Marino
694*86d7f5d3SJohn Marino for (i = 0; primes[i] < equivs_alloc / 3; i++)
695*86d7f5d3SJohn Marino if (! primes[i])
696*86d7f5d3SJohn Marino abort ();
697*86d7f5d3SJohn Marino nbuckets = primes[i];
698*86d7f5d3SJohn Marino
699*86d7f5d3SJohn Marino buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
700*86d7f5d3SJohn Marino bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
701*86d7f5d3SJohn Marino
702*86d7f5d3SJohn Marino for (i = 0; i < 2; i++)
703*86d7f5d3SJohn Marino find_and_hash_each_line (&filevec[i]);
704*86d7f5d3SJohn Marino
705*86d7f5d3SJohn Marino filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
706*86d7f5d3SJohn Marino
707*86d7f5d3SJohn Marino free (equivs);
708*86d7f5d3SJohn Marino free (buckets - 1);
709*86d7f5d3SJohn Marino
710*86d7f5d3SJohn Marino return 0;
711*86d7f5d3SJohn Marino }
712