186d7f5d3SJohn Marino /* File I/O for GNU DIFF.
286d7f5d3SJohn Marino Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
386d7f5d3SJohn Marino
486d7f5d3SJohn Marino This file is part of GNU DIFF.
586d7f5d3SJohn Marino
686d7f5d3SJohn Marino GNU DIFF is free software; you can redistribute it and/or modify
786d7f5d3SJohn Marino it under the terms of the GNU General Public License as published by
886d7f5d3SJohn Marino the Free Software Foundation; either version 2, or (at your option)
986d7f5d3SJohn Marino any later version.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino GNU DIFF is distributed in the hope that it will be useful,
1286d7f5d3SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1386d7f5d3SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1486d7f5d3SJohn Marino GNU General Public License for more details.
1586d7f5d3SJohn Marino
1686d7f5d3SJohn Marino */
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino #include "diff.h"
1986d7f5d3SJohn Marino
2086d7f5d3SJohn Marino /* Rotate a value n bits to the left. */
2186d7f5d3SJohn Marino #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
2286d7f5d3SJohn Marino #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
2386d7f5d3SJohn Marino
2486d7f5d3SJohn Marino /* Given a hash value and a new character, return a new hash value. */
2586d7f5d3SJohn Marino #define HASH(h, c) ((c) + ROL (h, 7))
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino /* Guess remaining number of lines from number N of lines so far,
2886d7f5d3SJohn Marino size S so far, and total size T. */
2986d7f5d3SJohn Marino #define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
3086d7f5d3SJohn Marino
3186d7f5d3SJohn Marino /* Type used for fast prefix comparison in find_identical_ends. */
3286d7f5d3SJohn Marino #ifndef word
3386d7f5d3SJohn Marino #define word int
3486d7f5d3SJohn Marino #endif
3586d7f5d3SJohn Marino
3686d7f5d3SJohn Marino /* Lines are put into equivalence classes (of lines that match in line_cmp).
3786d7f5d3SJohn Marino Each equivalence class is represented by one of these structures,
3886d7f5d3SJohn Marino but only while the classes are being computed.
3986d7f5d3SJohn Marino Afterward, each class is represented by a number. */
4086d7f5d3SJohn Marino struct equivclass
4186d7f5d3SJohn Marino {
4286d7f5d3SJohn Marino int next; /* Next item in this bucket. */
4386d7f5d3SJohn Marino unsigned hash; /* Hash of lines in this class. */
4486d7f5d3SJohn Marino char const *line; /* A line that fits this class. */
4586d7f5d3SJohn Marino size_t length; /* That line's length, not counting its newline. */
4686d7f5d3SJohn Marino };
4786d7f5d3SJohn Marino
4886d7f5d3SJohn Marino /* Hash-table: array of buckets, each being a chain of equivalence classes.
4986d7f5d3SJohn Marino buckets[-1] is reserved for incomplete lines. */
5086d7f5d3SJohn Marino static int *buckets;
5186d7f5d3SJohn Marino
5286d7f5d3SJohn Marino /* Number of buckets in the hash table array, not counting buckets[-1]. */
5386d7f5d3SJohn Marino static int nbuckets;
5486d7f5d3SJohn Marino
5586d7f5d3SJohn Marino /* Array in which the equivalence classes are allocated.
5686d7f5d3SJohn Marino The bucket-chains go through the elements in this array.
5786d7f5d3SJohn Marino The number of an equivalence class is its index in this array. */
5886d7f5d3SJohn Marino static struct equivclass *equivs;
5986d7f5d3SJohn Marino
6086d7f5d3SJohn Marino /* Index of first free element in the array `equivs'. */
6186d7f5d3SJohn Marino static int equivs_index;
6286d7f5d3SJohn Marino
6386d7f5d3SJohn Marino /* Number of elements allocated in the array `equivs'. */
6486d7f5d3SJohn Marino static int equivs_alloc;
6586d7f5d3SJohn Marino
6686d7f5d3SJohn Marino static void find_and_hash_each_line PARAMS((struct file_data *));
6786d7f5d3SJohn Marino static void find_identical_ends PARAMS((struct file_data[]));
6886d7f5d3SJohn Marino static void prepare_text_end PARAMS((struct file_data *));
6986d7f5d3SJohn Marino
7086d7f5d3SJohn Marino /* Check for binary files and compare them for exact identity. */
7186d7f5d3SJohn Marino
7286d7f5d3SJohn Marino /* Return 1 if BUF contains a non text character.
7386d7f5d3SJohn Marino SIZE is the number of characters in BUF. */
7486d7f5d3SJohn Marino
7586d7f5d3SJohn Marino #define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
7686d7f5d3SJohn Marino
7786d7f5d3SJohn Marino /* Get ready to read the current file.
7886d7f5d3SJohn Marino Return nonzero if SKIP_TEST is zero,
7986d7f5d3SJohn Marino and if it appears to be a binary file. */
8086d7f5d3SJohn Marino
8186d7f5d3SJohn Marino int
sip(current,skip_test)8286d7f5d3SJohn Marino sip (current, skip_test)
8386d7f5d3SJohn Marino struct file_data *current;
8486d7f5d3SJohn Marino int skip_test;
8586d7f5d3SJohn Marino {
8686d7f5d3SJohn Marino /* If we have a nonexistent file at this stage, treat it as empty. */
8786d7f5d3SJohn Marino if (current->desc < 0)
8886d7f5d3SJohn Marino {
8986d7f5d3SJohn Marino /* Leave room for a sentinel. */
9086d7f5d3SJohn Marino current->bufsize = sizeof (word);
9186d7f5d3SJohn Marino current->buffer = xmalloc (current->bufsize);
9286d7f5d3SJohn Marino }
9386d7f5d3SJohn Marino else
9486d7f5d3SJohn Marino {
9586d7f5d3SJohn Marino current->bufsize = STAT_BLOCKSIZE (current->stat);
9686d7f5d3SJohn Marino current->buffer = xmalloc (current->bufsize);
9786d7f5d3SJohn Marino
9886d7f5d3SJohn Marino if (! skip_test)
9986d7f5d3SJohn Marino {
10086d7f5d3SJohn Marino /* Check first part of file to see if it's a binary file. */
10186d7f5d3SJohn Marino #if HAVE_SETMODE
10286d7f5d3SJohn Marino int oldmode = setmode (current->desc, O_BINARY);
10386d7f5d3SJohn Marino #endif
10486d7f5d3SJohn Marino ssize_t n = read (current->desc, current->buffer, current->bufsize);
10586d7f5d3SJohn Marino if (n == -1)
10686d7f5d3SJohn Marino pfatal_with_name (current->name);
10786d7f5d3SJohn Marino current->buffered_chars = n;
10886d7f5d3SJohn Marino #if HAVE_SETMODE
10986d7f5d3SJohn Marino if (oldmode != O_BINARY)
11086d7f5d3SJohn Marino {
11186d7f5d3SJohn Marino if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
11286d7f5d3SJohn Marino pfatal_with_name (current->name);
11386d7f5d3SJohn Marino setmode (current->desc, oldmode);
11486d7f5d3SJohn Marino current->buffered_chars = 0;
11586d7f5d3SJohn Marino }
11686d7f5d3SJohn Marino #endif
11786d7f5d3SJohn Marino return binary_file_p (current->buffer, n);
11886d7f5d3SJohn Marino }
11986d7f5d3SJohn Marino }
12086d7f5d3SJohn Marino
12186d7f5d3SJohn Marino current->buffered_chars = 0;
12286d7f5d3SJohn Marino return 0;
12386d7f5d3SJohn Marino }
12486d7f5d3SJohn Marino
12586d7f5d3SJohn Marino /* Slurp the rest of the current file completely into memory. */
12686d7f5d3SJohn Marino
12786d7f5d3SJohn Marino void
slurp(current)12886d7f5d3SJohn Marino slurp (current)
12986d7f5d3SJohn Marino struct file_data *current;
13086d7f5d3SJohn Marino {
13186d7f5d3SJohn Marino ssize_t cc;
13286d7f5d3SJohn Marino
13386d7f5d3SJohn Marino if (current->desc < 0)
13486d7f5d3SJohn Marino /* The file is nonexistent. */
13586d7f5d3SJohn Marino ;
13686d7f5d3SJohn Marino else if (S_ISREG (current->stat.st_mode))
13786d7f5d3SJohn Marino {
13886d7f5d3SJohn Marino /* It's a regular file; slurp in the rest all at once. */
13986d7f5d3SJohn Marino
14086d7f5d3SJohn Marino /* Get the size out of the stat block.
14186d7f5d3SJohn Marino Allocate enough room for appended newline and sentinel. */
14286d7f5d3SJohn Marino cc = current->stat.st_size + 1 + sizeof (word);
14386d7f5d3SJohn Marino if (current->bufsize < cc)
14486d7f5d3SJohn Marino {
14586d7f5d3SJohn Marino current->bufsize = cc;
14686d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, cc);
14786d7f5d3SJohn Marino }
14886d7f5d3SJohn Marino
14986d7f5d3SJohn Marino if (current->buffered_chars < current->stat.st_size)
15086d7f5d3SJohn Marino {
15186d7f5d3SJohn Marino cc = read (current->desc,
15286d7f5d3SJohn Marino current->buffer + current->buffered_chars,
15386d7f5d3SJohn Marino current->stat.st_size - current->buffered_chars);
15486d7f5d3SJohn Marino if (cc == -1)
15586d7f5d3SJohn Marino pfatal_with_name (current->name);
15686d7f5d3SJohn Marino current->buffered_chars += cc;
15786d7f5d3SJohn Marino }
15886d7f5d3SJohn Marino }
15986d7f5d3SJohn Marino /* It's not a regular file; read it, growing the buffer as needed. */
16086d7f5d3SJohn Marino else if (always_text_flag || current->buffered_chars != 0)
16186d7f5d3SJohn Marino {
16286d7f5d3SJohn Marino for (;;)
16386d7f5d3SJohn Marino {
16486d7f5d3SJohn Marino if (current->buffered_chars == current->bufsize)
16586d7f5d3SJohn Marino {
16686d7f5d3SJohn Marino current->bufsize = current->bufsize * 2;
16786d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, current->bufsize);
16886d7f5d3SJohn Marino }
16986d7f5d3SJohn Marino cc = read (current->desc,
17086d7f5d3SJohn Marino current->buffer + current->buffered_chars,
17186d7f5d3SJohn Marino current->bufsize - current->buffered_chars);
17286d7f5d3SJohn Marino if (cc == 0)
17386d7f5d3SJohn Marino break;
17486d7f5d3SJohn Marino if (cc == -1)
17586d7f5d3SJohn Marino pfatal_with_name (current->name);
17686d7f5d3SJohn Marino current->buffered_chars += cc;
17786d7f5d3SJohn Marino }
17886d7f5d3SJohn Marino /* Allocate just enough room for appended newline and sentinel. */
17986d7f5d3SJohn Marino current->bufsize = current->buffered_chars + 1 + sizeof (word);
18086d7f5d3SJohn Marino current->buffer = xrealloc (current->buffer, current->bufsize);
18186d7f5d3SJohn Marino }
18286d7f5d3SJohn Marino }
18386d7f5d3SJohn Marino
18486d7f5d3SJohn Marino /* Split the file into lines, simultaneously computing the equivalence class for
18586d7f5d3SJohn Marino each line. */
18686d7f5d3SJohn Marino
18786d7f5d3SJohn Marino static void
find_and_hash_each_line(current)18886d7f5d3SJohn Marino find_and_hash_each_line (current)
18986d7f5d3SJohn Marino struct file_data *current;
19086d7f5d3SJohn Marino {
19186d7f5d3SJohn Marino unsigned h;
19286d7f5d3SJohn Marino unsigned char const *p = (unsigned char const *) current->prefix_end;
19386d7f5d3SJohn Marino unsigned char c;
19486d7f5d3SJohn Marino int i, *bucket;
19586d7f5d3SJohn Marino size_t length;
19686d7f5d3SJohn Marino
19786d7f5d3SJohn Marino /* Cache often-used quantities in local variables to help the compiler. */
19886d7f5d3SJohn Marino char const **linbuf = current->linbuf;
19986d7f5d3SJohn Marino int alloc_lines = current->alloc_lines;
20086d7f5d3SJohn Marino int line = 0;
20186d7f5d3SJohn Marino int linbuf_base = current->linbuf_base;
20286d7f5d3SJohn Marino int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
20386d7f5d3SJohn Marino struct equivclass *eqs = equivs;
20486d7f5d3SJohn Marino int eqs_index = equivs_index;
20586d7f5d3SJohn Marino int eqs_alloc = equivs_alloc;
20686d7f5d3SJohn Marino char const *suffix_begin = current->suffix_begin;
20786d7f5d3SJohn Marino char const *bufend = current->buffer + current->buffered_chars;
20886d7f5d3SJohn Marino int use_line_cmp = ignore_some_line_changes;
20986d7f5d3SJohn Marino
21086d7f5d3SJohn Marino while ((char const *) p < suffix_begin)
21186d7f5d3SJohn Marino {
21286d7f5d3SJohn Marino char const *ip = (char const *) p;
21386d7f5d3SJohn Marino
21486d7f5d3SJohn Marino /* Compute the equivalence class for this line. */
21586d7f5d3SJohn Marino
21686d7f5d3SJohn Marino h = 0;
21786d7f5d3SJohn Marino
21886d7f5d3SJohn Marino /* Hash this line until we find a newline. */
21986d7f5d3SJohn Marino if (ignore_case_flag)
22086d7f5d3SJohn Marino {
22186d7f5d3SJohn Marino if (ignore_all_space_flag)
22286d7f5d3SJohn Marino while ((c = *p++) != '\n')
22386d7f5d3SJohn Marino {
22486d7f5d3SJohn Marino if (! ISSPACE (c))
22586d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
22686d7f5d3SJohn Marino }
22786d7f5d3SJohn Marino else if (ignore_space_change_flag)
22886d7f5d3SJohn Marino while ((c = *p++) != '\n')
22986d7f5d3SJohn Marino {
23086d7f5d3SJohn Marino if (ISSPACE (c))
23186d7f5d3SJohn Marino {
23286d7f5d3SJohn Marino for (;;)
23386d7f5d3SJohn Marino {
23486d7f5d3SJohn Marino c = *p++;
23586d7f5d3SJohn Marino if (!ISSPACE (c))
23686d7f5d3SJohn Marino break;
23786d7f5d3SJohn Marino if (c == '\n')
23886d7f5d3SJohn Marino goto hashing_done;
23986d7f5d3SJohn Marino }
24086d7f5d3SJohn Marino h = HASH (h, ' ');
24186d7f5d3SJohn Marino }
24286d7f5d3SJohn Marino /* C is now the first non-space. */
24386d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
24486d7f5d3SJohn Marino }
24586d7f5d3SJohn Marino else
24686d7f5d3SJohn Marino while ((c = *p++) != '\n')
24786d7f5d3SJohn Marino h = HASH (h, ISUPPER (c) ? tolower (c) : c);
24886d7f5d3SJohn Marino }
24986d7f5d3SJohn Marino else
25086d7f5d3SJohn Marino {
25186d7f5d3SJohn Marino if (ignore_all_space_flag)
25286d7f5d3SJohn Marino while ((c = *p++) != '\n')
25386d7f5d3SJohn Marino {
25486d7f5d3SJohn Marino if (! ISSPACE (c))
25586d7f5d3SJohn Marino h = HASH (h, c);
25686d7f5d3SJohn Marino }
25786d7f5d3SJohn Marino else if (ignore_space_change_flag)
25886d7f5d3SJohn Marino while ((c = *p++) != '\n')
25986d7f5d3SJohn Marino {
26086d7f5d3SJohn Marino if (ISSPACE (c))
26186d7f5d3SJohn Marino {
26286d7f5d3SJohn Marino for (;;)
26386d7f5d3SJohn Marino {
26486d7f5d3SJohn Marino c = *p++;
26586d7f5d3SJohn Marino if (!ISSPACE (c))
26686d7f5d3SJohn Marino break;
26786d7f5d3SJohn Marino if (c == '\n')
26886d7f5d3SJohn Marino goto hashing_done;
26986d7f5d3SJohn Marino }
27086d7f5d3SJohn Marino h = HASH (h, ' ');
27186d7f5d3SJohn Marino }
27286d7f5d3SJohn Marino /* C is now the first non-space. */
27386d7f5d3SJohn Marino h = HASH (h, c);
27486d7f5d3SJohn Marino }
27586d7f5d3SJohn Marino else
27686d7f5d3SJohn Marino while ((c = *p++) != '\n')
27786d7f5d3SJohn Marino h = HASH (h, c);
27886d7f5d3SJohn Marino }
27986d7f5d3SJohn Marino hashing_done:;
28086d7f5d3SJohn Marino
28186d7f5d3SJohn Marino bucket = &buckets[h % nbuckets];
28286d7f5d3SJohn Marino length = (char const *) p - ip - 1;
28386d7f5d3SJohn Marino
28486d7f5d3SJohn Marino if ((char const *) p == bufend
28586d7f5d3SJohn Marino && current->missing_newline
28686d7f5d3SJohn Marino && ROBUST_OUTPUT_STYLE (output_style))
28786d7f5d3SJohn Marino {
28886d7f5d3SJohn Marino /* This line is incomplete. If this is significant,
28986d7f5d3SJohn Marino put the line into bucket[-1]. */
29086d7f5d3SJohn Marino if (! (ignore_space_change_flag | ignore_all_space_flag))
29186d7f5d3SJohn Marino bucket = &buckets[-1];
29286d7f5d3SJohn Marino
29386d7f5d3SJohn Marino /* Omit the inserted newline when computing linbuf later. */
29486d7f5d3SJohn Marino p--;
29586d7f5d3SJohn Marino bufend = suffix_begin = (char const *) p;
29686d7f5d3SJohn Marino }
29786d7f5d3SJohn Marino
29886d7f5d3SJohn Marino for (i = *bucket; ; i = eqs[i].next)
29986d7f5d3SJohn Marino if (!i)
30086d7f5d3SJohn Marino {
30186d7f5d3SJohn Marino /* Create a new equivalence class in this bucket. */
30286d7f5d3SJohn Marino i = eqs_index++;
30386d7f5d3SJohn Marino if (i == eqs_alloc)
30486d7f5d3SJohn Marino eqs = (struct equivclass *)
30586d7f5d3SJohn Marino xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
30686d7f5d3SJohn Marino eqs[i].next = *bucket;
30786d7f5d3SJohn Marino eqs[i].hash = h;
30886d7f5d3SJohn Marino eqs[i].line = ip;
30986d7f5d3SJohn Marino eqs[i].length = length;
31086d7f5d3SJohn Marino *bucket = i;
31186d7f5d3SJohn Marino break;
31286d7f5d3SJohn Marino }
31386d7f5d3SJohn Marino else if (eqs[i].hash == h)
31486d7f5d3SJohn Marino {
31586d7f5d3SJohn Marino char const *eqline = eqs[i].line;
31686d7f5d3SJohn Marino
31786d7f5d3SJohn Marino /* Reuse existing equivalence class if the lines are identical.
31886d7f5d3SJohn Marino This detects the common case of exact identity
31986d7f5d3SJohn Marino faster than complete comparison would. */
32086d7f5d3SJohn Marino if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
32186d7f5d3SJohn Marino break;
32286d7f5d3SJohn Marino
32386d7f5d3SJohn Marino /* Reuse existing class if line_cmp reports the lines equal. */
32486d7f5d3SJohn Marino if (use_line_cmp && line_cmp (eqline, ip) == 0)
32586d7f5d3SJohn Marino break;
32686d7f5d3SJohn Marino }
32786d7f5d3SJohn Marino
32886d7f5d3SJohn Marino /* Maybe increase the size of the line table. */
32986d7f5d3SJohn Marino if (line == alloc_lines)
33086d7f5d3SJohn Marino {
33186d7f5d3SJohn Marino /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
33286d7f5d3SJohn Marino alloc_lines = 2 * alloc_lines - linbuf_base;
33386d7f5d3SJohn Marino cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
33486d7f5d3SJohn Marino linbuf = (char const **) xrealloc (linbuf + linbuf_base,
33586d7f5d3SJohn Marino (alloc_lines - linbuf_base)
33686d7f5d3SJohn Marino * sizeof (*linbuf))
33786d7f5d3SJohn Marino - linbuf_base;
33886d7f5d3SJohn Marino }
33986d7f5d3SJohn Marino linbuf[line] = ip;
34086d7f5d3SJohn Marino cureqs[line] = i;
34186d7f5d3SJohn Marino ++line;
34286d7f5d3SJohn Marino }
34386d7f5d3SJohn Marino
34486d7f5d3SJohn Marino current->buffered_lines = line;
34586d7f5d3SJohn Marino
34686d7f5d3SJohn Marino for (i = 0; ; i++)
34786d7f5d3SJohn Marino {
34886d7f5d3SJohn Marino /* Record the line start for lines in the suffix that we care about.
34986d7f5d3SJohn Marino Record one more line start than lines,
35086d7f5d3SJohn Marino so that we can compute the length of any buffered line. */
35186d7f5d3SJohn Marino if (line == alloc_lines)
35286d7f5d3SJohn Marino {
35386d7f5d3SJohn Marino /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
35486d7f5d3SJohn Marino alloc_lines = 2 * alloc_lines - linbuf_base;
35586d7f5d3SJohn Marino linbuf = (char const **) xrealloc (linbuf + linbuf_base,
35686d7f5d3SJohn Marino (alloc_lines - linbuf_base)
35786d7f5d3SJohn Marino * sizeof (*linbuf))
35886d7f5d3SJohn Marino - linbuf_base;
35986d7f5d3SJohn Marino }
36086d7f5d3SJohn Marino linbuf[line] = (char const *) p;
36186d7f5d3SJohn Marino
36286d7f5d3SJohn Marino if ((char const *) p == bufend)
36386d7f5d3SJohn Marino break;
36486d7f5d3SJohn Marino
36586d7f5d3SJohn Marino if (context <= i && no_diff_means_no_output)
36686d7f5d3SJohn Marino break;
36786d7f5d3SJohn Marino
36886d7f5d3SJohn Marino line++;
36986d7f5d3SJohn Marino
37086d7f5d3SJohn Marino while (*p++ != '\n')
37186d7f5d3SJohn Marino ;
37286d7f5d3SJohn Marino }
37386d7f5d3SJohn Marino
37486d7f5d3SJohn Marino /* Done with cache in local variables. */
37586d7f5d3SJohn Marino current->linbuf = linbuf;
37686d7f5d3SJohn Marino current->valid_lines = line;
37786d7f5d3SJohn Marino current->alloc_lines = alloc_lines;
37886d7f5d3SJohn Marino current->equivs = cureqs;
37986d7f5d3SJohn Marino equivs = eqs;
38086d7f5d3SJohn Marino equivs_alloc = eqs_alloc;
38186d7f5d3SJohn Marino equivs_index = eqs_index;
38286d7f5d3SJohn Marino }
38386d7f5d3SJohn Marino
38486d7f5d3SJohn Marino /* Prepare the end of the text. Make sure it's initialized.
38586d7f5d3SJohn Marino Make sure text ends in a newline,
38686d7f5d3SJohn Marino but remember that we had to add one. */
38786d7f5d3SJohn Marino
38886d7f5d3SJohn Marino static void
prepare_text_end(current)38986d7f5d3SJohn Marino prepare_text_end (current)
39086d7f5d3SJohn Marino struct file_data *current;
39186d7f5d3SJohn Marino {
39286d7f5d3SJohn Marino size_t buffered_chars = current->buffered_chars;
39386d7f5d3SJohn Marino char *p = current->buffer;
39486d7f5d3SJohn Marino
39586d7f5d3SJohn Marino if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
39686d7f5d3SJohn Marino current->missing_newline = 0;
39786d7f5d3SJohn Marino else
39886d7f5d3SJohn Marino {
39986d7f5d3SJohn Marino p[buffered_chars++] = '\n';
40086d7f5d3SJohn Marino current->buffered_chars = buffered_chars;
40186d7f5d3SJohn Marino current->missing_newline = 1;
40286d7f5d3SJohn Marino }
40386d7f5d3SJohn Marino
40486d7f5d3SJohn Marino /* Don't use uninitialized storage when planting or using sentinels. */
40586d7f5d3SJohn Marino if (p)
40686d7f5d3SJohn Marino bzero (p + buffered_chars, sizeof (word));
40786d7f5d3SJohn Marino }
40886d7f5d3SJohn Marino
40986d7f5d3SJohn Marino /* Given a vector of two file_data objects, find the identical
41086d7f5d3SJohn Marino prefixes and suffixes of each object. */
41186d7f5d3SJohn Marino
41286d7f5d3SJohn Marino static void
find_identical_ends(filevec)41386d7f5d3SJohn Marino find_identical_ends (filevec)
41486d7f5d3SJohn Marino struct file_data filevec[];
41586d7f5d3SJohn Marino {
41686d7f5d3SJohn Marino word *w0, *w1;
41786d7f5d3SJohn Marino char *p0, *p1, *buffer0, *buffer1;
41886d7f5d3SJohn Marino char const *end0, *beg0;
41986d7f5d3SJohn Marino char const **linbuf0, **linbuf1;
42086d7f5d3SJohn Marino int i, lines;
42186d7f5d3SJohn Marino size_t n0, n1, tem;
42286d7f5d3SJohn Marino int alloc_lines0, alloc_lines1;
42386d7f5d3SJohn Marino int buffered_prefix, prefix_count, prefix_mask;
42486d7f5d3SJohn Marino
42586d7f5d3SJohn Marino slurp (&filevec[0]);
42686d7f5d3SJohn Marino if (filevec[0].desc != filevec[1].desc)
42786d7f5d3SJohn Marino slurp (&filevec[1]);
42886d7f5d3SJohn Marino else
42986d7f5d3SJohn Marino {
43086d7f5d3SJohn Marino filevec[1].buffer = filevec[0].buffer;
43186d7f5d3SJohn Marino filevec[1].bufsize = filevec[0].bufsize;
43286d7f5d3SJohn Marino filevec[1].buffered_chars = filevec[0].buffered_chars;
43386d7f5d3SJohn Marino }
43486d7f5d3SJohn Marino for (i = 0; i < 2; i++)
43586d7f5d3SJohn Marino prepare_text_end (&filevec[i]);
43686d7f5d3SJohn Marino
43786d7f5d3SJohn Marino /* Find identical prefix. */
43886d7f5d3SJohn Marino
43986d7f5d3SJohn Marino p0 = buffer0 = filevec[0].buffer;
44086d7f5d3SJohn Marino p1 = buffer1 = filevec[1].buffer;
44186d7f5d3SJohn Marino
44286d7f5d3SJohn Marino n0 = filevec[0].buffered_chars;
44386d7f5d3SJohn Marino n1 = filevec[1].buffered_chars;
44486d7f5d3SJohn Marino
44586d7f5d3SJohn Marino if (p0 == p1)
44686d7f5d3SJohn Marino /* The buffers are the same; sentinels won't work. */
44786d7f5d3SJohn Marino p0 = p1 += n1;
44886d7f5d3SJohn Marino else
44986d7f5d3SJohn Marino {
45086d7f5d3SJohn Marino /* Insert end sentinels, in this case characters that are guaranteed
45186d7f5d3SJohn Marino to make the equality test false, and thus terminate the loop. */
45286d7f5d3SJohn Marino
45386d7f5d3SJohn Marino if (n0 < n1)
45486d7f5d3SJohn Marino p0[n0] = ~p1[n0];
45586d7f5d3SJohn Marino else
45686d7f5d3SJohn Marino p1[n1] = ~p0[n1];
45786d7f5d3SJohn Marino
45886d7f5d3SJohn Marino /* Loop until first mismatch, or to the sentinel characters. */
45986d7f5d3SJohn Marino
46086d7f5d3SJohn Marino /* Compare a word at a time for speed. */
46186d7f5d3SJohn Marino w0 = (word *) p0;
46286d7f5d3SJohn Marino w1 = (word *) p1;
46386d7f5d3SJohn Marino while (*w0++ == *w1++)
46486d7f5d3SJohn Marino ;
46586d7f5d3SJohn Marino --w0, --w1;
46686d7f5d3SJohn Marino
46786d7f5d3SJohn Marino /* Do the last few bytes of comparison a byte at a time. */
46886d7f5d3SJohn Marino p0 = (char *) w0;
46986d7f5d3SJohn Marino p1 = (char *) w1;
47086d7f5d3SJohn Marino while (*p0++ == *p1++)
47186d7f5d3SJohn Marino ;
47286d7f5d3SJohn Marino --p0, --p1;
47386d7f5d3SJohn Marino
47486d7f5d3SJohn Marino /* Don't mistakenly count missing newline as part of prefix. */
47586d7f5d3SJohn Marino if (ROBUST_OUTPUT_STYLE (output_style)
47686d7f5d3SJohn Marino && (buffer0 + n0 - filevec[0].missing_newline < p0)
47786d7f5d3SJohn Marino !=
47886d7f5d3SJohn Marino (buffer1 + n1 - filevec[1].missing_newline < p1))
47986d7f5d3SJohn Marino --p0, --p1;
48086d7f5d3SJohn Marino }
48186d7f5d3SJohn Marino
48286d7f5d3SJohn Marino /* Now P0 and P1 point at the first nonmatching characters. */
48386d7f5d3SJohn Marino
48486d7f5d3SJohn Marino /* Skip back to last line-beginning in the prefix,
48586d7f5d3SJohn Marino and then discard up to HORIZON_LINES lines from the prefix. */
48686d7f5d3SJohn Marino i = horizon_lines;
48786d7f5d3SJohn Marino while (p0 != buffer0 && (p0[-1] != '\n' || i--))
48886d7f5d3SJohn Marino --p0, --p1;
48986d7f5d3SJohn Marino
49086d7f5d3SJohn Marino /* Record the prefix. */
49186d7f5d3SJohn Marino filevec[0].prefix_end = p0;
49286d7f5d3SJohn Marino filevec[1].prefix_end = p1;
49386d7f5d3SJohn Marino
49486d7f5d3SJohn Marino /* Find identical suffix. */
49586d7f5d3SJohn Marino
49686d7f5d3SJohn Marino /* P0 and P1 point beyond the last chars not yet compared. */
49786d7f5d3SJohn Marino p0 = buffer0 + n0;
49886d7f5d3SJohn Marino p1 = buffer1 + n1;
49986d7f5d3SJohn Marino
50086d7f5d3SJohn Marino if (! ROBUST_OUTPUT_STYLE (output_style)
50186d7f5d3SJohn Marino || filevec[0].missing_newline == filevec[1].missing_newline)
50286d7f5d3SJohn Marino {
50386d7f5d3SJohn Marino end0 = p0; /* Addr of last char in file 0. */
50486d7f5d3SJohn Marino
50586d7f5d3SJohn Marino /* Get value of P0 at which we should stop scanning backward:
50686d7f5d3SJohn Marino this is when either P0 or P1 points just past the last char
50786d7f5d3SJohn Marino of the identical prefix. */
50886d7f5d3SJohn Marino beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
50986d7f5d3SJohn Marino
51086d7f5d3SJohn Marino /* Scan back until chars don't match or we reach that point. */
51186d7f5d3SJohn Marino for (; p0 != beg0; p0--, p1--)
51286d7f5d3SJohn Marino if (*p0 != *p1)
51386d7f5d3SJohn Marino {
51486d7f5d3SJohn Marino /* Point at the first char of the matching suffix. */
51586d7f5d3SJohn Marino beg0 = p0;
51686d7f5d3SJohn Marino break;
51786d7f5d3SJohn Marino }
51886d7f5d3SJohn Marino
51986d7f5d3SJohn Marino /* Are we at a line-beginning in both files? If not, add the rest of
52086d7f5d3SJohn Marino this line to the main body. Discard up to HORIZON_LINES lines from
52186d7f5d3SJohn Marino the identical suffix. Also, discard one extra line,
52286d7f5d3SJohn Marino because shift_boundaries may need it. */
52386d7f5d3SJohn Marino i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
52486d7f5d3SJohn Marino &&
52586d7f5d3SJohn Marino (buffer1 == p1 || p1[-1] == '\n'));
52686d7f5d3SJohn Marino while (i-- && p0 != end0)
52786d7f5d3SJohn Marino while (*p0++ != '\n')
52886d7f5d3SJohn Marino ;
52986d7f5d3SJohn Marino
53086d7f5d3SJohn Marino p1 += p0 - beg0;
53186d7f5d3SJohn Marino }
53286d7f5d3SJohn Marino
53386d7f5d3SJohn Marino /* Record the suffix. */
53486d7f5d3SJohn Marino filevec[0].suffix_begin = p0;
53586d7f5d3SJohn Marino filevec[1].suffix_begin = p1;
53686d7f5d3SJohn Marino
53786d7f5d3SJohn Marino /* Calculate number of lines of prefix to save.
53886d7f5d3SJohn Marino
53986d7f5d3SJohn Marino prefix_count == 0 means save the whole prefix;
54086d7f5d3SJohn Marino we need this with for options like -D that output the whole file.
54186d7f5d3SJohn Marino We also need it for options like -F that output some preceding line;
54286d7f5d3SJohn Marino at least we will need to find the last few lines,
54386d7f5d3SJohn Marino but since we don't know how many, it's easiest to find them all.
54486d7f5d3SJohn Marino
54586d7f5d3SJohn Marino Otherwise, prefix_count != 0. Save just prefix_count lines at start
54686d7f5d3SJohn Marino of the line buffer; they'll be moved to the proper location later.
54786d7f5d3SJohn Marino Handle 1 more line than the context says (because we count 1 too many),
54886d7f5d3SJohn Marino rounded up to the next power of 2 to speed index computation. */
54986d7f5d3SJohn Marino
55086d7f5d3SJohn Marino if (no_diff_means_no_output && ! function_regexp_list)
55186d7f5d3SJohn Marino {
55286d7f5d3SJohn Marino for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2)
55386d7f5d3SJohn Marino ;
55486d7f5d3SJohn Marino prefix_mask = prefix_count - 1;
55586d7f5d3SJohn Marino alloc_lines0
55686d7f5d3SJohn Marino = prefix_count
55786d7f5d3SJohn Marino + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
55886d7f5d3SJohn Marino + context;
55986d7f5d3SJohn Marino }
56086d7f5d3SJohn Marino else
56186d7f5d3SJohn Marino {
56286d7f5d3SJohn Marino prefix_count = 0;
56386d7f5d3SJohn Marino prefix_mask = ~0;
56486d7f5d3SJohn Marino alloc_lines0 = GUESS_LINES (0, 0, n0);
56586d7f5d3SJohn Marino }
56686d7f5d3SJohn Marino
56786d7f5d3SJohn Marino lines = 0;
56886d7f5d3SJohn Marino linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
56986d7f5d3SJohn Marino
57086d7f5d3SJohn Marino /* If the prefix is needed, find the prefix lines. */
57186d7f5d3SJohn Marino if (! (no_diff_means_no_output
57286d7f5d3SJohn Marino && filevec[0].prefix_end == p0
57386d7f5d3SJohn Marino && filevec[1].prefix_end == p1))
57486d7f5d3SJohn Marino {
57586d7f5d3SJohn Marino p0 = buffer0;
57686d7f5d3SJohn Marino end0 = filevec[0].prefix_end;
57786d7f5d3SJohn Marino while (p0 != end0)
57886d7f5d3SJohn Marino {
57986d7f5d3SJohn Marino int l = lines++ & prefix_mask;
58086d7f5d3SJohn Marino if (l == alloc_lines0)
58186d7f5d3SJohn Marino linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
58286d7f5d3SJohn Marino * sizeof(*linbuf0));
58386d7f5d3SJohn Marino linbuf0[l] = p0;
58486d7f5d3SJohn Marino while (*p0++ != '\n')
58586d7f5d3SJohn Marino ;
58686d7f5d3SJohn Marino }
58786d7f5d3SJohn Marino }
58886d7f5d3SJohn Marino buffered_prefix = prefix_count && context < lines ? context : lines;
58986d7f5d3SJohn Marino
59086d7f5d3SJohn Marino /* Allocate line buffer 1. */
59186d7f5d3SJohn Marino tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
59286d7f5d3SJohn Marino
59386d7f5d3SJohn Marino alloc_lines1
59486d7f5d3SJohn Marino = (buffered_prefix
59586d7f5d3SJohn Marino + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
59686d7f5d3SJohn Marino + context);
59786d7f5d3SJohn Marino linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
59886d7f5d3SJohn Marino
59986d7f5d3SJohn Marino if (buffered_prefix != lines)
60086d7f5d3SJohn Marino {
60186d7f5d3SJohn Marino /* Rotate prefix lines to proper location. */
60286d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
60386d7f5d3SJohn Marino linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
60486d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
60586d7f5d3SJohn Marino linbuf0[i] = linbuf1[i];
60686d7f5d3SJohn Marino }
60786d7f5d3SJohn Marino
60886d7f5d3SJohn Marino /* Initialize line buffer 1 from line buffer 0. */
60986d7f5d3SJohn Marino for (i = 0; i < buffered_prefix; i++)
61086d7f5d3SJohn Marino linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
61186d7f5d3SJohn Marino
61286d7f5d3SJohn Marino /* Record the line buffer, adjusted so that
61386d7f5d3SJohn Marino linbuf*[0] points at the first differing line. */
61486d7f5d3SJohn Marino filevec[0].linbuf = linbuf0 + buffered_prefix;
61586d7f5d3SJohn Marino filevec[1].linbuf = linbuf1 + buffered_prefix;
61686d7f5d3SJohn Marino filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
61786d7f5d3SJohn Marino filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
61886d7f5d3SJohn Marino filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
61986d7f5d3SJohn Marino filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
62086d7f5d3SJohn Marino }
62186d7f5d3SJohn Marino
62286d7f5d3SJohn Marino /* Largest primes less than some power of two, for nbuckets. Values range
62386d7f5d3SJohn Marino from useful to preposterous. If one of these numbers isn't prime
62486d7f5d3SJohn Marino after all, don't blame it on me, blame it on primes (6) . . . */
62586d7f5d3SJohn Marino static int const primes[] =
62686d7f5d3SJohn Marino {
62786d7f5d3SJohn Marino 509,
62886d7f5d3SJohn Marino 1021,
62986d7f5d3SJohn Marino 2039,
63086d7f5d3SJohn Marino 4093,
63186d7f5d3SJohn Marino 8191,
63286d7f5d3SJohn Marino 16381,
63386d7f5d3SJohn Marino 32749,
63486d7f5d3SJohn Marino #if 32767 < INT_MAX
63586d7f5d3SJohn Marino 65521,
63686d7f5d3SJohn Marino 131071,
63786d7f5d3SJohn Marino 262139,
63886d7f5d3SJohn Marino 524287,
63986d7f5d3SJohn Marino 1048573,
64086d7f5d3SJohn Marino 2097143,
64186d7f5d3SJohn Marino 4194301,
64286d7f5d3SJohn Marino 8388593,
64386d7f5d3SJohn Marino 16777213,
64486d7f5d3SJohn Marino 33554393,
64586d7f5d3SJohn Marino 67108859, /* Preposterously large . . . */
64686d7f5d3SJohn Marino 134217689,
64786d7f5d3SJohn Marino 268435399,
64886d7f5d3SJohn Marino 536870909,
64986d7f5d3SJohn Marino 1073741789,
65086d7f5d3SJohn Marino 2147483647,
65186d7f5d3SJohn Marino #endif
65286d7f5d3SJohn Marino 0
65386d7f5d3SJohn Marino };
65486d7f5d3SJohn Marino
65586d7f5d3SJohn Marino /* Given a vector of two file_data objects, read the file associated
65686d7f5d3SJohn Marino with each one, and build the table of equivalence classes.
65786d7f5d3SJohn Marino Return 1 if either file appears to be a binary file.
65886d7f5d3SJohn Marino If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
65986d7f5d3SJohn Marino
66086d7f5d3SJohn Marino int
read_files(filevec,pretend_binary)66186d7f5d3SJohn Marino read_files (filevec, pretend_binary)
66286d7f5d3SJohn Marino struct file_data filevec[];
66386d7f5d3SJohn Marino int pretend_binary;
66486d7f5d3SJohn Marino {
66586d7f5d3SJohn Marino int i;
66686d7f5d3SJohn Marino int skip_test = always_text_flag | pretend_binary;
66786d7f5d3SJohn Marino int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
66886d7f5d3SJohn Marino
66986d7f5d3SJohn Marino if (filevec[0].desc != filevec[1].desc)
67086d7f5d3SJohn Marino appears_binary |= sip (&filevec[1], skip_test | appears_binary);
67186d7f5d3SJohn Marino else
67286d7f5d3SJohn Marino {
67386d7f5d3SJohn Marino filevec[1].buffer = filevec[0].buffer;
67486d7f5d3SJohn Marino filevec[1].bufsize = filevec[0].bufsize;
67586d7f5d3SJohn Marino filevec[1].buffered_chars = filevec[0].buffered_chars;
67686d7f5d3SJohn Marino }
67786d7f5d3SJohn Marino if (appears_binary)
67886d7f5d3SJohn Marino {
67986d7f5d3SJohn Marino #if HAVE_SETMODE
68086d7f5d3SJohn Marino setmode (filevec[0].desc, O_BINARY);
68186d7f5d3SJohn Marino setmode (filevec[1].desc, O_BINARY);
68286d7f5d3SJohn Marino #endif
68386d7f5d3SJohn Marino return 1;
68486d7f5d3SJohn Marino }
68586d7f5d3SJohn Marino
68686d7f5d3SJohn Marino find_identical_ends (filevec);
68786d7f5d3SJohn Marino
68886d7f5d3SJohn Marino equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
68986d7f5d3SJohn Marino equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
69086d7f5d3SJohn Marino /* Equivalence class 0 is permanently safe for lines that were not
69186d7f5d3SJohn Marino hashed. Real equivalence classes start at 1. */
69286d7f5d3SJohn Marino equivs_index = 1;
69386d7f5d3SJohn Marino
69486d7f5d3SJohn Marino for (i = 0; primes[i] < equivs_alloc / 3; i++)
69586d7f5d3SJohn Marino if (! primes[i])
69686d7f5d3SJohn Marino abort ();
69786d7f5d3SJohn Marino nbuckets = primes[i];
69886d7f5d3SJohn Marino
69986d7f5d3SJohn Marino buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
70086d7f5d3SJohn Marino bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
70186d7f5d3SJohn Marino
70286d7f5d3SJohn Marino for (i = 0; i < 2; i++)
70386d7f5d3SJohn Marino find_and_hash_each_line (&filevec[i]);
70486d7f5d3SJohn Marino
70586d7f5d3SJohn Marino filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
70686d7f5d3SJohn Marino
70786d7f5d3SJohn Marino free (equivs);
70886d7f5d3SJohn Marino free (buckets - 1);
70986d7f5d3SJohn Marino
71086d7f5d3SJohn Marino return 0;
71186d7f5d3SJohn Marino }
712