xref: /netbsd-src/external/gpl2/diffutils/dist/src/analyze.c (revision 75f6d617e282811cb173c2ccfbf5df0dd71f7045)
1*75f6d617Schristos /*	$NetBSD: analyze.c,v 1.1.1.1 2016/01/13 03:15:30 christos Exp $	*/
2*75f6d617Schristos 
3*75f6d617Schristos /* Analyze file differences for GNU DIFF.
4*75f6d617Schristos 
5*75f6d617Schristos    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
6*75f6d617Schristos    Free Software Foundation, Inc.
7*75f6d617Schristos 
8*75f6d617Schristos    This file is part of GNU DIFF.
9*75f6d617Schristos 
10*75f6d617Schristos    GNU DIFF is free software; you can redistribute it and/or modify
11*75f6d617Schristos    it under the terms of the GNU General Public License as published by
12*75f6d617Schristos    the Free Software Foundation; either version 2, or (at your option)
13*75f6d617Schristos    any later version.
14*75f6d617Schristos 
15*75f6d617Schristos    GNU DIFF is distributed in the hope that it will be useful,
16*75f6d617Schristos    but WITHOUT ANY WARRANTY; without even the implied warranty of
17*75f6d617Schristos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*75f6d617Schristos    GNU General Public License for more details.
19*75f6d617Schristos 
20*75f6d617Schristos    You should have received a copy of the GNU General Public License
21*75f6d617Schristos    along with this program; see the file COPYING.
22*75f6d617Schristos    If not, write to the Free Software Foundation,
23*75f6d617Schristos    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
24*75f6d617Schristos 
25*75f6d617Schristos /* The basic algorithm is described in:
26*75f6d617Schristos    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
27*75f6d617Schristos    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
28*75f6d617Schristos    see especially section 4.2, which describes the variation used below.
29*75f6d617Schristos    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
30*75f6d617Schristos    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
31*75f6d617Schristos    at the price of producing suboptimal output for large inputs with
32*75f6d617Schristos    many differences.
33*75f6d617Schristos 
34*75f6d617Schristos    The basic algorithm was independently discovered as described in:
35*75f6d617Schristos    "Algorithms for Approximate String Matching", E. Ukkonen,
36*75f6d617Schristos    Information and Control Vol. 64, 1985, pp. 100-118.  */
37*75f6d617Schristos 
38*75f6d617Schristos #include "diff.h"
39*75f6d617Schristos #include <cmpbuf.h>
40*75f6d617Schristos #include <error.h>
41*75f6d617Schristos #include <regex.h>
42*75f6d617Schristos #include <xalloc.h>
43*75f6d617Schristos 
44*75f6d617Schristos static lin *xvec, *yvec;	/* Vectors being compared. */
45*75f6d617Schristos static lin *fdiag;		/* Vector, indexed by diagonal, containing
46*75f6d617Schristos 				   1 + the X coordinate of the point furthest
47*75f6d617Schristos 				   along the given diagonal in the forward
48*75f6d617Schristos 				   search of the edit matrix. */
49*75f6d617Schristos static lin *bdiag;		/* Vector, indexed by diagonal, containing
50*75f6d617Schristos 				   the X coordinate of the point furthest
51*75f6d617Schristos 				   along the given diagonal in the backward
52*75f6d617Schristos 				   search of the edit matrix. */
53*75f6d617Schristos static lin too_expensive;	/* Edit scripts longer than this are too
54*75f6d617Schristos 				   expensive to compute.  */
55*75f6d617Schristos 
56*75f6d617Schristos #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
57*75f6d617Schristos 
58*75f6d617Schristos struct partition
59*75f6d617Schristos {
60*75f6d617Schristos   lin xmid, ymid;	/* Midpoints of this partition.  */
61*75f6d617Schristos   bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
62*75f6d617Schristos   bool hi_minimal;	/* Likewise for high half.  */
63*75f6d617Schristos };
64*75f6d617Schristos 
65*75f6d617Schristos /* Find the midpoint of the shortest edit script for a specified
66*75f6d617Schristos    portion of the two files.
67*75f6d617Schristos 
68*75f6d617Schristos    Scan from the beginnings of the files, and simultaneously from the ends,
69*75f6d617Schristos    doing a breadth-first search through the space of edit-sequence.
70*75f6d617Schristos    When the two searches meet, we have found the midpoint of the shortest
71*75f6d617Schristos    edit sequence.
72*75f6d617Schristos 
73*75f6d617Schristos    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
74*75f6d617Schristos    of expense.  Otherwise, if the search is too expensive, use
75*75f6d617Schristos    heuristics to stop the search and report a suboptimal answer.
76*75f6d617Schristos 
77*75f6d617Schristos    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
78*75f6d617Schristos    XMID - YMID equals the number of inserted lines minus the number
79*75f6d617Schristos    of deleted lines (counting only lines before the midpoint).
80*75f6d617Schristos    Return the approximate edit cost; this is the total number of
81*75f6d617Schristos    lines inserted or deleted (counting only lines before the midpoint),
82*75f6d617Schristos    unless a heuristic is used to terminate the search prematurely.
83*75f6d617Schristos 
84*75f6d617Schristos    Set PART->lo_minimal to true iff the minimal edit script for the
85*75f6d617Schristos    left half of the partition is known; similarly for PART->hi_minimal.
86*75f6d617Schristos 
87*75f6d617Schristos    This function assumes that the first lines of the specified portions
88*75f6d617Schristos    of the two files do not match, and likewise that the last lines do not
89*75f6d617Schristos    match.  The caller must trim matching lines from the beginning and end
90*75f6d617Schristos    of the portions it is going to specify.
91*75f6d617Schristos 
92*75f6d617Schristos    If we return the "wrong" partitions,
93*75f6d617Schristos    the worst this can do is cause suboptimal diff output.
94*75f6d617Schristos    It cannot cause incorrect diff output.  */
95*75f6d617Schristos 
96*75f6d617Schristos static lin
diag(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal,struct partition * part)97*75f6d617Schristos diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
98*75f6d617Schristos       struct partition *part)
99*75f6d617Schristos {
100*75f6d617Schristos   lin *const fd = fdiag;	/* Give the compiler a chance. */
101*75f6d617Schristos   lin *const bd = bdiag;	/* Additional help for the compiler. */
102*75f6d617Schristos   lin const *const xv = xvec;	/* Still more help for the compiler. */
103*75f6d617Schristos   lin const *const yv = yvec;	/* And more and more . . . */
104*75f6d617Schristos   lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
105*75f6d617Schristos   lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
106*75f6d617Schristos   lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
107*75f6d617Schristos   lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
108*75f6d617Schristos   lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
109*75f6d617Schristos   lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
110*75f6d617Schristos   lin c;			/* Cost. */
111*75f6d617Schristos   bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
112*75f6d617Schristos 				   diagonal with respect to the northwest. */
113*75f6d617Schristos 
114*75f6d617Schristos   fd[fmid] = xoff;
115*75f6d617Schristos   bd[bmid] = xlim;
116*75f6d617Schristos 
117*75f6d617Schristos   for (c = 1;; ++c)
118*75f6d617Schristos     {
119*75f6d617Schristos       lin d;			/* Active diagonal. */
120*75f6d617Schristos       bool big_snake = 0;
121*75f6d617Schristos 
122*75f6d617Schristos       /* Extend the top-down search by an edit step in each diagonal. */
123*75f6d617Schristos       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
124*75f6d617Schristos       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
125*75f6d617Schristos       for (d = fmax; d >= fmin; d -= 2)
126*75f6d617Schristos 	{
127*75f6d617Schristos 	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
128*75f6d617Schristos 
129*75f6d617Schristos 	  if (tlo >= thi)
130*75f6d617Schristos 	    x = tlo + 1;
131*75f6d617Schristos 	  else
132*75f6d617Schristos 	    x = thi;
133*75f6d617Schristos 	  oldx = x;
134*75f6d617Schristos 	  y = x - d;
135*75f6d617Schristos 	  while (x < xlim && y < ylim && xv[x] == yv[y])
136*75f6d617Schristos 	    ++x, ++y;
137*75f6d617Schristos 	  if (x - oldx > SNAKE_LIMIT)
138*75f6d617Schristos 	    big_snake = 1;
139*75f6d617Schristos 	  fd[d] = x;
140*75f6d617Schristos 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
141*75f6d617Schristos 	    {
142*75f6d617Schristos 	      part->xmid = x;
143*75f6d617Schristos 	      part->ymid = y;
144*75f6d617Schristos 	      part->lo_minimal = part->hi_minimal = 1;
145*75f6d617Schristos 	      return 2 * c - 1;
146*75f6d617Schristos 	    }
147*75f6d617Schristos 	}
148*75f6d617Schristos 
149*75f6d617Schristos       /* Similarly extend the bottom-up search.  */
150*75f6d617Schristos       bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
151*75f6d617Schristos       bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
152*75f6d617Schristos       for (d = bmax; d >= bmin; d -= 2)
153*75f6d617Schristos 	{
154*75f6d617Schristos 	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
155*75f6d617Schristos 
156*75f6d617Schristos 	  if (tlo < thi)
157*75f6d617Schristos 	    x = tlo;
158*75f6d617Schristos 	  else
159*75f6d617Schristos 	    x = thi - 1;
160*75f6d617Schristos 	  oldx = x;
161*75f6d617Schristos 	  y = x - d;
162*75f6d617Schristos 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
163*75f6d617Schristos 	    --x, --y;
164*75f6d617Schristos 	  if (oldx - x > SNAKE_LIMIT)
165*75f6d617Schristos 	    big_snake = 1;
166*75f6d617Schristos 	  bd[d] = x;
167*75f6d617Schristos 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
168*75f6d617Schristos 	    {
169*75f6d617Schristos 	      part->xmid = x;
170*75f6d617Schristos 	      part->ymid = y;
171*75f6d617Schristos 	      part->lo_minimal = part->hi_minimal = 1;
172*75f6d617Schristos 	      return 2 * c;
173*75f6d617Schristos 	    }
174*75f6d617Schristos 	}
175*75f6d617Schristos 
176*75f6d617Schristos       if (find_minimal)
177*75f6d617Schristos 	continue;
178*75f6d617Schristos 
179*75f6d617Schristos       /* Heuristic: check occasionally for a diagonal that has made
180*75f6d617Schristos 	 lots of progress compared with the edit distance.
181*75f6d617Schristos 	 If we have any such, find the one that has made the most
182*75f6d617Schristos 	 progress and return it as if it had succeeded.
183*75f6d617Schristos 
184*75f6d617Schristos 	 With this heuristic, for files with a constant small density
185*75f6d617Schristos 	 of changes, the algorithm is linear in the file size.  */
186*75f6d617Schristos 
187*75f6d617Schristos       if (200 < c && big_snake && speed_large_files)
188*75f6d617Schristos 	{
189*75f6d617Schristos 	  lin best;
190*75f6d617Schristos 
191*75f6d617Schristos 	  best = 0;
192*75f6d617Schristos 	  for (d = fmax; d >= fmin; d -= 2)
193*75f6d617Schristos 	    {
194*75f6d617Schristos 	      lin dd = d - fmid;
195*75f6d617Schristos 	      lin x = fd[d];
196*75f6d617Schristos 	      lin y = x - d;
197*75f6d617Schristos 	      lin v = (x - xoff) * 2 - dd;
198*75f6d617Schristos 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
199*75f6d617Schristos 		{
200*75f6d617Schristos 		  if (v > best
201*75f6d617Schristos 		      && xoff + SNAKE_LIMIT <= x && x < xlim
202*75f6d617Schristos 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
203*75f6d617Schristos 		    {
204*75f6d617Schristos 		      /* We have a good enough best diagonal;
205*75f6d617Schristos 			 now insist that it end with a significant snake.  */
206*75f6d617Schristos 		      int k;
207*75f6d617Schristos 
208*75f6d617Schristos 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
209*75f6d617Schristos 			if (k == SNAKE_LIMIT)
210*75f6d617Schristos 			  {
211*75f6d617Schristos 			    best = v;
212*75f6d617Schristos 			    part->xmid = x;
213*75f6d617Schristos 			    part->ymid = y;
214*75f6d617Schristos 			    break;
215*75f6d617Schristos 			  }
216*75f6d617Schristos 		    }
217*75f6d617Schristos 		}
218*75f6d617Schristos 	    }
219*75f6d617Schristos 	  if (best > 0)
220*75f6d617Schristos 	    {
221*75f6d617Schristos 	      part->lo_minimal = 1;
222*75f6d617Schristos 	      part->hi_minimal = 0;
223*75f6d617Schristos 	      return 2 * c - 1;
224*75f6d617Schristos 	    }
225*75f6d617Schristos 
226*75f6d617Schristos 	  best = 0;
227*75f6d617Schristos 	  for (d = bmax; d >= bmin; d -= 2)
228*75f6d617Schristos 	    {
229*75f6d617Schristos 	      lin dd = d - bmid;
230*75f6d617Schristos 	      lin x = bd[d];
231*75f6d617Schristos 	      lin y = x - d;
232*75f6d617Schristos 	      lin v = (xlim - x) * 2 + dd;
233*75f6d617Schristos 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
234*75f6d617Schristos 		{
235*75f6d617Schristos 		  if (v > best
236*75f6d617Schristos 		      && xoff < x && x <= xlim - SNAKE_LIMIT
237*75f6d617Schristos 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
238*75f6d617Schristos 		    {
239*75f6d617Schristos 		      /* We have a good enough best diagonal;
240*75f6d617Schristos 			 now insist that it end with a significant snake.  */
241*75f6d617Schristos 		      int k;
242*75f6d617Schristos 
243*75f6d617Schristos 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
244*75f6d617Schristos 			if (k == SNAKE_LIMIT - 1)
245*75f6d617Schristos 			  {
246*75f6d617Schristos 			    best = v;
247*75f6d617Schristos 			    part->xmid = x;
248*75f6d617Schristos 			    part->ymid = y;
249*75f6d617Schristos 			    break;
250*75f6d617Schristos 			  }
251*75f6d617Schristos 		    }
252*75f6d617Schristos 		}
253*75f6d617Schristos 	    }
254*75f6d617Schristos 	  if (best > 0)
255*75f6d617Schristos 	    {
256*75f6d617Schristos 	      part->lo_minimal = 0;
257*75f6d617Schristos 	      part->hi_minimal = 1;
258*75f6d617Schristos 	      return 2 * c - 1;
259*75f6d617Schristos 	    }
260*75f6d617Schristos 	}
261*75f6d617Schristos 
262*75f6d617Schristos       /* Heuristic: if we've gone well beyond the call of duty,
263*75f6d617Schristos 	 give up and report halfway between our best results so far.  */
264*75f6d617Schristos       if (c >= too_expensive)
265*75f6d617Schristos 	{
266*75f6d617Schristos 	  lin fxybest, fxbest;
267*75f6d617Schristos 	  lin bxybest, bxbest;
268*75f6d617Schristos 
269*75f6d617Schristos 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
270*75f6d617Schristos 
271*75f6d617Schristos 	  /* Find forward diagonal that maximizes X + Y.  */
272*75f6d617Schristos 	  fxybest = -1;
273*75f6d617Schristos 	  for (d = fmax; d >= fmin; d -= 2)
274*75f6d617Schristos 	    {
275*75f6d617Schristos 	      lin x = MIN (fd[d], xlim);
276*75f6d617Schristos 	      lin y = x - d;
277*75f6d617Schristos 	      if (ylim < y)
278*75f6d617Schristos 		x = ylim + d, y = ylim;
279*75f6d617Schristos 	      if (fxybest < x + y)
280*75f6d617Schristos 		{
281*75f6d617Schristos 		  fxybest = x + y;
282*75f6d617Schristos 		  fxbest = x;
283*75f6d617Schristos 		}
284*75f6d617Schristos 	    }
285*75f6d617Schristos 
286*75f6d617Schristos 	  /* Find backward diagonal that minimizes X + Y.  */
287*75f6d617Schristos 	  bxybest = LIN_MAX;
288*75f6d617Schristos 	  for (d = bmax; d >= bmin; d -= 2)
289*75f6d617Schristos 	    {
290*75f6d617Schristos 	      lin x = MAX (xoff, bd[d]);
291*75f6d617Schristos 	      lin y = x - d;
292*75f6d617Schristos 	      if (y < yoff)
293*75f6d617Schristos 		x = yoff + d, y = yoff;
294*75f6d617Schristos 	      if (x + y < bxybest)
295*75f6d617Schristos 		{
296*75f6d617Schristos 		  bxybest = x + y;
297*75f6d617Schristos 		  bxbest = x;
298*75f6d617Schristos 		}
299*75f6d617Schristos 	    }
300*75f6d617Schristos 
301*75f6d617Schristos 	  /* Use the better of the two diagonals.  */
302*75f6d617Schristos 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
303*75f6d617Schristos 	    {
304*75f6d617Schristos 	      part->xmid = fxbest;
305*75f6d617Schristos 	      part->ymid = fxybest - fxbest;
306*75f6d617Schristos 	      part->lo_minimal = 1;
307*75f6d617Schristos 	      part->hi_minimal = 0;
308*75f6d617Schristos 	    }
309*75f6d617Schristos 	  else
310*75f6d617Schristos 	    {
311*75f6d617Schristos 	      part->xmid = bxbest;
312*75f6d617Schristos 	      part->ymid = bxybest - bxbest;
313*75f6d617Schristos 	      part->lo_minimal = 0;
314*75f6d617Schristos 	      part->hi_minimal = 1;
315*75f6d617Schristos 	    }
316*75f6d617Schristos 	  return 2 * c - 1;
317*75f6d617Schristos 	}
318*75f6d617Schristos     }
319*75f6d617Schristos }
320*75f6d617Schristos 
321*75f6d617Schristos /* Compare in detail contiguous subsequences of the two files
322*75f6d617Schristos    which are known, as a whole, to match each other.
323*75f6d617Schristos 
324*75f6d617Schristos    The results are recorded in the vectors files[N].changed, by
325*75f6d617Schristos    storing 1 in the element for each line that is an insertion or deletion.
326*75f6d617Schristos 
327*75f6d617Schristos    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
328*75f6d617Schristos 
329*75f6d617Schristos    Note that XLIM, YLIM are exclusive bounds.
330*75f6d617Schristos    All line numbers are origin-0 and discarded lines are not counted.
331*75f6d617Schristos 
332*75f6d617Schristos    If FIND_MINIMAL, find a minimal difference no matter how
333*75f6d617Schristos    expensive it is.  */
334*75f6d617Schristos 
335*75f6d617Schristos static void
compareseq(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal)336*75f6d617Schristos compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
337*75f6d617Schristos {
338*75f6d617Schristos   lin * const xv = xvec; /* Help the compiler.  */
339*75f6d617Schristos   lin * const yv = yvec;
340*75f6d617Schristos 
341*75f6d617Schristos   /* Slide down the bottom initial diagonal. */
342*75f6d617Schristos   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
343*75f6d617Schristos     ++xoff, ++yoff;
344*75f6d617Schristos   /* Slide up the top initial diagonal. */
345*75f6d617Schristos   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
346*75f6d617Schristos     --xlim, --ylim;
347*75f6d617Schristos 
348*75f6d617Schristos   /* Handle simple cases. */
349*75f6d617Schristos   if (xoff == xlim)
350*75f6d617Schristos     while (yoff < ylim)
351*75f6d617Schristos       files[1].changed[files[1].realindexes[yoff++]] = 1;
352*75f6d617Schristos   else if (yoff == ylim)
353*75f6d617Schristos     while (xoff < xlim)
354*75f6d617Schristos       files[0].changed[files[0].realindexes[xoff++]] = 1;
355*75f6d617Schristos   else
356*75f6d617Schristos     {
357*75f6d617Schristos       lin c;
358*75f6d617Schristos       struct partition part;
359*75f6d617Schristos 
360*75f6d617Schristos       /* Find a point of correspondence in the middle of the files.  */
361*75f6d617Schristos 
362*75f6d617Schristos       c = diag (xoff, xlim, yoff, ylim, find_minimal, &part);
363*75f6d617Schristos 
364*75f6d617Schristos       if (c == 1)
365*75f6d617Schristos 	{
366*75f6d617Schristos 	  /* This should be impossible, because it implies that
367*75f6d617Schristos 	     one of the two subsequences is empty,
368*75f6d617Schristos 	     and that case was handled above without calling `diag'.
369*75f6d617Schristos 	     Let's verify that this is true.  */
370*75f6d617Schristos 	  abort ();
371*75f6d617Schristos #if 0
372*75f6d617Schristos 	  /* The two subsequences differ by a single insert or delete;
373*75f6d617Schristos 	     record it and we are done.  */
374*75f6d617Schristos 	  if (part.xmid - part.ymid < xoff - yoff)
375*75f6d617Schristos 	    files[1].changed[files[1].realindexes[part.ymid - 1]] = 1;
376*75f6d617Schristos 	  else
377*75f6d617Schristos 	    files[0].changed[files[0].realindexes[part.xmid]] = 1;
378*75f6d617Schristos #endif
379*75f6d617Schristos 	}
380*75f6d617Schristos       else
381*75f6d617Schristos 	{
382*75f6d617Schristos 	  /* Use the partitions to split this problem into subproblems.  */
383*75f6d617Schristos 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
384*75f6d617Schristos 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
385*75f6d617Schristos 	}
386*75f6d617Schristos     }
387*75f6d617Schristos }
388*75f6d617Schristos 
389*75f6d617Schristos /* Discard lines from one file that have no matches in the other file.
390*75f6d617Schristos 
391*75f6d617Schristos    A line which is discarded will not be considered by the actual
392*75f6d617Schristos    comparison algorithm; it will be as if that line were not in the file.
393*75f6d617Schristos    The file's `realindexes' table maps virtual line numbers
394*75f6d617Schristos    (which don't count the discarded lines) into real line numbers;
395*75f6d617Schristos    this is how the actual comparison algorithm produces results
396*75f6d617Schristos    that are comprehensible when the discarded lines are counted.
397*75f6d617Schristos 
398*75f6d617Schristos    When we discard a line, we also mark it as a deletion or insertion
399*75f6d617Schristos    so that it will be printed in the output.  */
400*75f6d617Schristos 
401*75f6d617Schristos static void
discard_confusing_lines(struct file_data filevec[])402*75f6d617Schristos discard_confusing_lines (struct file_data filevec[])
403*75f6d617Schristos {
404*75f6d617Schristos   int f;
405*75f6d617Schristos   lin i;
406*75f6d617Schristos   char *discarded[2];
407*75f6d617Schristos   lin *equiv_count[2];
408*75f6d617Schristos   lin *p;
409*75f6d617Schristos 
410*75f6d617Schristos   /* Allocate our results.  */
411*75f6d617Schristos   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
412*75f6d617Schristos 	       * (2 * sizeof *p));
413*75f6d617Schristos   for (f = 0; f < 2; f++)
414*75f6d617Schristos     {
415*75f6d617Schristos       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
416*75f6d617Schristos       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
417*75f6d617Schristos     }
418*75f6d617Schristos 
419*75f6d617Schristos   /* Set up equiv_count[F][I] as the number of lines in file F
420*75f6d617Schristos      that fall in equivalence class I.  */
421*75f6d617Schristos 
422*75f6d617Schristos   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
423*75f6d617Schristos   equiv_count[0] = p;
424*75f6d617Schristos   equiv_count[1] = p + filevec[0].equiv_max;
425*75f6d617Schristos 
426*75f6d617Schristos   for (i = 0; i < filevec[0].buffered_lines; ++i)
427*75f6d617Schristos     ++equiv_count[0][filevec[0].equivs[i]];
428*75f6d617Schristos   for (i = 0; i < filevec[1].buffered_lines; ++i)
429*75f6d617Schristos     ++equiv_count[1][filevec[1].equivs[i]];
430*75f6d617Schristos 
431*75f6d617Schristos   /* Set up tables of which lines are going to be discarded.  */
432*75f6d617Schristos 
433*75f6d617Schristos   discarded[0] = zalloc (filevec[0].buffered_lines
434*75f6d617Schristos 			 + filevec[1].buffered_lines);
435*75f6d617Schristos   discarded[1] = discarded[0] + filevec[0].buffered_lines;
436*75f6d617Schristos 
437*75f6d617Schristos   /* Mark to be discarded each line that matches no line of the other file.
438*75f6d617Schristos      If a line matches many lines, mark it as provisionally discardable.  */
439*75f6d617Schristos 
440*75f6d617Schristos   for (f = 0; f < 2; f++)
441*75f6d617Schristos     {
442*75f6d617Schristos       size_t end = filevec[f].buffered_lines;
443*75f6d617Schristos       char *discards = discarded[f];
444*75f6d617Schristos       lin *counts = equiv_count[1 - f];
445*75f6d617Schristos       lin *equivs = filevec[f].equivs;
446*75f6d617Schristos       size_t many = 5;
447*75f6d617Schristos       size_t tem = end / 64;
448*75f6d617Schristos 
449*75f6d617Schristos       /* Multiply MANY by approximate square root of number of lines.
450*75f6d617Schristos 	 That is the threshold for provisionally discardable lines.  */
451*75f6d617Schristos       while ((tem = tem >> 2) > 0)
452*75f6d617Schristos 	many *= 2;
453*75f6d617Schristos 
454*75f6d617Schristos       for (i = 0; i < end; i++)
455*75f6d617Schristos 	{
456*75f6d617Schristos 	  lin nmatch;
457*75f6d617Schristos 	  if (equivs[i] == 0)
458*75f6d617Schristos 	    continue;
459*75f6d617Schristos 	  nmatch = counts[equivs[i]];
460*75f6d617Schristos 	  if (nmatch == 0)
461*75f6d617Schristos 	    discards[i] = 1;
462*75f6d617Schristos 	  else if (nmatch > many)
463*75f6d617Schristos 	    discards[i] = 2;
464*75f6d617Schristos 	}
465*75f6d617Schristos     }
466*75f6d617Schristos 
467*75f6d617Schristos   /* Don't really discard the provisional lines except when they occur
468*75f6d617Schristos      in a run of discardables, with nonprovisionals at the beginning
469*75f6d617Schristos      and end.  */
470*75f6d617Schristos 
471*75f6d617Schristos   for (f = 0; f < 2; f++)
472*75f6d617Schristos     {
473*75f6d617Schristos       lin end = filevec[f].buffered_lines;
474*75f6d617Schristos       register char *discards = discarded[f];
475*75f6d617Schristos 
476*75f6d617Schristos       for (i = 0; i < end; i++)
477*75f6d617Schristos 	{
478*75f6d617Schristos 	  /* Cancel provisional discards not in middle of run of discards.  */
479*75f6d617Schristos 	  if (discards[i] == 2)
480*75f6d617Schristos 	    discards[i] = 0;
481*75f6d617Schristos 	  else if (discards[i] != 0)
482*75f6d617Schristos 	    {
483*75f6d617Schristos 	      /* We have found a nonprovisional discard.  */
484*75f6d617Schristos 	      register lin j;
485*75f6d617Schristos 	      lin length;
486*75f6d617Schristos 	      lin provisional = 0;
487*75f6d617Schristos 
488*75f6d617Schristos 	      /* Find end of this run of discardable lines.
489*75f6d617Schristos 		 Count how many are provisionally discardable.  */
490*75f6d617Schristos 	      for (j = i; j < end; j++)
491*75f6d617Schristos 		{
492*75f6d617Schristos 		  if (discards[j] == 0)
493*75f6d617Schristos 		    break;
494*75f6d617Schristos 		  if (discards[j] == 2)
495*75f6d617Schristos 		    ++provisional;
496*75f6d617Schristos 		}
497*75f6d617Schristos 
498*75f6d617Schristos 	      /* Cancel provisional discards at end, and shrink the run.  */
499*75f6d617Schristos 	      while (j > i && discards[j - 1] == 2)
500*75f6d617Schristos 		discards[--j] = 0, --provisional;
501*75f6d617Schristos 
502*75f6d617Schristos 	      /* Now we have the length of a run of discardable lines
503*75f6d617Schristos 		 whose first and last are not provisional.  */
504*75f6d617Schristos 	      length = j - i;
505*75f6d617Schristos 
506*75f6d617Schristos 	      /* If 1/4 of the lines in the run are provisional,
507*75f6d617Schristos 		 cancel discarding of all provisional lines in the run.  */
508*75f6d617Schristos 	      if (provisional * 4 > length)
509*75f6d617Schristos 		{
510*75f6d617Schristos 		  while (j > i)
511*75f6d617Schristos 		    if (discards[--j] == 2)
512*75f6d617Schristos 		      discards[j] = 0;
513*75f6d617Schristos 		}
514*75f6d617Schristos 	      else
515*75f6d617Schristos 		{
516*75f6d617Schristos 		  register lin consec;
517*75f6d617Schristos 		  lin minimum = 1;
518*75f6d617Schristos 		  lin tem = length >> 2;
519*75f6d617Schristos 
520*75f6d617Schristos 		  /* MINIMUM is approximate square root of LENGTH/4.
521*75f6d617Schristos 		     A subrun of two or more provisionals can stand
522*75f6d617Schristos 		     when LENGTH is at least 16.
523*75f6d617Schristos 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
524*75f6d617Schristos 		  while (0 < (tem >>= 2))
525*75f6d617Schristos 		    minimum <<= 1;
526*75f6d617Schristos 		  minimum++;
527*75f6d617Schristos 
528*75f6d617Schristos 		  /* Cancel any subrun of MINIMUM or more provisionals
529*75f6d617Schristos 		     within the larger run.  */
530*75f6d617Schristos 		  for (j = 0, consec = 0; j < length; j++)
531*75f6d617Schristos 		    if (discards[i + j] != 2)
532*75f6d617Schristos 		      consec = 0;
533*75f6d617Schristos 		    else if (minimum == ++consec)
534*75f6d617Schristos 		      /* Back up to start of subrun, to cancel it all.  */
535*75f6d617Schristos 		      j -= consec;
536*75f6d617Schristos 		    else if (minimum < consec)
537*75f6d617Schristos 		      discards[i + j] = 0;
538*75f6d617Schristos 
539*75f6d617Schristos 		  /* Scan from beginning of run
540*75f6d617Schristos 		     until we find 3 or more nonprovisionals in a row
541*75f6d617Schristos 		     or until the first nonprovisional at least 8 lines in.
542*75f6d617Schristos 		     Until that point, cancel any provisionals.  */
543*75f6d617Schristos 		  for (j = 0, consec = 0; j < length; j++)
544*75f6d617Schristos 		    {
545*75f6d617Schristos 		      if (j >= 8 && discards[i + j] == 1)
546*75f6d617Schristos 			break;
547*75f6d617Schristos 		      if (discards[i + j] == 2)
548*75f6d617Schristos 			consec = 0, discards[i + j] = 0;
549*75f6d617Schristos 		      else if (discards[i + j] == 0)
550*75f6d617Schristos 			consec = 0;
551*75f6d617Schristos 		      else
552*75f6d617Schristos 			consec++;
553*75f6d617Schristos 		      if (consec == 3)
554*75f6d617Schristos 			break;
555*75f6d617Schristos 		    }
556*75f6d617Schristos 
557*75f6d617Schristos 		  /* I advances to the last line of the run.  */
558*75f6d617Schristos 		  i += length - 1;
559*75f6d617Schristos 
560*75f6d617Schristos 		  /* Same thing, from end.  */
561*75f6d617Schristos 		  for (j = 0, consec = 0; j < length; j++)
562*75f6d617Schristos 		    {
563*75f6d617Schristos 		      if (j >= 8 && discards[i - j] == 1)
564*75f6d617Schristos 			break;
565*75f6d617Schristos 		      if (discards[i - j] == 2)
566*75f6d617Schristos 			consec = 0, discards[i - j] = 0;
567*75f6d617Schristos 		      else if (discards[i - j] == 0)
568*75f6d617Schristos 			consec = 0;
569*75f6d617Schristos 		      else
570*75f6d617Schristos 			consec++;
571*75f6d617Schristos 		      if (consec == 3)
572*75f6d617Schristos 			break;
573*75f6d617Schristos 		    }
574*75f6d617Schristos 		}
575*75f6d617Schristos 	    }
576*75f6d617Schristos 	}
577*75f6d617Schristos     }
578*75f6d617Schristos 
579*75f6d617Schristos   /* Actually discard the lines. */
580*75f6d617Schristos   for (f = 0; f < 2; f++)
581*75f6d617Schristos     {
582*75f6d617Schristos       char *discards = discarded[f];
583*75f6d617Schristos       lin end = filevec[f].buffered_lines;
584*75f6d617Schristos       lin j = 0;
585*75f6d617Schristos       for (i = 0; i < end; ++i)
586*75f6d617Schristos 	if (minimal || discards[i] == 0)
587*75f6d617Schristos 	  {
588*75f6d617Schristos 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
589*75f6d617Schristos 	    filevec[f].realindexes[j++] = i;
590*75f6d617Schristos 	  }
591*75f6d617Schristos 	else
592*75f6d617Schristos 	  filevec[f].changed[i] = 1;
593*75f6d617Schristos       filevec[f].nondiscarded_lines = j;
594*75f6d617Schristos     }
595*75f6d617Schristos 
596*75f6d617Schristos   free (discarded[0]);
597*75f6d617Schristos   free (equiv_count[0]);
598*75f6d617Schristos }
599*75f6d617Schristos 
600*75f6d617Schristos /* Adjust inserts/deletes of identical lines to join changes
601*75f6d617Schristos    as much as possible.
602*75f6d617Schristos 
603*75f6d617Schristos    We do something when a run of changed lines include a
604*75f6d617Schristos    line at one end and have an excluded, identical line at the other.
605*75f6d617Schristos    We are free to choose which identical line is included.
606*75f6d617Schristos    `compareseq' usually chooses the one at the beginning,
607*75f6d617Schristos    but usually it is cleaner to consider the following identical line
608*75f6d617Schristos    to be the "change".  */
609*75f6d617Schristos 
610*75f6d617Schristos static void
shift_boundaries(struct file_data filevec[])611*75f6d617Schristos shift_boundaries (struct file_data filevec[])
612*75f6d617Schristos {
613*75f6d617Schristos   int f;
614*75f6d617Schristos 
615*75f6d617Schristos   for (f = 0; f < 2; f++)
616*75f6d617Schristos     {
617*75f6d617Schristos       bool *changed = filevec[f].changed;
618*75f6d617Schristos       bool const *other_changed = filevec[1 - f].changed;
619*75f6d617Schristos       lin const *equivs = filevec[f].equivs;
620*75f6d617Schristos       lin i = 0;
621*75f6d617Schristos       lin j = 0;
622*75f6d617Schristos       lin i_end = filevec[f].buffered_lines;
623*75f6d617Schristos 
624*75f6d617Schristos       while (1)
625*75f6d617Schristos 	{
626*75f6d617Schristos 	  lin runlength, start, corresponding;
627*75f6d617Schristos 
628*75f6d617Schristos 	  /* Scan forwards to find beginning of another run of changes.
629*75f6d617Schristos 	     Also keep track of the corresponding point in the other file.  */
630*75f6d617Schristos 
631*75f6d617Schristos 	  while (i < i_end && !changed[i])
632*75f6d617Schristos 	    {
633*75f6d617Schristos 	      while (other_changed[j++])
634*75f6d617Schristos 		continue;
635*75f6d617Schristos 	      i++;
636*75f6d617Schristos 	    }
637*75f6d617Schristos 
638*75f6d617Schristos 	  if (i == i_end)
639*75f6d617Schristos 	    break;
640*75f6d617Schristos 
641*75f6d617Schristos 	  start = i;
642*75f6d617Schristos 
643*75f6d617Schristos 	  /* Find the end of this run of changes.  */
644*75f6d617Schristos 
645*75f6d617Schristos 	  while (changed[++i])
646*75f6d617Schristos 	    continue;
647*75f6d617Schristos 	  while (other_changed[j])
648*75f6d617Schristos 	    j++;
649*75f6d617Schristos 
650*75f6d617Schristos 	  do
651*75f6d617Schristos 	    {
652*75f6d617Schristos 	      /* Record the length of this run of changes, so that
653*75f6d617Schristos 		 we can later determine whether the run has grown.  */
654*75f6d617Schristos 	      runlength = i - start;
655*75f6d617Schristos 
656*75f6d617Schristos 	      /* Move the changed region back, so long as the
657*75f6d617Schristos 		 previous unchanged line matches the last changed one.
658*75f6d617Schristos 		 This merges with previous changed regions.  */
659*75f6d617Schristos 
660*75f6d617Schristos 	      while (start && equivs[start - 1] == equivs[i - 1])
661*75f6d617Schristos 		{
662*75f6d617Schristos 		  changed[--start] = 1;
663*75f6d617Schristos 		  changed[--i] = 0;
664*75f6d617Schristos 		  while (changed[start - 1])
665*75f6d617Schristos 		    start--;
666*75f6d617Schristos 		  while (other_changed[--j])
667*75f6d617Schristos 		    continue;
668*75f6d617Schristos 		}
669*75f6d617Schristos 
670*75f6d617Schristos 	      /* Set CORRESPONDING to the end of the changed run, at the last
671*75f6d617Schristos 		 point where it corresponds to a changed run in the other file.
672*75f6d617Schristos 		 CORRESPONDING == I_END means no such point has been found.  */
673*75f6d617Schristos 	      corresponding = other_changed[j - 1] ? i : i_end;
674*75f6d617Schristos 
675*75f6d617Schristos 	      /* Move the changed region forward, so long as the
676*75f6d617Schristos 		 first changed line matches the following unchanged one.
677*75f6d617Schristos 		 This merges with following changed regions.
678*75f6d617Schristos 		 Do this second, so that if there are no merges,
679*75f6d617Schristos 		 the changed region is moved forward as far as possible.  */
680*75f6d617Schristos 
681*75f6d617Schristos 	      while (i != i_end && equivs[start] == equivs[i])
682*75f6d617Schristos 		{
683*75f6d617Schristos 		  changed[start++] = 0;
684*75f6d617Schristos 		  changed[i++] = 1;
685*75f6d617Schristos 		  while (changed[i])
686*75f6d617Schristos 		    i++;
687*75f6d617Schristos 		  while (other_changed[++j])
688*75f6d617Schristos 		    corresponding = i;
689*75f6d617Schristos 		}
690*75f6d617Schristos 	    }
691*75f6d617Schristos 	  while (runlength != i - start);
692*75f6d617Schristos 
693*75f6d617Schristos 	  /* If possible, move the fully-merged run of changes
694*75f6d617Schristos 	     back to a corresponding run in the other file.  */
695*75f6d617Schristos 
696*75f6d617Schristos 	  while (corresponding < i)
697*75f6d617Schristos 	    {
698*75f6d617Schristos 	      changed[--start] = 1;
699*75f6d617Schristos 	      changed[--i] = 0;
700*75f6d617Schristos 	      while (other_changed[--j])
701*75f6d617Schristos 		continue;
702*75f6d617Schristos 	    }
703*75f6d617Schristos 	}
704*75f6d617Schristos     }
705*75f6d617Schristos }
706*75f6d617Schristos 
707*75f6d617Schristos /* Cons an additional entry onto the front of an edit script OLD.
708*75f6d617Schristos    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
709*75f6d617Schristos    DELETED is the number of lines deleted here from file 0.
710*75f6d617Schristos    INSERTED is the number of lines inserted here in file 1.
711*75f6d617Schristos 
712*75f6d617Schristos    If DELETED is 0 then LINE0 is the number of the line before
713*75f6d617Schristos    which the insertion was done; vice versa for INSERTED and LINE1.  */
714*75f6d617Schristos 
715*75f6d617Schristos static struct change *
add_change(lin line0,lin line1,lin deleted,lin inserted,struct change * old)716*75f6d617Schristos add_change (lin line0, lin line1, lin deleted, lin inserted,
717*75f6d617Schristos 	    struct change *old)
718*75f6d617Schristos {
719*75f6d617Schristos   struct change *new = xmalloc (sizeof *new);
720*75f6d617Schristos 
721*75f6d617Schristos   new->line0 = line0;
722*75f6d617Schristos   new->line1 = line1;
723*75f6d617Schristos   new->inserted = inserted;
724*75f6d617Schristos   new->deleted = deleted;
725*75f6d617Schristos   new->link = old;
726*75f6d617Schristos   return new;
727*75f6d617Schristos }
728*75f6d617Schristos 
729*75f6d617Schristos /* Scan the tables of which lines are inserted and deleted,
730*75f6d617Schristos    producing an edit script in reverse order.  */
731*75f6d617Schristos 
732*75f6d617Schristos static struct change *
build_reverse_script(struct file_data const filevec[])733*75f6d617Schristos build_reverse_script (struct file_data const filevec[])
734*75f6d617Schristos {
735*75f6d617Schristos   struct change *script = 0;
736*75f6d617Schristos   bool *changed0 = filevec[0].changed;
737*75f6d617Schristos   bool *changed1 = filevec[1].changed;
738*75f6d617Schristos   lin len0 = filevec[0].buffered_lines;
739*75f6d617Schristos   lin len1 = filevec[1].buffered_lines;
740*75f6d617Schristos 
741*75f6d617Schristos   /* Note that changedN[len0] does exist, and is 0.  */
742*75f6d617Schristos 
743*75f6d617Schristos   lin i0 = 0, i1 = 0;
744*75f6d617Schristos 
745*75f6d617Schristos   while (i0 < len0 || i1 < len1)
746*75f6d617Schristos     {
747*75f6d617Schristos       if (changed0[i0] | changed1[i1])
748*75f6d617Schristos 	{
749*75f6d617Schristos 	  lin line0 = i0, line1 = i1;
750*75f6d617Schristos 
751*75f6d617Schristos 	  /* Find # lines changed here in each file.  */
752*75f6d617Schristos 	  while (changed0[i0]) ++i0;
753*75f6d617Schristos 	  while (changed1[i1]) ++i1;
754*75f6d617Schristos 
755*75f6d617Schristos 	  /* Record this change.  */
756*75f6d617Schristos 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
757*75f6d617Schristos 	}
758*75f6d617Schristos 
759*75f6d617Schristos       /* We have reached lines in the two files that match each other.  */
760*75f6d617Schristos       i0++, i1++;
761*75f6d617Schristos     }
762*75f6d617Schristos 
763*75f6d617Schristos   return script;
764*75f6d617Schristos }
765*75f6d617Schristos 
766*75f6d617Schristos /* Scan the tables of which lines are inserted and deleted,
767*75f6d617Schristos    producing an edit script in forward order.  */
768*75f6d617Schristos 
769*75f6d617Schristos static struct change *
build_script(struct file_data const filevec[])770*75f6d617Schristos build_script (struct file_data const filevec[])
771*75f6d617Schristos {
772*75f6d617Schristos   struct change *script = 0;
773*75f6d617Schristos   bool *changed0 = filevec[0].changed;
774*75f6d617Schristos   bool *changed1 = filevec[1].changed;
775*75f6d617Schristos   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
776*75f6d617Schristos 
777*75f6d617Schristos   /* Note that changedN[-1] does exist, and is 0.  */
778*75f6d617Schristos 
779*75f6d617Schristos   while (i0 >= 0 || i1 >= 0)
780*75f6d617Schristos     {
781*75f6d617Schristos       if (changed0[i0 - 1] | changed1[i1 - 1])
782*75f6d617Schristos 	{
783*75f6d617Schristos 	  lin line0 = i0, line1 = i1;
784*75f6d617Schristos 
785*75f6d617Schristos 	  /* Find # lines changed here in each file.  */
786*75f6d617Schristos 	  while (changed0[i0 - 1]) --i0;
787*75f6d617Schristos 	  while (changed1[i1 - 1]) --i1;
788*75f6d617Schristos 
789*75f6d617Schristos 	  /* Record this change.  */
790*75f6d617Schristos 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
791*75f6d617Schristos 	}
792*75f6d617Schristos 
793*75f6d617Schristos       /* We have reached lines in the two files that match each other.  */
794*75f6d617Schristos       i0--, i1--;
795*75f6d617Schristos     }
796*75f6d617Schristos 
797*75f6d617Schristos   return script;
798*75f6d617Schristos }
799*75f6d617Schristos 
800*75f6d617Schristos /* If CHANGES, briefly report that two files differed.
801*75f6d617Schristos    Return 2 if trouble, CHANGES otherwise.  */
802*75f6d617Schristos static int
briefly_report(int changes,struct file_data const filevec[])803*75f6d617Schristos briefly_report (int changes, struct file_data const filevec[])
804*75f6d617Schristos {
805*75f6d617Schristos   if (changes)
806*75f6d617Schristos     {
807*75f6d617Schristos       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
808*75f6d617Schristos       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
809*75f6d617Schristos 
810*75f6d617Schristos       if (brief)
811*75f6d617Schristos 	message ("Files %s and %s differ\n", label0, label1);
812*75f6d617Schristos       else
813*75f6d617Schristos 	{
814*75f6d617Schristos 	  message ("Binary files %s and %s differ\n", label0, label1);
815*75f6d617Schristos 	  changes = 2;
816*75f6d617Schristos 	}
817*75f6d617Schristos     }
818*75f6d617Schristos 
819*75f6d617Schristos   return changes;
820*75f6d617Schristos }
821*75f6d617Schristos 
822*75f6d617Schristos /* Report the differences of two files.  */
823*75f6d617Schristos int
diff_2_files(struct comparison * cmp)824*75f6d617Schristos diff_2_files (struct comparison *cmp)
825*75f6d617Schristos {
826*75f6d617Schristos   lin diags;
827*75f6d617Schristos   int f;
828*75f6d617Schristos   struct change *e, *p;
829*75f6d617Schristos   struct change *script;
830*75f6d617Schristos   int changes;
831*75f6d617Schristos 
832*75f6d617Schristos 
833*75f6d617Schristos   /* If we have detected that either file is binary,
834*75f6d617Schristos      compare the two files as binary.  This can happen
835*75f6d617Schristos      only when the first chunk is read.
836*75f6d617Schristos      Also, --brief without any --ignore-* options means
837*75f6d617Schristos      we can speed things up by treating the files as binary.  */
838*75f6d617Schristos 
839*75f6d617Schristos   if (read_files (cmp->file, files_can_be_treated_as_binary))
840*75f6d617Schristos     {
841*75f6d617Schristos       /* Files with different lengths must be different.  */
842*75f6d617Schristos       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
843*75f6d617Schristos 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
844*75f6d617Schristos 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
845*75f6d617Schristos 	changes = 1;
846*75f6d617Schristos 
847*75f6d617Schristos       /* Standard input equals itself.  */
848*75f6d617Schristos       else if (cmp->file[0].desc == cmp->file[1].desc)
849*75f6d617Schristos 	changes = 0;
850*75f6d617Schristos 
851*75f6d617Schristos       else
852*75f6d617Schristos 	/* Scan both files, a buffer at a time, looking for a difference.  */
853*75f6d617Schristos 	{
854*75f6d617Schristos 	  /* Allocate same-sized buffers for both files.  */
855*75f6d617Schristos 	  size_t lcm_max = PTRDIFF_MAX - 1;
856*75f6d617Schristos 	  size_t buffer_size =
857*75f6d617Schristos 	    buffer_lcm (sizeof (word),
858*75f6d617Schristos 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
859*75f6d617Schristos 				    STAT_BLOCKSIZE (cmp->file[1].stat),
860*75f6d617Schristos 				    lcm_max),
861*75f6d617Schristos 			lcm_max);
862*75f6d617Schristos 	  for (f = 0; f < 2; f++)
863*75f6d617Schristos 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
864*75f6d617Schristos 
865*75f6d617Schristos 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
866*75f6d617Schristos 	    {
867*75f6d617Schristos 	      /* Read a buffer's worth from both files.  */
868*75f6d617Schristos 	      for (f = 0; f < 2; f++)
869*75f6d617Schristos 		if (0 <= cmp->file[f].desc)
870*75f6d617Schristos 		  file_block_read (&cmp->file[f],
871*75f6d617Schristos 				   buffer_size - cmp->file[f].buffered);
872*75f6d617Schristos 
873*75f6d617Schristos 	      /* If the buffers differ, the files differ.  */
874*75f6d617Schristos 	      if (cmp->file[0].buffered != cmp->file[1].buffered
875*75f6d617Schristos 		  || memcmp (cmp->file[0].buffer,
876*75f6d617Schristos 			     cmp->file[1].buffer,
877*75f6d617Schristos 			     cmp->file[0].buffered))
878*75f6d617Schristos 		{
879*75f6d617Schristos 		  changes = 1;
880*75f6d617Schristos 		  break;
881*75f6d617Schristos 		}
882*75f6d617Schristos 
883*75f6d617Schristos 	      /* If we reach end of file, the files are the same.  */
884*75f6d617Schristos 	      if (cmp->file[0].buffered != buffer_size)
885*75f6d617Schristos 		{
886*75f6d617Schristos 		  changes = 0;
887*75f6d617Schristos 		  break;
888*75f6d617Schristos 		}
889*75f6d617Schristos 	    }
890*75f6d617Schristos 	}
891*75f6d617Schristos 
892*75f6d617Schristos       changes = briefly_report (changes, cmp->file);
893*75f6d617Schristos     }
894*75f6d617Schristos   else
895*75f6d617Schristos     {
896*75f6d617Schristos       /* Allocate vectors for the results of comparison:
897*75f6d617Schristos 	 a flag for each line of each file, saying whether that line
898*75f6d617Schristos 	 is an insertion or deletion.
899*75f6d617Schristos 	 Allocate an extra element, always 0, at each end of each vector.  */
900*75f6d617Schristos 
901*75f6d617Schristos       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
902*75f6d617Schristos       bool *flag_space = zalloc (s * sizeof *flag_space);
903*75f6d617Schristos       cmp->file[0].changed = flag_space + 1;
904*75f6d617Schristos       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
905*75f6d617Schristos 
906*75f6d617Schristos       /* Some lines are obviously insertions or deletions
907*75f6d617Schristos 	 because they don't match anything.  Detect them now, and
908*75f6d617Schristos 	 avoid even thinking about them in the main comparison algorithm.  */
909*75f6d617Schristos 
910*75f6d617Schristos       discard_confusing_lines (cmp->file);
911*75f6d617Schristos 
912*75f6d617Schristos       /* Now do the main comparison algorithm, considering just the
913*75f6d617Schristos 	 undiscarded lines.  */
914*75f6d617Schristos 
915*75f6d617Schristos       xvec = cmp->file[0].undiscarded;
916*75f6d617Schristos       yvec = cmp->file[1].undiscarded;
917*75f6d617Schristos       diags = (cmp->file[0].nondiscarded_lines
918*75f6d617Schristos 	       + cmp->file[1].nondiscarded_lines + 3);
919*75f6d617Schristos       fdiag = xmalloc (diags * (2 * sizeof *fdiag));
920*75f6d617Schristos       bdiag = fdiag + diags;
921*75f6d617Schristos       fdiag += cmp->file[1].nondiscarded_lines + 1;
922*75f6d617Schristos       bdiag += cmp->file[1].nondiscarded_lines + 1;
923*75f6d617Schristos 
924*75f6d617Schristos       /* Set TOO_EXPENSIVE to be approximate square root of input size,
925*75f6d617Schristos 	 bounded below by 256.  */
926*75f6d617Schristos       too_expensive = 1;
927*75f6d617Schristos       for (;  diags != 0;  diags >>= 2)
928*75f6d617Schristos 	too_expensive <<= 1;
929*75f6d617Schristos       too_expensive = MAX (256, too_expensive);
930*75f6d617Schristos 
931*75f6d617Schristos       files[0] = cmp->file[0];
932*75f6d617Schristos       files[1] = cmp->file[1];
933*75f6d617Schristos 
934*75f6d617Schristos       compareseq (0, cmp->file[0].nondiscarded_lines,
935*75f6d617Schristos 		  0, cmp->file[1].nondiscarded_lines, minimal);
936*75f6d617Schristos 
937*75f6d617Schristos       free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
938*75f6d617Schristos 
939*75f6d617Schristos       /* Modify the results slightly to make them prettier
940*75f6d617Schristos 	 in cases where that can validly be done.  */
941*75f6d617Schristos 
942*75f6d617Schristos       shift_boundaries (cmp->file);
943*75f6d617Schristos 
944*75f6d617Schristos       /* Get the results of comparison in the form of a chain
945*75f6d617Schristos 	 of `struct change's -- an edit script.  */
946*75f6d617Schristos 
947*75f6d617Schristos       if (output_style == OUTPUT_ED)
948*75f6d617Schristos 	script = build_reverse_script (cmp->file);
949*75f6d617Schristos       else
950*75f6d617Schristos 	script = build_script (cmp->file);
951*75f6d617Schristos 
952*75f6d617Schristos       /* Set CHANGES if we had any diffs.
953*75f6d617Schristos 	 If some changes are ignored, we must scan the script to decide.  */
954*75f6d617Schristos       if (ignore_blank_lines || ignore_regexp.fastmap)
955*75f6d617Schristos 	{
956*75f6d617Schristos 	  struct change *next = script;
957*75f6d617Schristos 	  changes = 0;
958*75f6d617Schristos 
959*75f6d617Schristos 	  while (next && changes == 0)
960*75f6d617Schristos 	    {
961*75f6d617Schristos 	      struct change *this, *end;
962*75f6d617Schristos 	      lin first0, last0, first1, last1;
963*75f6d617Schristos 
964*75f6d617Schristos 	      /* Find a set of changes that belong together.  */
965*75f6d617Schristos 	      this = next;
966*75f6d617Schristos 	      end = find_change (next);
967*75f6d617Schristos 
968*75f6d617Schristos 	      /* Disconnect them from the rest of the changes, making them
969*75f6d617Schristos 		 a hunk, and remember the rest for next iteration.  */
970*75f6d617Schristos 	      next = end->link;
971*75f6d617Schristos 	      end->link = 0;
972*75f6d617Schristos 
973*75f6d617Schristos 	      /* Determine whether this hunk is really a difference.  */
974*75f6d617Schristos 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
975*75f6d617Schristos 		changes = 1;
976*75f6d617Schristos 
977*75f6d617Schristos 	      /* Reconnect the script so it will all be freed properly.  */
978*75f6d617Schristos 	      end->link = next;
979*75f6d617Schristos 	    }
980*75f6d617Schristos 	}
981*75f6d617Schristos       else
982*75f6d617Schristos 	changes = (script != 0);
983*75f6d617Schristos 
984*75f6d617Schristos       if (brief)
985*75f6d617Schristos 	changes = briefly_report (changes, cmp->file);
986*75f6d617Schristos       else
987*75f6d617Schristos 	{
988*75f6d617Schristos 	  if (changes | !no_diff_means_no_output)
989*75f6d617Schristos 	    {
990*75f6d617Schristos 	      /* Record info for starting up output,
991*75f6d617Schristos 		 to be used if and when we have some output to print.  */
992*75f6d617Schristos 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
993*75f6d617Schristos 			    file_label[1] ? file_label[1] : cmp->file[1].name,
994*75f6d617Schristos 			    cmp->parent != 0);
995*75f6d617Schristos 
996*75f6d617Schristos 	      switch (output_style)
997*75f6d617Schristos 		{
998*75f6d617Schristos 		case OUTPUT_CONTEXT:
999*75f6d617Schristos 		  print_context_script (script, 0);
1000*75f6d617Schristos 		  break;
1001*75f6d617Schristos 
1002*75f6d617Schristos 		case OUTPUT_UNIFIED:
1003*75f6d617Schristos 		  print_context_script (script, 1);
1004*75f6d617Schristos 		  break;
1005*75f6d617Schristos 
1006*75f6d617Schristos 		case OUTPUT_ED:
1007*75f6d617Schristos 		  print_ed_script (script);
1008*75f6d617Schristos 		  break;
1009*75f6d617Schristos 
1010*75f6d617Schristos 		case OUTPUT_FORWARD_ED:
1011*75f6d617Schristos 		  pr_forward_ed_script (script);
1012*75f6d617Schristos 		  break;
1013*75f6d617Schristos 
1014*75f6d617Schristos 		case OUTPUT_RCS:
1015*75f6d617Schristos 		  print_rcs_script (script);
1016*75f6d617Schristos 		  break;
1017*75f6d617Schristos 
1018*75f6d617Schristos 		case OUTPUT_NORMAL:
1019*75f6d617Schristos 		  print_normal_script (script);
1020*75f6d617Schristos 		  break;
1021*75f6d617Schristos 
1022*75f6d617Schristos 		case OUTPUT_IFDEF:
1023*75f6d617Schristos 		  print_ifdef_script (script);
1024*75f6d617Schristos 		  break;
1025*75f6d617Schristos 
1026*75f6d617Schristos 		case OUTPUT_SDIFF:
1027*75f6d617Schristos 		  print_sdiff_script (script);
1028*75f6d617Schristos 		  break;
1029*75f6d617Schristos 
1030*75f6d617Schristos 		default:
1031*75f6d617Schristos 		  abort ();
1032*75f6d617Schristos 		}
1033*75f6d617Schristos 
1034*75f6d617Schristos 	      finish_output ();
1035*75f6d617Schristos 	    }
1036*75f6d617Schristos 	}
1037*75f6d617Schristos 
1038*75f6d617Schristos       free (cmp->file[0].undiscarded);
1039*75f6d617Schristos 
1040*75f6d617Schristos       free (flag_space);
1041*75f6d617Schristos 
1042*75f6d617Schristos       for (f = 0; f < 2; f++)
1043*75f6d617Schristos 	{
1044*75f6d617Schristos 	  free (cmp->file[f].equivs);
1045*75f6d617Schristos 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1046*75f6d617Schristos 	}
1047*75f6d617Schristos 
1048*75f6d617Schristos       for (e = script; e; e = p)
1049*75f6d617Schristos 	{
1050*75f6d617Schristos 	  p = e->link;
1051*75f6d617Schristos 	  free (e);
1052*75f6d617Schristos 	}
1053*75f6d617Schristos 
1054*75f6d617Schristos       if (! ROBUST_OUTPUT_STYLE (output_style))
1055*75f6d617Schristos 	for (f = 0; f < 2; ++f)
1056*75f6d617Schristos 	  if (cmp->file[f].missing_newline)
1057*75f6d617Schristos 	    {
1058*75f6d617Schristos 	      error (0, 0, "%s: %s\n",
1059*75f6d617Schristos 		     file_label[f] ? file_label[f] : cmp->file[f].name,
1060*75f6d617Schristos 		     _("No newline at end of file"));
1061*75f6d617Schristos 	      changes = 2;
1062*75f6d617Schristos 	    }
1063*75f6d617Schristos     }
1064*75f6d617Schristos 
1065*75f6d617Schristos   if (cmp->file[0].buffer != cmp->file[1].buffer)
1066*75f6d617Schristos     free (cmp->file[0].buffer);
1067*75f6d617Schristos   free (cmp->file[1].buffer);
1068*75f6d617Schristos 
1069*75f6d617Schristos   return changes;
1070*75f6d617Schristos }
1071