xref: /freebsd-src/contrib/diff/src/analyze.c (revision 18fd37a72c3a7549d2d4f6c6ea00bdcd2bdaca01)
1*18fd37a7SXin LI /* Analyze file differences for GNU DIFF.
2*18fd37a7SXin LI 
3*18fd37a7SXin LI    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4*18fd37a7SXin LI    2004 Free Software Foundation, Inc.
5*18fd37a7SXin LI 
6*18fd37a7SXin LI    This file is part of GNU DIFF.
7*18fd37a7SXin LI 
8*18fd37a7SXin LI    GNU DIFF is free software; you can redistribute it and/or modify
9*18fd37a7SXin LI    it under the terms of the GNU General Public License as published by
10*18fd37a7SXin LI    the Free Software Foundation; either version 2, or (at your option)
11*18fd37a7SXin LI    any later version.
12*18fd37a7SXin LI 
13*18fd37a7SXin LI    GNU DIFF is distributed in the hope that it will be useful,
14*18fd37a7SXin LI    but WITHOUT ANY WARRANTY; without even the implied warranty of
15*18fd37a7SXin LI    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16*18fd37a7SXin LI    GNU General Public License for more details.
17*18fd37a7SXin LI 
18*18fd37a7SXin LI    You should have received a copy of the GNU General Public License
19*18fd37a7SXin LI    along with this program; see the file COPYING.
20*18fd37a7SXin LI    If not, write to the Free Software Foundation,
21*18fd37a7SXin LI    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22*18fd37a7SXin LI 
23*18fd37a7SXin LI /* The basic algorithm is described in:
24*18fd37a7SXin LI    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25*18fd37a7SXin LI    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26*18fd37a7SXin LI    see especially section 4.2, which describes the variation used below.
27*18fd37a7SXin LI    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28*18fd37a7SXin LI    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29*18fd37a7SXin LI    at the price of producing suboptimal output for large inputs with
30*18fd37a7SXin LI    many differences.
31*18fd37a7SXin LI 
32*18fd37a7SXin LI    The basic algorithm was independently discovered as described in:
33*18fd37a7SXin LI    "Algorithms for Approximate String Matching", E. Ukkonen,
34*18fd37a7SXin LI    Information and Control Vol. 64, 1985, pp. 100-118.  */
35*18fd37a7SXin LI 
36*18fd37a7SXin LI #include "diff.h"
37*18fd37a7SXin LI #include <cmpbuf.h>
38*18fd37a7SXin LI #include <error.h>
39*18fd37a7SXin LI #include <file-type.h>
40*18fd37a7SXin LI #include <xalloc.h>
41*18fd37a7SXin LI 
42*18fd37a7SXin LI static lin *xvec, *yvec;	/* Vectors being compared. */
43*18fd37a7SXin LI static lin *fdiag;		/* Vector, indexed by diagonal, containing
44*18fd37a7SXin LI 				   1 + the X coordinate of the point furthest
45*18fd37a7SXin LI 				   along the given diagonal in the forward
46*18fd37a7SXin LI 				   search of the edit matrix. */
47*18fd37a7SXin LI static lin *bdiag;		/* Vector, indexed by diagonal, containing
48*18fd37a7SXin LI 				   the X coordinate of the point furthest
49*18fd37a7SXin LI 				   along the given diagonal in the backward
50*18fd37a7SXin LI 				   search of the edit matrix. */
51*18fd37a7SXin LI static lin too_expensive;	/* Edit scripts longer than this are too
52*18fd37a7SXin LI 				   expensive to compute.  */
53*18fd37a7SXin LI 
54*18fd37a7SXin LI #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
55*18fd37a7SXin LI 
56*18fd37a7SXin LI struct partition
57*18fd37a7SXin LI {
58*18fd37a7SXin LI   lin xmid, ymid;	/* Midpoints of this partition.  */
59*18fd37a7SXin LI   bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
60*18fd37a7SXin LI   bool hi_minimal;	/* Likewise for high half.  */
61*18fd37a7SXin LI };
62*18fd37a7SXin LI 
63*18fd37a7SXin LI /* Find the midpoint of the shortest edit script for a specified
64*18fd37a7SXin LI    portion of the two files.
65*18fd37a7SXin LI 
66*18fd37a7SXin LI    Scan from the beginnings of the files, and simultaneously from the ends,
67*18fd37a7SXin LI    doing a breadth-first search through the space of edit-sequence.
68*18fd37a7SXin LI    When the two searches meet, we have found the midpoint of the shortest
69*18fd37a7SXin LI    edit sequence.
70*18fd37a7SXin LI 
71*18fd37a7SXin LI    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72*18fd37a7SXin LI    of expense.  Otherwise, if the search is too expensive, use
73*18fd37a7SXin LI    heuristics to stop the search and report a suboptimal answer.
74*18fd37a7SXin LI 
75*18fd37a7SXin LI    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
76*18fd37a7SXin LI    XMID - YMID equals the number of inserted lines minus the number
77*18fd37a7SXin LI    of deleted lines (counting only lines before the midpoint).
78*18fd37a7SXin LI 
79*18fd37a7SXin LI    Set PART->lo_minimal to true iff the minimal edit script for the
80*18fd37a7SXin LI    left half of the partition is known; similarly for PART->hi_minimal.
81*18fd37a7SXin LI 
82*18fd37a7SXin LI    This function assumes that the first lines of the specified portions
83*18fd37a7SXin LI    of the two files do not match, and likewise that the last lines do not
84*18fd37a7SXin LI    match.  The caller must trim matching lines from the beginning and end
85*18fd37a7SXin LI    of the portions it is going to specify.
86*18fd37a7SXin LI 
87*18fd37a7SXin LI    If we return the "wrong" partitions,
88*18fd37a7SXin LI    the worst this can do is cause suboptimal diff output.
89*18fd37a7SXin LI    It cannot cause incorrect diff output.  */
90*18fd37a7SXin LI 
91*18fd37a7SXin LI static void
diag(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal,struct partition * part)92*18fd37a7SXin LI diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93*18fd37a7SXin LI       struct partition *part)
94*18fd37a7SXin LI {
95*18fd37a7SXin LI   lin *const fd = fdiag;	/* Give the compiler a chance. */
96*18fd37a7SXin LI   lin *const bd = bdiag;	/* Additional help for the compiler. */
97*18fd37a7SXin LI   lin const *const xv = xvec;	/* Still more help for the compiler. */
98*18fd37a7SXin LI   lin const *const yv = yvec;	/* And more and more . . . */
99*18fd37a7SXin LI   lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
100*18fd37a7SXin LI   lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
101*18fd37a7SXin LI   lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
102*18fd37a7SXin LI   lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
103*18fd37a7SXin LI   lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
104*18fd37a7SXin LI   lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
105*18fd37a7SXin LI   lin c;			/* Cost. */
106*18fd37a7SXin LI   bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
107*18fd37a7SXin LI 				   diagonal with respect to the northwest. */
108*18fd37a7SXin LI 
109*18fd37a7SXin LI   fd[fmid] = xoff;
110*18fd37a7SXin LI   bd[bmid] = xlim;
111*18fd37a7SXin LI 
112*18fd37a7SXin LI   for (c = 1;; ++c)
113*18fd37a7SXin LI     {
114*18fd37a7SXin LI       lin d;			/* Active diagonal. */
115*18fd37a7SXin LI       bool big_snake = false;
116*18fd37a7SXin LI 
117*18fd37a7SXin LI       /* Extend the top-down search by an edit step in each diagonal. */
118*18fd37a7SXin LI       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119*18fd37a7SXin LI       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120*18fd37a7SXin LI       for (d = fmax; d >= fmin; d -= 2)
121*18fd37a7SXin LI 	{
122*18fd37a7SXin LI 	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123*18fd37a7SXin LI 
124*18fd37a7SXin LI 	  if (tlo >= thi)
125*18fd37a7SXin LI 	    x = tlo + 1;
126*18fd37a7SXin LI 	  else
127*18fd37a7SXin LI 	    x = thi;
128*18fd37a7SXin LI 	  oldx = x;
129*18fd37a7SXin LI 	  y = x - d;
130*18fd37a7SXin LI 	  while (x < xlim && y < ylim && xv[x] == yv[y])
131*18fd37a7SXin LI 	    ++x, ++y;
132*18fd37a7SXin LI 	  if (x - oldx > SNAKE_LIMIT)
133*18fd37a7SXin LI 	    big_snake = true;
134*18fd37a7SXin LI 	  fd[d] = x;
135*18fd37a7SXin LI 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136*18fd37a7SXin LI 	    {
137*18fd37a7SXin LI 	      part->xmid = x;
138*18fd37a7SXin LI 	      part->ymid = y;
139*18fd37a7SXin LI 	      part->lo_minimal = part->hi_minimal = true;
140*18fd37a7SXin LI 	      return;
141*18fd37a7SXin LI 	    }
142*18fd37a7SXin LI 	}
143*18fd37a7SXin LI 
144*18fd37a7SXin LI       /* Similarly extend the bottom-up search.  */
145*18fd37a7SXin LI       bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146*18fd37a7SXin LI       bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147*18fd37a7SXin LI       for (d = bmax; d >= bmin; d -= 2)
148*18fd37a7SXin LI 	{
149*18fd37a7SXin LI 	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150*18fd37a7SXin LI 
151*18fd37a7SXin LI 	  if (tlo < thi)
152*18fd37a7SXin LI 	    x = tlo;
153*18fd37a7SXin LI 	  else
154*18fd37a7SXin LI 	    x = thi - 1;
155*18fd37a7SXin LI 	  oldx = x;
156*18fd37a7SXin LI 	  y = x - d;
157*18fd37a7SXin LI 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158*18fd37a7SXin LI 	    --x, --y;
159*18fd37a7SXin LI 	  if (oldx - x > SNAKE_LIMIT)
160*18fd37a7SXin LI 	    big_snake = true;
161*18fd37a7SXin LI 	  bd[d] = x;
162*18fd37a7SXin LI 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163*18fd37a7SXin LI 	    {
164*18fd37a7SXin LI 	      part->xmid = x;
165*18fd37a7SXin LI 	      part->ymid = y;
166*18fd37a7SXin LI 	      part->lo_minimal = part->hi_minimal = true;
167*18fd37a7SXin LI 	      return;
168*18fd37a7SXin LI 	    }
169*18fd37a7SXin LI 	}
170*18fd37a7SXin LI 
171*18fd37a7SXin LI       if (find_minimal)
172*18fd37a7SXin LI 	continue;
173*18fd37a7SXin LI 
174*18fd37a7SXin LI       /* Heuristic: check occasionally for a diagonal that has made
175*18fd37a7SXin LI 	 lots of progress compared with the edit distance.
176*18fd37a7SXin LI 	 If we have any such, find the one that has made the most
177*18fd37a7SXin LI 	 progress and return it as if it had succeeded.
178*18fd37a7SXin LI 
179*18fd37a7SXin LI 	 With this heuristic, for files with a constant small density
180*18fd37a7SXin LI 	 of changes, the algorithm is linear in the file size.  */
181*18fd37a7SXin LI 
182*18fd37a7SXin LI       if (200 < c && big_snake && speed_large_files)
183*18fd37a7SXin LI 	{
184*18fd37a7SXin LI 	  lin best = 0;
185*18fd37a7SXin LI 
186*18fd37a7SXin LI 	  for (d = fmax; d >= fmin; d -= 2)
187*18fd37a7SXin LI 	    {
188*18fd37a7SXin LI 	      lin dd = d - fmid;
189*18fd37a7SXin LI 	      lin x = fd[d];
190*18fd37a7SXin LI 	      lin y = x - d;
191*18fd37a7SXin LI 	      lin v = (x - xoff) * 2 - dd;
192*18fd37a7SXin LI 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193*18fd37a7SXin LI 		{
194*18fd37a7SXin LI 		  if (v > best
195*18fd37a7SXin LI 		      && xoff + SNAKE_LIMIT <= x && x < xlim
196*18fd37a7SXin LI 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
197*18fd37a7SXin LI 		    {
198*18fd37a7SXin LI 		      /* We have a good enough best diagonal;
199*18fd37a7SXin LI 			 now insist that it end with a significant snake.  */
200*18fd37a7SXin LI 		      int k;
201*18fd37a7SXin LI 
202*18fd37a7SXin LI 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
203*18fd37a7SXin LI 			if (k == SNAKE_LIMIT)
204*18fd37a7SXin LI 			  {
205*18fd37a7SXin LI 			    best = v;
206*18fd37a7SXin LI 			    part->xmid = x;
207*18fd37a7SXin LI 			    part->ymid = y;
208*18fd37a7SXin LI 			    break;
209*18fd37a7SXin LI 			  }
210*18fd37a7SXin LI 		    }
211*18fd37a7SXin LI 		}
212*18fd37a7SXin LI 	    }
213*18fd37a7SXin LI 	  if (best > 0)
214*18fd37a7SXin LI 	    {
215*18fd37a7SXin LI 	      part->lo_minimal = true;
216*18fd37a7SXin LI 	      part->hi_minimal = false;
217*18fd37a7SXin LI 	      return;
218*18fd37a7SXin LI 	    }
219*18fd37a7SXin LI 
220*18fd37a7SXin LI 	  best = 0;
221*18fd37a7SXin LI 	  for (d = bmax; d >= bmin; d -= 2)
222*18fd37a7SXin LI 	    {
223*18fd37a7SXin LI 	      lin dd = d - bmid;
224*18fd37a7SXin LI 	      lin x = bd[d];
225*18fd37a7SXin LI 	      lin y = x - d;
226*18fd37a7SXin LI 	      lin v = (xlim - x) * 2 + dd;
227*18fd37a7SXin LI 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228*18fd37a7SXin LI 		{
229*18fd37a7SXin LI 		  if (v > best
230*18fd37a7SXin LI 		      && xoff < x && x <= xlim - SNAKE_LIMIT
231*18fd37a7SXin LI 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
232*18fd37a7SXin LI 		    {
233*18fd37a7SXin LI 		      /* We have a good enough best diagonal;
234*18fd37a7SXin LI 			 now insist that it end with a significant snake.  */
235*18fd37a7SXin LI 		      int k;
236*18fd37a7SXin LI 
237*18fd37a7SXin LI 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
238*18fd37a7SXin LI 			if (k == SNAKE_LIMIT - 1)
239*18fd37a7SXin LI 			  {
240*18fd37a7SXin LI 			    best = v;
241*18fd37a7SXin LI 			    part->xmid = x;
242*18fd37a7SXin LI 			    part->ymid = y;
243*18fd37a7SXin LI 			    break;
244*18fd37a7SXin LI 			  }
245*18fd37a7SXin LI 		    }
246*18fd37a7SXin LI 		}
247*18fd37a7SXin LI 	    }
248*18fd37a7SXin LI 	  if (best > 0)
249*18fd37a7SXin LI 	    {
250*18fd37a7SXin LI 	      part->lo_minimal = false;
251*18fd37a7SXin LI 	      part->hi_minimal = true;
252*18fd37a7SXin LI 	      return;
253*18fd37a7SXin LI 	    }
254*18fd37a7SXin LI 	}
255*18fd37a7SXin LI 
256*18fd37a7SXin LI       /* Heuristic: if we've gone well beyond the call of duty,
257*18fd37a7SXin LI 	 give up and report halfway between our best results so far.  */
258*18fd37a7SXin LI       if (c >= too_expensive)
259*18fd37a7SXin LI 	{
260*18fd37a7SXin LI 	  lin fxybest, fxbest;
261*18fd37a7SXin LI 	  lin bxybest, bxbest;
262*18fd37a7SXin LI 
263*18fd37a7SXin LI 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
264*18fd37a7SXin LI 
265*18fd37a7SXin LI 	  /* Find forward diagonal that maximizes X + Y.  */
266*18fd37a7SXin LI 	  fxybest = -1;
267*18fd37a7SXin LI 	  for (d = fmax; d >= fmin; d -= 2)
268*18fd37a7SXin LI 	    {
269*18fd37a7SXin LI 	      lin x = MIN (fd[d], xlim);
270*18fd37a7SXin LI 	      lin y = x - d;
271*18fd37a7SXin LI 	      if (ylim < y)
272*18fd37a7SXin LI 		x = ylim + d, y = ylim;
273*18fd37a7SXin LI 	      if (fxybest < x + y)
274*18fd37a7SXin LI 		{
275*18fd37a7SXin LI 		  fxybest = x + y;
276*18fd37a7SXin LI 		  fxbest = x;
277*18fd37a7SXin LI 		}
278*18fd37a7SXin LI 	    }
279*18fd37a7SXin LI 
280*18fd37a7SXin LI 	  /* Find backward diagonal that minimizes X + Y.  */
281*18fd37a7SXin LI 	  bxybest = LIN_MAX;
282*18fd37a7SXin LI 	  for (d = bmax; d >= bmin; d -= 2)
283*18fd37a7SXin LI 	    {
284*18fd37a7SXin LI 	      lin x = MAX (xoff, bd[d]);
285*18fd37a7SXin LI 	      lin y = x - d;
286*18fd37a7SXin LI 	      if (y < yoff)
287*18fd37a7SXin LI 		x = yoff + d, y = yoff;
288*18fd37a7SXin LI 	      if (x + y < bxybest)
289*18fd37a7SXin LI 		{
290*18fd37a7SXin LI 		  bxybest = x + y;
291*18fd37a7SXin LI 		  bxbest = x;
292*18fd37a7SXin LI 		}
293*18fd37a7SXin LI 	    }
294*18fd37a7SXin LI 
295*18fd37a7SXin LI 	  /* Use the better of the two diagonals.  */
296*18fd37a7SXin LI 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297*18fd37a7SXin LI 	    {
298*18fd37a7SXin LI 	      part->xmid = fxbest;
299*18fd37a7SXin LI 	      part->ymid = fxybest - fxbest;
300*18fd37a7SXin LI 	      part->lo_minimal = true;
301*18fd37a7SXin LI 	      part->hi_minimal = false;
302*18fd37a7SXin LI 	    }
303*18fd37a7SXin LI 	  else
304*18fd37a7SXin LI 	    {
305*18fd37a7SXin LI 	      part->xmid = bxbest;
306*18fd37a7SXin LI 	      part->ymid = bxybest - bxbest;
307*18fd37a7SXin LI 	      part->lo_minimal = false;
308*18fd37a7SXin LI 	      part->hi_minimal = true;
309*18fd37a7SXin LI 	    }
310*18fd37a7SXin LI 	  return;
311*18fd37a7SXin LI 	}
312*18fd37a7SXin LI     }
313*18fd37a7SXin LI }
314*18fd37a7SXin LI 
315*18fd37a7SXin LI /* Compare in detail contiguous subsequences of the two files
316*18fd37a7SXin LI    which are known, as a whole, to match each other.
317*18fd37a7SXin LI 
318*18fd37a7SXin LI    The results are recorded in the vectors files[N].changed, by
319*18fd37a7SXin LI    storing 1 in the element for each line that is an insertion or deletion.
320*18fd37a7SXin LI 
321*18fd37a7SXin LI    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322*18fd37a7SXin LI 
323*18fd37a7SXin LI    Note that XLIM, YLIM are exclusive bounds.
324*18fd37a7SXin LI    All line numbers are origin-0 and discarded lines are not counted.
325*18fd37a7SXin LI 
326*18fd37a7SXin LI    If FIND_MINIMAL, find a minimal difference no matter how
327*18fd37a7SXin LI    expensive it is.  */
328*18fd37a7SXin LI 
329*18fd37a7SXin LI static void
compareseq(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal)330*18fd37a7SXin LI compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331*18fd37a7SXin LI {
332*18fd37a7SXin LI   lin const *xv = xvec; /* Help the compiler.  */
333*18fd37a7SXin LI   lin const *yv = yvec;
334*18fd37a7SXin LI 
335*18fd37a7SXin LI   /* Slide down the bottom initial diagonal. */
336*18fd37a7SXin LI   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337*18fd37a7SXin LI     ++xoff, ++yoff;
338*18fd37a7SXin LI   /* Slide up the top initial diagonal. */
339*18fd37a7SXin LI   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340*18fd37a7SXin LI     --xlim, --ylim;
341*18fd37a7SXin LI 
342*18fd37a7SXin LI   /* Handle simple cases. */
343*18fd37a7SXin LI   if (xoff == xlim)
344*18fd37a7SXin LI     while (yoff < ylim)
345*18fd37a7SXin LI       files[1].changed[files[1].realindexes[yoff++]] = 1;
346*18fd37a7SXin LI   else if (yoff == ylim)
347*18fd37a7SXin LI     while (xoff < xlim)
348*18fd37a7SXin LI       files[0].changed[files[0].realindexes[xoff++]] = 1;
349*18fd37a7SXin LI   else
350*18fd37a7SXin LI     {
351*18fd37a7SXin LI       struct partition part;
352*18fd37a7SXin LI 
353*18fd37a7SXin LI       /* Find a point of correspondence in the middle of the files.  */
354*18fd37a7SXin LI       diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355*18fd37a7SXin LI 
356*18fd37a7SXin LI       /* Use the partitions to split this problem into subproblems.  */
357*18fd37a7SXin LI       compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358*18fd37a7SXin LI       compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359*18fd37a7SXin LI     }
360*18fd37a7SXin LI }
361*18fd37a7SXin LI 
362*18fd37a7SXin LI /* Discard lines from one file that have no matches in the other file.
363*18fd37a7SXin LI 
364*18fd37a7SXin LI    A line which is discarded will not be considered by the actual
365*18fd37a7SXin LI    comparison algorithm; it will be as if that line were not in the file.
366*18fd37a7SXin LI    The file's `realindexes' table maps virtual line numbers
367*18fd37a7SXin LI    (which don't count the discarded lines) into real line numbers;
368*18fd37a7SXin LI    this is how the actual comparison algorithm produces results
369*18fd37a7SXin LI    that are comprehensible when the discarded lines are counted.
370*18fd37a7SXin LI 
371*18fd37a7SXin LI    When we discard a line, we also mark it as a deletion or insertion
372*18fd37a7SXin LI    so that it will be printed in the output.  */
373*18fd37a7SXin LI 
374*18fd37a7SXin LI static void
discard_confusing_lines(struct file_data filevec[])375*18fd37a7SXin LI discard_confusing_lines (struct file_data filevec[])
376*18fd37a7SXin LI {
377*18fd37a7SXin LI   int f;
378*18fd37a7SXin LI   lin i;
379*18fd37a7SXin LI   char *discarded[2];
380*18fd37a7SXin LI   lin *equiv_count[2];
381*18fd37a7SXin LI   lin *p;
382*18fd37a7SXin LI 
383*18fd37a7SXin LI   /* Allocate our results.  */
384*18fd37a7SXin LI   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385*18fd37a7SXin LI 	       * (2 * sizeof *p));
386*18fd37a7SXin LI   for (f = 0; f < 2; f++)
387*18fd37a7SXin LI     {
388*18fd37a7SXin LI       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
389*18fd37a7SXin LI       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
390*18fd37a7SXin LI     }
391*18fd37a7SXin LI 
392*18fd37a7SXin LI   /* Set up equiv_count[F][I] as the number of lines in file F
393*18fd37a7SXin LI      that fall in equivalence class I.  */
394*18fd37a7SXin LI 
395*18fd37a7SXin LI   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396*18fd37a7SXin LI   equiv_count[0] = p;
397*18fd37a7SXin LI   equiv_count[1] = p + filevec[0].equiv_max;
398*18fd37a7SXin LI 
399*18fd37a7SXin LI   for (i = 0; i < filevec[0].buffered_lines; ++i)
400*18fd37a7SXin LI     ++equiv_count[0][filevec[0].equivs[i]];
401*18fd37a7SXin LI   for (i = 0; i < filevec[1].buffered_lines; ++i)
402*18fd37a7SXin LI     ++equiv_count[1][filevec[1].equivs[i]];
403*18fd37a7SXin LI 
404*18fd37a7SXin LI   /* Set up tables of which lines are going to be discarded.  */
405*18fd37a7SXin LI 
406*18fd37a7SXin LI   discarded[0] = zalloc (filevec[0].buffered_lines
407*18fd37a7SXin LI 			 + filevec[1].buffered_lines);
408*18fd37a7SXin LI   discarded[1] = discarded[0] + filevec[0].buffered_lines;
409*18fd37a7SXin LI 
410*18fd37a7SXin LI   /* Mark to be discarded each line that matches no line of the other file.
411*18fd37a7SXin LI      If a line matches many lines, mark it as provisionally discardable.  */
412*18fd37a7SXin LI 
413*18fd37a7SXin LI   for (f = 0; f < 2; f++)
414*18fd37a7SXin LI     {
415*18fd37a7SXin LI       size_t end = filevec[f].buffered_lines;
416*18fd37a7SXin LI       char *discards = discarded[f];
417*18fd37a7SXin LI       lin *counts = equiv_count[1 - f];
418*18fd37a7SXin LI       lin *equivs = filevec[f].equivs;
419*18fd37a7SXin LI       size_t many = 5;
420*18fd37a7SXin LI       size_t tem = end / 64;
421*18fd37a7SXin LI 
422*18fd37a7SXin LI       /* Multiply MANY by approximate square root of number of lines.
423*18fd37a7SXin LI 	 That is the threshold for provisionally discardable lines.  */
424*18fd37a7SXin LI       while ((tem = tem >> 2) > 0)
425*18fd37a7SXin LI 	many *= 2;
426*18fd37a7SXin LI 
427*18fd37a7SXin LI       for (i = 0; i < end; i++)
428*18fd37a7SXin LI 	{
429*18fd37a7SXin LI 	  lin nmatch;
430*18fd37a7SXin LI 	  if (equivs[i] == 0)
431*18fd37a7SXin LI 	    continue;
432*18fd37a7SXin LI 	  nmatch = counts[equivs[i]];
433*18fd37a7SXin LI 	  if (nmatch == 0)
434*18fd37a7SXin LI 	    discards[i] = 1;
435*18fd37a7SXin LI 	  else if (nmatch > many)
436*18fd37a7SXin LI 	    discards[i] = 2;
437*18fd37a7SXin LI 	}
438*18fd37a7SXin LI     }
439*18fd37a7SXin LI 
440*18fd37a7SXin LI   /* Don't really discard the provisional lines except when they occur
441*18fd37a7SXin LI      in a run of discardables, with nonprovisionals at the beginning
442*18fd37a7SXin LI      and end.  */
443*18fd37a7SXin LI 
444*18fd37a7SXin LI   for (f = 0; f < 2; f++)
445*18fd37a7SXin LI     {
446*18fd37a7SXin LI       lin end = filevec[f].buffered_lines;
447*18fd37a7SXin LI       register char *discards = discarded[f];
448*18fd37a7SXin LI 
449*18fd37a7SXin LI       for (i = 0; i < end; i++)
450*18fd37a7SXin LI 	{
451*18fd37a7SXin LI 	  /* Cancel provisional discards not in middle of run of discards.  */
452*18fd37a7SXin LI 	  if (discards[i] == 2)
453*18fd37a7SXin LI 	    discards[i] = 0;
454*18fd37a7SXin LI 	  else if (discards[i] != 0)
455*18fd37a7SXin LI 	    {
456*18fd37a7SXin LI 	      /* We have found a nonprovisional discard.  */
457*18fd37a7SXin LI 	      register lin j;
458*18fd37a7SXin LI 	      lin length;
459*18fd37a7SXin LI 	      lin provisional = 0;
460*18fd37a7SXin LI 
461*18fd37a7SXin LI 	      /* Find end of this run of discardable lines.
462*18fd37a7SXin LI 		 Count how many are provisionally discardable.  */
463*18fd37a7SXin LI 	      for (j = i; j < end; j++)
464*18fd37a7SXin LI 		{
465*18fd37a7SXin LI 		  if (discards[j] == 0)
466*18fd37a7SXin LI 		    break;
467*18fd37a7SXin LI 		  if (discards[j] == 2)
468*18fd37a7SXin LI 		    ++provisional;
469*18fd37a7SXin LI 		}
470*18fd37a7SXin LI 
471*18fd37a7SXin LI 	      /* Cancel provisional discards at end, and shrink the run.  */
472*18fd37a7SXin LI 	      while (j > i && discards[j - 1] == 2)
473*18fd37a7SXin LI 		discards[--j] = 0, --provisional;
474*18fd37a7SXin LI 
475*18fd37a7SXin LI 	      /* Now we have the length of a run of discardable lines
476*18fd37a7SXin LI 		 whose first and last are not provisional.  */
477*18fd37a7SXin LI 	      length = j - i;
478*18fd37a7SXin LI 
479*18fd37a7SXin LI 	      /* If 1/4 of the lines in the run are provisional,
480*18fd37a7SXin LI 		 cancel discarding of all provisional lines in the run.  */
481*18fd37a7SXin LI 	      if (provisional * 4 > length)
482*18fd37a7SXin LI 		{
483*18fd37a7SXin LI 		  while (j > i)
484*18fd37a7SXin LI 		    if (discards[--j] == 2)
485*18fd37a7SXin LI 		      discards[j] = 0;
486*18fd37a7SXin LI 		}
487*18fd37a7SXin LI 	      else
488*18fd37a7SXin LI 		{
489*18fd37a7SXin LI 		  register lin consec;
490*18fd37a7SXin LI 		  lin minimum = 1;
491*18fd37a7SXin LI 		  lin tem = length >> 2;
492*18fd37a7SXin LI 
493*18fd37a7SXin LI 		  /* MINIMUM is approximate square root of LENGTH/4.
494*18fd37a7SXin LI 		     A subrun of two or more provisionals can stand
495*18fd37a7SXin LI 		     when LENGTH is at least 16.
496*18fd37a7SXin LI 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
497*18fd37a7SXin LI 		  while (0 < (tem >>= 2))
498*18fd37a7SXin LI 		    minimum <<= 1;
499*18fd37a7SXin LI 		  minimum++;
500*18fd37a7SXin LI 
501*18fd37a7SXin LI 		  /* Cancel any subrun of MINIMUM or more provisionals
502*18fd37a7SXin LI 		     within the larger run.  */
503*18fd37a7SXin LI 		  for (j = 0, consec = 0; j < length; j++)
504*18fd37a7SXin LI 		    if (discards[i + j] != 2)
505*18fd37a7SXin LI 		      consec = 0;
506*18fd37a7SXin LI 		    else if (minimum == ++consec)
507*18fd37a7SXin LI 		      /* Back up to start of subrun, to cancel it all.  */
508*18fd37a7SXin LI 		      j -= consec;
509*18fd37a7SXin LI 		    else if (minimum < consec)
510*18fd37a7SXin LI 		      discards[i + j] = 0;
511*18fd37a7SXin LI 
512*18fd37a7SXin LI 		  /* Scan from beginning of run
513*18fd37a7SXin LI 		     until we find 3 or more nonprovisionals in a row
514*18fd37a7SXin LI 		     or until the first nonprovisional at least 8 lines in.
515*18fd37a7SXin LI 		     Until that point, cancel any provisionals.  */
516*18fd37a7SXin LI 		  for (j = 0, consec = 0; j < length; j++)
517*18fd37a7SXin LI 		    {
518*18fd37a7SXin LI 		      if (j >= 8 && discards[i + j] == 1)
519*18fd37a7SXin LI 			break;
520*18fd37a7SXin LI 		      if (discards[i + j] == 2)
521*18fd37a7SXin LI 			consec = 0, discards[i + j] = 0;
522*18fd37a7SXin LI 		      else if (discards[i + j] == 0)
523*18fd37a7SXin LI 			consec = 0;
524*18fd37a7SXin LI 		      else
525*18fd37a7SXin LI 			consec++;
526*18fd37a7SXin LI 		      if (consec == 3)
527*18fd37a7SXin LI 			break;
528*18fd37a7SXin LI 		    }
529*18fd37a7SXin LI 
530*18fd37a7SXin LI 		  /* I advances to the last line of the run.  */
531*18fd37a7SXin LI 		  i += length - 1;
532*18fd37a7SXin LI 
533*18fd37a7SXin LI 		  /* Same thing, from end.  */
534*18fd37a7SXin LI 		  for (j = 0, consec = 0; j < length; j++)
535*18fd37a7SXin LI 		    {
536*18fd37a7SXin LI 		      if (j >= 8 && discards[i - j] == 1)
537*18fd37a7SXin LI 			break;
538*18fd37a7SXin LI 		      if (discards[i - j] == 2)
539*18fd37a7SXin LI 			consec = 0, discards[i - j] = 0;
540*18fd37a7SXin LI 		      else if (discards[i - j] == 0)
541*18fd37a7SXin LI 			consec = 0;
542*18fd37a7SXin LI 		      else
543*18fd37a7SXin LI 			consec++;
544*18fd37a7SXin LI 		      if (consec == 3)
545*18fd37a7SXin LI 			break;
546*18fd37a7SXin LI 		    }
547*18fd37a7SXin LI 		}
548*18fd37a7SXin LI 	    }
549*18fd37a7SXin LI 	}
550*18fd37a7SXin LI     }
551*18fd37a7SXin LI 
552*18fd37a7SXin LI   /* Actually discard the lines. */
553*18fd37a7SXin LI   for (f = 0; f < 2; f++)
554*18fd37a7SXin LI     {
555*18fd37a7SXin LI       char *discards = discarded[f];
556*18fd37a7SXin LI       lin end = filevec[f].buffered_lines;
557*18fd37a7SXin LI       lin j = 0;
558*18fd37a7SXin LI       for (i = 0; i < end; ++i)
559*18fd37a7SXin LI 	if (minimal || discards[i] == 0)
560*18fd37a7SXin LI 	  {
561*18fd37a7SXin LI 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
562*18fd37a7SXin LI 	    filevec[f].realindexes[j++] = i;
563*18fd37a7SXin LI 	  }
564*18fd37a7SXin LI 	else
565*18fd37a7SXin LI 	  filevec[f].changed[i] = 1;
566*18fd37a7SXin LI       filevec[f].nondiscarded_lines = j;
567*18fd37a7SXin LI     }
568*18fd37a7SXin LI 
569*18fd37a7SXin LI   free (discarded[0]);
570*18fd37a7SXin LI   free (equiv_count[0]);
571*18fd37a7SXin LI }
572*18fd37a7SXin LI 
573*18fd37a7SXin LI /* Adjust inserts/deletes of identical lines to join changes
574*18fd37a7SXin LI    as much as possible.
575*18fd37a7SXin LI 
576*18fd37a7SXin LI    We do something when a run of changed lines include a
577*18fd37a7SXin LI    line at one end and have an excluded, identical line at the other.
578*18fd37a7SXin LI    We are free to choose which identical line is included.
579*18fd37a7SXin LI    `compareseq' usually chooses the one at the beginning,
580*18fd37a7SXin LI    but usually it is cleaner to consider the following identical line
581*18fd37a7SXin LI    to be the "change".  */
582*18fd37a7SXin LI 
583*18fd37a7SXin LI static void
shift_boundaries(struct file_data filevec[])584*18fd37a7SXin LI shift_boundaries (struct file_data filevec[])
585*18fd37a7SXin LI {
586*18fd37a7SXin LI   int f;
587*18fd37a7SXin LI 
588*18fd37a7SXin LI   for (f = 0; f < 2; f++)
589*18fd37a7SXin LI     {
590*18fd37a7SXin LI       char *changed = filevec[f].changed;
591*18fd37a7SXin LI       char *other_changed = filevec[1 - f].changed;
592*18fd37a7SXin LI       lin const *equivs = filevec[f].equivs;
593*18fd37a7SXin LI       lin i = 0;
594*18fd37a7SXin LI       lin j = 0;
595*18fd37a7SXin LI       lin i_end = filevec[f].buffered_lines;
596*18fd37a7SXin LI 
597*18fd37a7SXin LI       while (1)
598*18fd37a7SXin LI 	{
599*18fd37a7SXin LI 	  lin runlength, start, corresponding;
600*18fd37a7SXin LI 
601*18fd37a7SXin LI 	  /* Scan forwards to find beginning of another run of changes.
602*18fd37a7SXin LI 	     Also keep track of the corresponding point in the other file.  */
603*18fd37a7SXin LI 
604*18fd37a7SXin LI 	  while (i < i_end && !changed[i])
605*18fd37a7SXin LI 	    {
606*18fd37a7SXin LI 	      while (other_changed[j++])
607*18fd37a7SXin LI 		continue;
608*18fd37a7SXin LI 	      i++;
609*18fd37a7SXin LI 	    }
610*18fd37a7SXin LI 
611*18fd37a7SXin LI 	  if (i == i_end)
612*18fd37a7SXin LI 	    break;
613*18fd37a7SXin LI 
614*18fd37a7SXin LI 	  start = i;
615*18fd37a7SXin LI 
616*18fd37a7SXin LI 	  /* Find the end of this run of changes.  */
617*18fd37a7SXin LI 
618*18fd37a7SXin LI 	  while (changed[++i])
619*18fd37a7SXin LI 	    continue;
620*18fd37a7SXin LI 	  while (other_changed[j])
621*18fd37a7SXin LI 	    j++;
622*18fd37a7SXin LI 
623*18fd37a7SXin LI 	  do
624*18fd37a7SXin LI 	    {
625*18fd37a7SXin LI 	      /* Record the length of this run of changes, so that
626*18fd37a7SXin LI 		 we can later determine whether the run has grown.  */
627*18fd37a7SXin LI 	      runlength = i - start;
628*18fd37a7SXin LI 
629*18fd37a7SXin LI 	      /* Move the changed region back, so long as the
630*18fd37a7SXin LI 		 previous unchanged line matches the last changed one.
631*18fd37a7SXin LI 		 This merges with previous changed regions.  */
632*18fd37a7SXin LI 
633*18fd37a7SXin LI 	      while (start && equivs[start - 1] == equivs[i - 1])
634*18fd37a7SXin LI 		{
635*18fd37a7SXin LI 		  changed[--start] = 1;
636*18fd37a7SXin LI 		  changed[--i] = 0;
637*18fd37a7SXin LI 		  while (changed[start - 1])
638*18fd37a7SXin LI 		    start--;
639*18fd37a7SXin LI 		  while (other_changed[--j])
640*18fd37a7SXin LI 		    continue;
641*18fd37a7SXin LI 		}
642*18fd37a7SXin LI 
643*18fd37a7SXin LI 	      /* Set CORRESPONDING to the end of the changed run, at the last
644*18fd37a7SXin LI 		 point where it corresponds to a changed run in the other file.
645*18fd37a7SXin LI 		 CORRESPONDING == I_END means no such point has been found.  */
646*18fd37a7SXin LI 	      corresponding = other_changed[j - 1] ? i : i_end;
647*18fd37a7SXin LI 
648*18fd37a7SXin LI 	      /* Move the changed region forward, so long as the
649*18fd37a7SXin LI 		 first changed line matches the following unchanged one.
650*18fd37a7SXin LI 		 This merges with following changed regions.
651*18fd37a7SXin LI 		 Do this second, so that if there are no merges,
652*18fd37a7SXin LI 		 the changed region is moved forward as far as possible.  */
653*18fd37a7SXin LI 
654*18fd37a7SXin LI 	      while (i != i_end && equivs[start] == equivs[i])
655*18fd37a7SXin LI 		{
656*18fd37a7SXin LI 		  changed[start++] = 0;
657*18fd37a7SXin LI 		  changed[i++] = 1;
658*18fd37a7SXin LI 		  while (changed[i])
659*18fd37a7SXin LI 		    i++;
660*18fd37a7SXin LI 		  while (other_changed[++j])
661*18fd37a7SXin LI 		    corresponding = i;
662*18fd37a7SXin LI 		}
663*18fd37a7SXin LI 	    }
664*18fd37a7SXin LI 	  while (runlength != i - start);
665*18fd37a7SXin LI 
666*18fd37a7SXin LI 	  /* If possible, move the fully-merged run of changes
667*18fd37a7SXin LI 	     back to a corresponding run in the other file.  */
668*18fd37a7SXin LI 
669*18fd37a7SXin LI 	  while (corresponding < i)
670*18fd37a7SXin LI 	    {
671*18fd37a7SXin LI 	      changed[--start] = 1;
672*18fd37a7SXin LI 	      changed[--i] = 0;
673*18fd37a7SXin LI 	      while (other_changed[--j])
674*18fd37a7SXin LI 		continue;
675*18fd37a7SXin LI 	    }
676*18fd37a7SXin LI 	}
677*18fd37a7SXin LI     }
678*18fd37a7SXin LI }
679*18fd37a7SXin LI 
680*18fd37a7SXin LI /* Cons an additional entry onto the front of an edit script OLD.
681*18fd37a7SXin LI    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682*18fd37a7SXin LI    DELETED is the number of lines deleted here from file 0.
683*18fd37a7SXin LI    INSERTED is the number of lines inserted here in file 1.
684*18fd37a7SXin LI 
685*18fd37a7SXin LI    If DELETED is 0 then LINE0 is the number of the line before
686*18fd37a7SXin LI    which the insertion was done; vice versa for INSERTED and LINE1.  */
687*18fd37a7SXin LI 
688*18fd37a7SXin LI static struct change *
add_change(lin line0,lin line1,lin deleted,lin inserted,struct change * old)689*18fd37a7SXin LI add_change (lin line0, lin line1, lin deleted, lin inserted,
690*18fd37a7SXin LI 	    struct change *old)
691*18fd37a7SXin LI {
692*18fd37a7SXin LI   struct change *new = xmalloc (sizeof *new);
693*18fd37a7SXin LI 
694*18fd37a7SXin LI   new->line0 = line0;
695*18fd37a7SXin LI   new->line1 = line1;
696*18fd37a7SXin LI   new->inserted = inserted;
697*18fd37a7SXin LI   new->deleted = deleted;
698*18fd37a7SXin LI   new->link = old;
699*18fd37a7SXin LI   return new;
700*18fd37a7SXin LI }
701*18fd37a7SXin LI 
702*18fd37a7SXin LI /* Scan the tables of which lines are inserted and deleted,
703*18fd37a7SXin LI    producing an edit script in reverse order.  */
704*18fd37a7SXin LI 
705*18fd37a7SXin LI static struct change *
build_reverse_script(struct file_data const filevec[])706*18fd37a7SXin LI build_reverse_script (struct file_data const filevec[])
707*18fd37a7SXin LI {
708*18fd37a7SXin LI   struct change *script = 0;
709*18fd37a7SXin LI   char *changed0 = filevec[0].changed;
710*18fd37a7SXin LI   char *changed1 = filevec[1].changed;
711*18fd37a7SXin LI   lin len0 = filevec[0].buffered_lines;
712*18fd37a7SXin LI   lin len1 = filevec[1].buffered_lines;
713*18fd37a7SXin LI 
714*18fd37a7SXin LI   /* Note that changedN[len0] does exist, and is 0.  */
715*18fd37a7SXin LI 
716*18fd37a7SXin LI   lin i0 = 0, i1 = 0;
717*18fd37a7SXin LI 
718*18fd37a7SXin LI   while (i0 < len0 || i1 < len1)
719*18fd37a7SXin LI     {
720*18fd37a7SXin LI       if (changed0[i0] | changed1[i1])
721*18fd37a7SXin LI 	{
722*18fd37a7SXin LI 	  lin line0 = i0, line1 = i1;
723*18fd37a7SXin LI 
724*18fd37a7SXin LI 	  /* Find # lines changed here in each file.  */
725*18fd37a7SXin LI 	  while (changed0[i0]) ++i0;
726*18fd37a7SXin LI 	  while (changed1[i1]) ++i1;
727*18fd37a7SXin LI 
728*18fd37a7SXin LI 	  /* Record this change.  */
729*18fd37a7SXin LI 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730*18fd37a7SXin LI 	}
731*18fd37a7SXin LI 
732*18fd37a7SXin LI       /* We have reached lines in the two files that match each other.  */
733*18fd37a7SXin LI       i0++, i1++;
734*18fd37a7SXin LI     }
735*18fd37a7SXin LI 
736*18fd37a7SXin LI   return script;
737*18fd37a7SXin LI }
738*18fd37a7SXin LI 
739*18fd37a7SXin LI /* Scan the tables of which lines are inserted and deleted,
740*18fd37a7SXin LI    producing an edit script in forward order.  */
741*18fd37a7SXin LI 
742*18fd37a7SXin LI static struct change *
build_script(struct file_data const filevec[])743*18fd37a7SXin LI build_script (struct file_data const filevec[])
744*18fd37a7SXin LI {
745*18fd37a7SXin LI   struct change *script = 0;
746*18fd37a7SXin LI   char *changed0 = filevec[0].changed;
747*18fd37a7SXin LI   char *changed1 = filevec[1].changed;
748*18fd37a7SXin LI   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749*18fd37a7SXin LI 
750*18fd37a7SXin LI   /* Note that changedN[-1] does exist, and is 0.  */
751*18fd37a7SXin LI 
752*18fd37a7SXin LI   while (i0 >= 0 || i1 >= 0)
753*18fd37a7SXin LI     {
754*18fd37a7SXin LI       if (changed0[i0 - 1] | changed1[i1 - 1])
755*18fd37a7SXin LI 	{
756*18fd37a7SXin LI 	  lin line0 = i0, line1 = i1;
757*18fd37a7SXin LI 
758*18fd37a7SXin LI 	  /* Find # lines changed here in each file.  */
759*18fd37a7SXin LI 	  while (changed0[i0 - 1]) --i0;
760*18fd37a7SXin LI 	  while (changed1[i1 - 1]) --i1;
761*18fd37a7SXin LI 
762*18fd37a7SXin LI 	  /* Record this change.  */
763*18fd37a7SXin LI 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764*18fd37a7SXin LI 	}
765*18fd37a7SXin LI 
766*18fd37a7SXin LI       /* We have reached lines in the two files that match each other.  */
767*18fd37a7SXin LI       i0--, i1--;
768*18fd37a7SXin LI     }
769*18fd37a7SXin LI 
770*18fd37a7SXin LI   return script;
771*18fd37a7SXin LI }
772*18fd37a7SXin LI 
773*18fd37a7SXin LI /* If CHANGES, briefly report that two files differed.
774*18fd37a7SXin LI    Return 2 if trouble, CHANGES otherwise.  */
775*18fd37a7SXin LI static int
briefly_report(int changes,struct file_data const filevec[])776*18fd37a7SXin LI briefly_report (int changes, struct file_data const filevec[])
777*18fd37a7SXin LI {
778*18fd37a7SXin LI   if (changes)
779*18fd37a7SXin LI     {
780*18fd37a7SXin LI       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781*18fd37a7SXin LI       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782*18fd37a7SXin LI       message ("Files %s and %s differ\n", label0, label1);
783*18fd37a7SXin LI       if (! brief)
784*18fd37a7SXin LI 	changes = 2;
785*18fd37a7SXin LI     }
786*18fd37a7SXin LI 
787*18fd37a7SXin LI   return changes;
788*18fd37a7SXin LI }
789*18fd37a7SXin LI 
790*18fd37a7SXin LI /* Report the differences of two files.  */
791*18fd37a7SXin LI int
diff_2_files(struct comparison * cmp)792*18fd37a7SXin LI diff_2_files (struct comparison *cmp)
793*18fd37a7SXin LI {
794*18fd37a7SXin LI   lin diags;
795*18fd37a7SXin LI   int f;
796*18fd37a7SXin LI   struct change *e, *p;
797*18fd37a7SXin LI   struct change *script;
798*18fd37a7SXin LI   int changes;
799*18fd37a7SXin LI 
800*18fd37a7SXin LI 
801*18fd37a7SXin LI   /* If we have detected that either file is binary,
802*18fd37a7SXin LI      compare the two files as binary.  This can happen
803*18fd37a7SXin LI      only when the first chunk is read.
804*18fd37a7SXin LI      Also, --brief without any --ignore-* options means
805*18fd37a7SXin LI      we can speed things up by treating the files as binary.  */
806*18fd37a7SXin LI 
807*18fd37a7SXin LI   if (read_files (cmp->file, files_can_be_treated_as_binary))
808*18fd37a7SXin LI     {
809*18fd37a7SXin LI       /* Files with different lengths must be different.  */
810*18fd37a7SXin LI       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811*18fd37a7SXin LI 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812*18fd37a7SXin LI 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813*18fd37a7SXin LI 	changes = 1;
814*18fd37a7SXin LI 
815*18fd37a7SXin LI       /* Standard input equals itself.  */
816*18fd37a7SXin LI       else if (cmp->file[0].desc == cmp->file[1].desc)
817*18fd37a7SXin LI 	changes = 0;
818*18fd37a7SXin LI 
819*18fd37a7SXin LI       else
820*18fd37a7SXin LI 	/* Scan both files, a buffer at a time, looking for a difference.  */
821*18fd37a7SXin LI 	{
822*18fd37a7SXin LI 	  /* Allocate same-sized buffers for both files.  */
823*18fd37a7SXin LI 	  size_t lcm_max = PTRDIFF_MAX - 1;
824*18fd37a7SXin LI 	  size_t buffer_size =
825*18fd37a7SXin LI 	    buffer_lcm (sizeof (word),
826*18fd37a7SXin LI 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827*18fd37a7SXin LI 				    STAT_BLOCKSIZE (cmp->file[1].stat),
828*18fd37a7SXin LI 				    lcm_max),
829*18fd37a7SXin LI 			lcm_max);
830*18fd37a7SXin LI 	  for (f = 0; f < 2; f++)
831*18fd37a7SXin LI 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832*18fd37a7SXin LI 
833*18fd37a7SXin LI 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834*18fd37a7SXin LI 	    {
835*18fd37a7SXin LI 	      /* Read a buffer's worth from both files.  */
836*18fd37a7SXin LI 	      for (f = 0; f < 2; f++)
837*18fd37a7SXin LI 		if (0 <= cmp->file[f].desc)
838*18fd37a7SXin LI 		  file_block_read (&cmp->file[f],
839*18fd37a7SXin LI 				   buffer_size - cmp->file[f].buffered);
840*18fd37a7SXin LI 
841*18fd37a7SXin LI 	      /* If the buffers differ, the files differ.  */
842*18fd37a7SXin LI 	      if (cmp->file[0].buffered != cmp->file[1].buffered
843*18fd37a7SXin LI 		  || memcmp (cmp->file[0].buffer,
844*18fd37a7SXin LI 			     cmp->file[1].buffer,
845*18fd37a7SXin LI 			     cmp->file[0].buffered))
846*18fd37a7SXin LI 		{
847*18fd37a7SXin LI 		  changes = 1;
848*18fd37a7SXin LI 		  break;
849*18fd37a7SXin LI 		}
850*18fd37a7SXin LI 
851*18fd37a7SXin LI 	      /* If we reach end of file, the files are the same.  */
852*18fd37a7SXin LI 	      if (cmp->file[0].buffered != buffer_size)
853*18fd37a7SXin LI 		{
854*18fd37a7SXin LI 		  changes = 0;
855*18fd37a7SXin LI 		  break;
856*18fd37a7SXin LI 		}
857*18fd37a7SXin LI 	    }
858*18fd37a7SXin LI 	}
859*18fd37a7SXin LI 
860*18fd37a7SXin LI       changes = briefly_report (changes, cmp->file);
861*18fd37a7SXin LI     }
862*18fd37a7SXin LI   else
863*18fd37a7SXin LI     {
864*18fd37a7SXin LI       /* Allocate vectors for the results of comparison:
865*18fd37a7SXin LI 	 a flag for each line of each file, saying whether that line
866*18fd37a7SXin LI 	 is an insertion or deletion.
867*18fd37a7SXin LI 	 Allocate an extra element, always 0, at each end of each vector.  */
868*18fd37a7SXin LI 
869*18fd37a7SXin LI       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870*18fd37a7SXin LI       char *flag_space = zalloc (s);
871*18fd37a7SXin LI       cmp->file[0].changed = flag_space + 1;
872*18fd37a7SXin LI       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873*18fd37a7SXin LI 
874*18fd37a7SXin LI       /* Some lines are obviously insertions or deletions
875*18fd37a7SXin LI 	 because they don't match anything.  Detect them now, and
876*18fd37a7SXin LI 	 avoid even thinking about them in the main comparison algorithm.  */
877*18fd37a7SXin LI 
878*18fd37a7SXin LI       discard_confusing_lines (cmp->file);
879*18fd37a7SXin LI 
880*18fd37a7SXin LI       /* Now do the main comparison algorithm, considering just the
881*18fd37a7SXin LI 	 undiscarded lines.  */
882*18fd37a7SXin LI 
883*18fd37a7SXin LI       xvec = cmp->file[0].undiscarded;
884*18fd37a7SXin LI       yvec = cmp->file[1].undiscarded;
885*18fd37a7SXin LI       diags = (cmp->file[0].nondiscarded_lines
886*18fd37a7SXin LI 	       + cmp->file[1].nondiscarded_lines + 3);
887*18fd37a7SXin LI       fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888*18fd37a7SXin LI       bdiag = fdiag + diags;
889*18fd37a7SXin LI       fdiag += cmp->file[1].nondiscarded_lines + 1;
890*18fd37a7SXin LI       bdiag += cmp->file[1].nondiscarded_lines + 1;
891*18fd37a7SXin LI 
892*18fd37a7SXin LI       /* Set TOO_EXPENSIVE to be approximate square root of input size,
893*18fd37a7SXin LI 	 bounded below by 256.  */
894*18fd37a7SXin LI       too_expensive = 1;
895*18fd37a7SXin LI       for (;  diags != 0;  diags >>= 2)
896*18fd37a7SXin LI 	too_expensive <<= 1;
897*18fd37a7SXin LI       too_expensive = MAX (256, too_expensive);
898*18fd37a7SXin LI 
899*18fd37a7SXin LI       files[0] = cmp->file[0];
900*18fd37a7SXin LI       files[1] = cmp->file[1];
901*18fd37a7SXin LI 
902*18fd37a7SXin LI       compareseq (0, cmp->file[0].nondiscarded_lines,
903*18fd37a7SXin LI 		  0, cmp->file[1].nondiscarded_lines, minimal);
904*18fd37a7SXin LI 
905*18fd37a7SXin LI       free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906*18fd37a7SXin LI 
907*18fd37a7SXin LI       /* Modify the results slightly to make them prettier
908*18fd37a7SXin LI 	 in cases where that can validly be done.  */
909*18fd37a7SXin LI 
910*18fd37a7SXin LI       shift_boundaries (cmp->file);
911*18fd37a7SXin LI 
912*18fd37a7SXin LI       /* Get the results of comparison in the form of a chain
913*18fd37a7SXin LI 	 of `struct change's -- an edit script.  */
914*18fd37a7SXin LI 
915*18fd37a7SXin LI       if (output_style == OUTPUT_ED)
916*18fd37a7SXin LI 	script = build_reverse_script (cmp->file);
917*18fd37a7SXin LI       else
918*18fd37a7SXin LI 	script = build_script (cmp->file);
919*18fd37a7SXin LI 
920*18fd37a7SXin LI       /* Set CHANGES if we had any diffs.
921*18fd37a7SXin LI 	 If some changes are ignored, we must scan the script to decide.  */
922*18fd37a7SXin LI       if (ignore_blank_lines || ignore_regexp.fastmap)
923*18fd37a7SXin LI 	{
924*18fd37a7SXin LI 	  struct change *next = script;
925*18fd37a7SXin LI 	  changes = 0;
926*18fd37a7SXin LI 
927*18fd37a7SXin LI 	  while (next && changes == 0)
928*18fd37a7SXin LI 	    {
929*18fd37a7SXin LI 	      struct change *this, *end;
930*18fd37a7SXin LI 	      lin first0, last0, first1, last1;
931*18fd37a7SXin LI 
932*18fd37a7SXin LI 	      /* Find a set of changes that belong together.  */
933*18fd37a7SXin LI 	      this = next;
934*18fd37a7SXin LI 	      end = find_change (next);
935*18fd37a7SXin LI 
936*18fd37a7SXin LI 	      /* Disconnect them from the rest of the changes, making them
937*18fd37a7SXin LI 		 a hunk, and remember the rest for next iteration.  */
938*18fd37a7SXin LI 	      next = end->link;
939*18fd37a7SXin LI 	      end->link = 0;
940*18fd37a7SXin LI 
941*18fd37a7SXin LI 	      /* Determine whether this hunk is really a difference.  */
942*18fd37a7SXin LI 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943*18fd37a7SXin LI 		changes = 1;
944*18fd37a7SXin LI 
945*18fd37a7SXin LI 	      /* Reconnect the script so it will all be freed properly.  */
946*18fd37a7SXin LI 	      end->link = next;
947*18fd37a7SXin LI 	    }
948*18fd37a7SXin LI 	}
949*18fd37a7SXin LI       else
950*18fd37a7SXin LI 	changes = (script != 0);
951*18fd37a7SXin LI 
952*18fd37a7SXin LI       if (brief)
953*18fd37a7SXin LI 	changes = briefly_report (changes, cmp->file);
954*18fd37a7SXin LI       else
955*18fd37a7SXin LI 	{
956*18fd37a7SXin LI 	  if (changes | !no_diff_means_no_output)
957*18fd37a7SXin LI 	    {
958*18fd37a7SXin LI 	      /* Record info for starting up output,
959*18fd37a7SXin LI 		 to be used if and when we have some output to print.  */
960*18fd37a7SXin LI 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961*18fd37a7SXin LI 			    file_label[1] ? file_label[1] : cmp->file[1].name,
962*18fd37a7SXin LI 			    cmp->parent != 0);
963*18fd37a7SXin LI 
964*18fd37a7SXin LI 	      switch (output_style)
965*18fd37a7SXin LI 		{
966*18fd37a7SXin LI 		case OUTPUT_CONTEXT:
967*18fd37a7SXin LI 		  print_context_script (script, false);
968*18fd37a7SXin LI 		  break;
969*18fd37a7SXin LI 
970*18fd37a7SXin LI 		case OUTPUT_UNIFIED:
971*18fd37a7SXin LI 		  print_context_script (script, true);
972*18fd37a7SXin LI 		  break;
973*18fd37a7SXin LI 
974*18fd37a7SXin LI 		case OUTPUT_ED:
975*18fd37a7SXin LI 		  print_ed_script (script);
976*18fd37a7SXin LI 		  break;
977*18fd37a7SXin LI 
978*18fd37a7SXin LI 		case OUTPUT_FORWARD_ED:
979*18fd37a7SXin LI 		  pr_forward_ed_script (script);
980*18fd37a7SXin LI 		  break;
981*18fd37a7SXin LI 
982*18fd37a7SXin LI 		case OUTPUT_RCS:
983*18fd37a7SXin LI 		  print_rcs_script (script);
984*18fd37a7SXin LI 		  break;
985*18fd37a7SXin LI 
986*18fd37a7SXin LI 		case OUTPUT_NORMAL:
987*18fd37a7SXin LI 		  print_normal_script (script);
988*18fd37a7SXin LI 		  break;
989*18fd37a7SXin LI 
990*18fd37a7SXin LI 		case OUTPUT_IFDEF:
991*18fd37a7SXin LI 		  print_ifdef_script (script);
992*18fd37a7SXin LI 		  break;
993*18fd37a7SXin LI 
994*18fd37a7SXin LI 		case OUTPUT_SDIFF:
995*18fd37a7SXin LI 		  print_sdiff_script (script);
996*18fd37a7SXin LI 		  break;
997*18fd37a7SXin LI 
998*18fd37a7SXin LI 		default:
999*18fd37a7SXin LI 		  abort ();
1000*18fd37a7SXin LI 		}
1001*18fd37a7SXin LI 
1002*18fd37a7SXin LI 	      finish_output ();
1003*18fd37a7SXin LI 	    }
1004*18fd37a7SXin LI 	}
1005*18fd37a7SXin LI 
1006*18fd37a7SXin LI       free (cmp->file[0].undiscarded);
1007*18fd37a7SXin LI 
1008*18fd37a7SXin LI       free (flag_space);
1009*18fd37a7SXin LI 
1010*18fd37a7SXin LI       for (f = 0; f < 2; f++)
1011*18fd37a7SXin LI 	{
1012*18fd37a7SXin LI 	  free (cmp->file[f].equivs);
1013*18fd37a7SXin LI 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014*18fd37a7SXin LI 	}
1015*18fd37a7SXin LI 
1016*18fd37a7SXin LI       for (e = script; e; e = p)
1017*18fd37a7SXin LI 	{
1018*18fd37a7SXin LI 	  p = e->link;
1019*18fd37a7SXin LI 	  free (e);
1020*18fd37a7SXin LI 	}
1021*18fd37a7SXin LI 
1022*18fd37a7SXin LI       if (! ROBUST_OUTPUT_STYLE (output_style))
1023*18fd37a7SXin LI 	for (f = 0; f < 2; ++f)
1024*18fd37a7SXin LI 	  if (cmp->file[f].missing_newline)
1025*18fd37a7SXin LI 	    {
1026*18fd37a7SXin LI 	      error (0, 0, "%s: %s\n",
1027*18fd37a7SXin LI 		     file_label[f] ? file_label[f] : cmp->file[f].name,
1028*18fd37a7SXin LI 		     _("No newline at end of file"));
1029*18fd37a7SXin LI 	      changes = 2;
1030*18fd37a7SXin LI 	    }
1031*18fd37a7SXin LI     }
1032*18fd37a7SXin LI 
1033*18fd37a7SXin LI   if (cmp->file[0].buffer != cmp->file[1].buffer)
1034*18fd37a7SXin LI     free (cmp->file[0].buffer);
1035*18fd37a7SXin LI   free (cmp->file[1].buffer);
1036*18fd37a7SXin LI 
1037*18fd37a7SXin LI   return changes;
1038*18fd37a7SXin LI }
1039