xref: /netbsd-src/external/gpl2/diffutils/dist/src/analyze.c (revision 75f6d617e282811cb173c2ccfbf5df0dd71f7045)
1 /*	$NetBSD: analyze.c,v 1.1.1.1 2016/01/13 03:15:30 christos Exp $	*/
2 
3 /* Analyze file differences for GNU DIFF.
4 
5    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
6    Free Software Foundation, Inc.
7 
8    This file is part of GNU DIFF.
9 
10    GNU DIFF is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14 
15    GNU DIFF is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; see the file COPYING.
22    If not, write to the Free Software Foundation,
23    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
24 
25 /* The basic algorithm is described in:
26    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
27    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
28    see especially section 4.2, which describes the variation used below.
29    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
30    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
31    at the price of producing suboptimal output for large inputs with
32    many differences.
33 
34    The basic algorithm was independently discovered as described in:
35    "Algorithms for Approximate String Matching", E. Ukkonen,
36    Information and Control Vol. 64, 1985, pp. 100-118.  */
37 
38 #include "diff.h"
39 #include <cmpbuf.h>
40 #include <error.h>
41 #include <regex.h>
42 #include <xalloc.h>
43 
44 static lin *xvec, *yvec;	/* Vectors being compared. */
45 static lin *fdiag;		/* Vector, indexed by diagonal, containing
46 				   1 + the X coordinate of the point furthest
47 				   along the given diagonal in the forward
48 				   search of the edit matrix. */
49 static lin *bdiag;		/* Vector, indexed by diagonal, containing
50 				   the X coordinate of the point furthest
51 				   along the given diagonal in the backward
52 				   search of the edit matrix. */
53 static lin too_expensive;	/* Edit scripts longer than this are too
54 				   expensive to compute.  */
55 
56 #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
57 
58 struct partition
59 {
60   lin xmid, ymid;	/* Midpoints of this partition.  */
61   bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
62   bool hi_minimal;	/* Likewise for high half.  */
63 };
64 
65 /* Find the midpoint of the shortest edit script for a specified
66    portion of the two files.
67 
68    Scan from the beginnings of the files, and simultaneously from the ends,
69    doing a breadth-first search through the space of edit-sequence.
70    When the two searches meet, we have found the midpoint of the shortest
71    edit sequence.
72 
73    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
74    of expense.  Otherwise, if the search is too expensive, use
75    heuristics to stop the search and report a suboptimal answer.
76 
77    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
78    XMID - YMID equals the number of inserted lines minus the number
79    of deleted lines (counting only lines before the midpoint).
80    Return the approximate edit cost; this is the total number of
81    lines inserted or deleted (counting only lines before the midpoint),
82    unless a heuristic is used to terminate the search prematurely.
83 
84    Set PART->lo_minimal to true iff the minimal edit script for the
85    left half of the partition is known; similarly for PART->hi_minimal.
86 
87    This function assumes that the first lines of the specified portions
88    of the two files do not match, and likewise that the last lines do not
89    match.  The caller must trim matching lines from the beginning and end
90    of the portions it is going to specify.
91 
92    If we return the "wrong" partitions,
93    the worst this can do is cause suboptimal diff output.
94    It cannot cause incorrect diff output.  */
95 
96 static lin
diag(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal,struct partition * part)97 diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
98       struct partition *part)
99 {
100   lin *const fd = fdiag;	/* Give the compiler a chance. */
101   lin *const bd = bdiag;	/* Additional help for the compiler. */
102   lin const *const xv = xvec;	/* Still more help for the compiler. */
103   lin const *const yv = yvec;	/* And more and more . . . */
104   lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
105   lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
106   lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
107   lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
108   lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
109   lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
110   lin c;			/* Cost. */
111   bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
112 				   diagonal with respect to the northwest. */
113 
114   fd[fmid] = xoff;
115   bd[bmid] = xlim;
116 
117   for (c = 1;; ++c)
118     {
119       lin d;			/* Active diagonal. */
120       bool big_snake = 0;
121 
122       /* Extend the top-down search by an edit step in each diagonal. */
123       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
124       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
125       for (d = fmax; d >= fmin; d -= 2)
126 	{
127 	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
128 
129 	  if (tlo >= thi)
130 	    x = tlo + 1;
131 	  else
132 	    x = thi;
133 	  oldx = x;
134 	  y = x - d;
135 	  while (x < xlim && y < ylim && xv[x] == yv[y])
136 	    ++x, ++y;
137 	  if (x - oldx > SNAKE_LIMIT)
138 	    big_snake = 1;
139 	  fd[d] = x;
140 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
141 	    {
142 	      part->xmid = x;
143 	      part->ymid = y;
144 	      part->lo_minimal = part->hi_minimal = 1;
145 	      return 2 * c - 1;
146 	    }
147 	}
148 
149       /* Similarly extend the bottom-up search.  */
150       bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
151       bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
152       for (d = bmax; d >= bmin; d -= 2)
153 	{
154 	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
155 
156 	  if (tlo < thi)
157 	    x = tlo;
158 	  else
159 	    x = thi - 1;
160 	  oldx = x;
161 	  y = x - d;
162 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
163 	    --x, --y;
164 	  if (oldx - x > SNAKE_LIMIT)
165 	    big_snake = 1;
166 	  bd[d] = x;
167 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
168 	    {
169 	      part->xmid = x;
170 	      part->ymid = y;
171 	      part->lo_minimal = part->hi_minimal = 1;
172 	      return 2 * c;
173 	    }
174 	}
175 
176       if (find_minimal)
177 	continue;
178 
179       /* Heuristic: check occasionally for a diagonal that has made
180 	 lots of progress compared with the edit distance.
181 	 If we have any such, find the one that has made the most
182 	 progress and return it as if it had succeeded.
183 
184 	 With this heuristic, for files with a constant small density
185 	 of changes, the algorithm is linear in the file size.  */
186 
187       if (200 < c && big_snake && speed_large_files)
188 	{
189 	  lin best;
190 
191 	  best = 0;
192 	  for (d = fmax; d >= fmin; d -= 2)
193 	    {
194 	      lin dd = d - fmid;
195 	      lin x = fd[d];
196 	      lin y = x - d;
197 	      lin v = (x - xoff) * 2 - dd;
198 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
199 		{
200 		  if (v > best
201 		      && xoff + SNAKE_LIMIT <= x && x < xlim
202 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
203 		    {
204 		      /* We have a good enough best diagonal;
205 			 now insist that it end with a significant snake.  */
206 		      int k;
207 
208 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
209 			if (k == SNAKE_LIMIT)
210 			  {
211 			    best = v;
212 			    part->xmid = x;
213 			    part->ymid = y;
214 			    break;
215 			  }
216 		    }
217 		}
218 	    }
219 	  if (best > 0)
220 	    {
221 	      part->lo_minimal = 1;
222 	      part->hi_minimal = 0;
223 	      return 2 * c - 1;
224 	    }
225 
226 	  best = 0;
227 	  for (d = bmax; d >= bmin; d -= 2)
228 	    {
229 	      lin dd = d - bmid;
230 	      lin x = bd[d];
231 	      lin y = x - d;
232 	      lin v = (xlim - x) * 2 + dd;
233 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
234 		{
235 		  if (v > best
236 		      && xoff < x && x <= xlim - SNAKE_LIMIT
237 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
238 		    {
239 		      /* We have a good enough best diagonal;
240 			 now insist that it end with a significant snake.  */
241 		      int k;
242 
243 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
244 			if (k == SNAKE_LIMIT - 1)
245 			  {
246 			    best = v;
247 			    part->xmid = x;
248 			    part->ymid = y;
249 			    break;
250 			  }
251 		    }
252 		}
253 	    }
254 	  if (best > 0)
255 	    {
256 	      part->lo_minimal = 0;
257 	      part->hi_minimal = 1;
258 	      return 2 * c - 1;
259 	    }
260 	}
261 
262       /* Heuristic: if we've gone well beyond the call of duty,
263 	 give up and report halfway between our best results so far.  */
264       if (c >= too_expensive)
265 	{
266 	  lin fxybest, fxbest;
267 	  lin bxybest, bxbest;
268 
269 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
270 
271 	  /* Find forward diagonal that maximizes X + Y.  */
272 	  fxybest = -1;
273 	  for (d = fmax; d >= fmin; d -= 2)
274 	    {
275 	      lin x = MIN (fd[d], xlim);
276 	      lin y = x - d;
277 	      if (ylim < y)
278 		x = ylim + d, y = ylim;
279 	      if (fxybest < x + y)
280 		{
281 		  fxybest = x + y;
282 		  fxbest = x;
283 		}
284 	    }
285 
286 	  /* Find backward diagonal that minimizes X + Y.  */
287 	  bxybest = LIN_MAX;
288 	  for (d = bmax; d >= bmin; d -= 2)
289 	    {
290 	      lin x = MAX (xoff, bd[d]);
291 	      lin y = x - d;
292 	      if (y < yoff)
293 		x = yoff + d, y = yoff;
294 	      if (x + y < bxybest)
295 		{
296 		  bxybest = x + y;
297 		  bxbest = x;
298 		}
299 	    }
300 
301 	  /* Use the better of the two diagonals.  */
302 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
303 	    {
304 	      part->xmid = fxbest;
305 	      part->ymid = fxybest - fxbest;
306 	      part->lo_minimal = 1;
307 	      part->hi_minimal = 0;
308 	    }
309 	  else
310 	    {
311 	      part->xmid = bxbest;
312 	      part->ymid = bxybest - bxbest;
313 	      part->lo_minimal = 0;
314 	      part->hi_minimal = 1;
315 	    }
316 	  return 2 * c - 1;
317 	}
318     }
319 }
320 
321 /* Compare in detail contiguous subsequences of the two files
322    which are known, as a whole, to match each other.
323 
324    The results are recorded in the vectors files[N].changed, by
325    storing 1 in the element for each line that is an insertion or deletion.
326 
327    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
328 
329    Note that XLIM, YLIM are exclusive bounds.
330    All line numbers are origin-0 and discarded lines are not counted.
331 
332    If FIND_MINIMAL, find a minimal difference no matter how
333    expensive it is.  */
334 
335 static void
compareseq(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal)336 compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
337 {
338   lin * const xv = xvec; /* Help the compiler.  */
339   lin * const yv = yvec;
340 
341   /* Slide down the bottom initial diagonal. */
342   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
343     ++xoff, ++yoff;
344   /* Slide up the top initial diagonal. */
345   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
346     --xlim, --ylim;
347 
348   /* Handle simple cases. */
349   if (xoff == xlim)
350     while (yoff < ylim)
351       files[1].changed[files[1].realindexes[yoff++]] = 1;
352   else if (yoff == ylim)
353     while (xoff < xlim)
354       files[0].changed[files[0].realindexes[xoff++]] = 1;
355   else
356     {
357       lin c;
358       struct partition part;
359 
360       /* Find a point of correspondence in the middle of the files.  */
361 
362       c = diag (xoff, xlim, yoff, ylim, find_minimal, &part);
363 
364       if (c == 1)
365 	{
366 	  /* This should be impossible, because it implies that
367 	     one of the two subsequences is empty,
368 	     and that case was handled above without calling `diag'.
369 	     Let's verify that this is true.  */
370 	  abort ();
371 #if 0
372 	  /* The two subsequences differ by a single insert or delete;
373 	     record it and we are done.  */
374 	  if (part.xmid - part.ymid < xoff - yoff)
375 	    files[1].changed[files[1].realindexes[part.ymid - 1]] = 1;
376 	  else
377 	    files[0].changed[files[0].realindexes[part.xmid]] = 1;
378 #endif
379 	}
380       else
381 	{
382 	  /* Use the partitions to split this problem into subproblems.  */
383 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
384 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
385 	}
386     }
387 }
388 
389 /* Discard lines from one file that have no matches in the other file.
390 
391    A line which is discarded will not be considered by the actual
392    comparison algorithm; it will be as if that line were not in the file.
393    The file's `realindexes' table maps virtual line numbers
394    (which don't count the discarded lines) into real line numbers;
395    this is how the actual comparison algorithm produces results
396    that are comprehensible when the discarded lines are counted.
397 
398    When we discard a line, we also mark it as a deletion or insertion
399    so that it will be printed in the output.  */
400 
401 static void
discard_confusing_lines(struct file_data filevec[])402 discard_confusing_lines (struct file_data filevec[])
403 {
404   int f;
405   lin i;
406   char *discarded[2];
407   lin *equiv_count[2];
408   lin *p;
409 
410   /* Allocate our results.  */
411   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
412 	       * (2 * sizeof *p));
413   for (f = 0; f < 2; f++)
414     {
415       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
416       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
417     }
418 
419   /* Set up equiv_count[F][I] as the number of lines in file F
420      that fall in equivalence class I.  */
421 
422   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
423   equiv_count[0] = p;
424   equiv_count[1] = p + filevec[0].equiv_max;
425 
426   for (i = 0; i < filevec[0].buffered_lines; ++i)
427     ++equiv_count[0][filevec[0].equivs[i]];
428   for (i = 0; i < filevec[1].buffered_lines; ++i)
429     ++equiv_count[1][filevec[1].equivs[i]];
430 
431   /* Set up tables of which lines are going to be discarded.  */
432 
433   discarded[0] = zalloc (filevec[0].buffered_lines
434 			 + filevec[1].buffered_lines);
435   discarded[1] = discarded[0] + filevec[0].buffered_lines;
436 
437   /* Mark to be discarded each line that matches no line of the other file.
438      If a line matches many lines, mark it as provisionally discardable.  */
439 
440   for (f = 0; f < 2; f++)
441     {
442       size_t end = filevec[f].buffered_lines;
443       char *discards = discarded[f];
444       lin *counts = equiv_count[1 - f];
445       lin *equivs = filevec[f].equivs;
446       size_t many = 5;
447       size_t tem = end / 64;
448 
449       /* Multiply MANY by approximate square root of number of lines.
450 	 That is the threshold for provisionally discardable lines.  */
451       while ((tem = tem >> 2) > 0)
452 	many *= 2;
453 
454       for (i = 0; i < end; i++)
455 	{
456 	  lin nmatch;
457 	  if (equivs[i] == 0)
458 	    continue;
459 	  nmatch = counts[equivs[i]];
460 	  if (nmatch == 0)
461 	    discards[i] = 1;
462 	  else if (nmatch > many)
463 	    discards[i] = 2;
464 	}
465     }
466 
467   /* Don't really discard the provisional lines except when they occur
468      in a run of discardables, with nonprovisionals at the beginning
469      and end.  */
470 
471   for (f = 0; f < 2; f++)
472     {
473       lin end = filevec[f].buffered_lines;
474       register char *discards = discarded[f];
475 
476       for (i = 0; i < end; i++)
477 	{
478 	  /* Cancel provisional discards not in middle of run of discards.  */
479 	  if (discards[i] == 2)
480 	    discards[i] = 0;
481 	  else if (discards[i] != 0)
482 	    {
483 	      /* We have found a nonprovisional discard.  */
484 	      register lin j;
485 	      lin length;
486 	      lin provisional = 0;
487 
488 	      /* Find end of this run of discardable lines.
489 		 Count how many are provisionally discardable.  */
490 	      for (j = i; j < end; j++)
491 		{
492 		  if (discards[j] == 0)
493 		    break;
494 		  if (discards[j] == 2)
495 		    ++provisional;
496 		}
497 
498 	      /* Cancel provisional discards at end, and shrink the run.  */
499 	      while (j > i && discards[j - 1] == 2)
500 		discards[--j] = 0, --provisional;
501 
502 	      /* Now we have the length of a run of discardable lines
503 		 whose first and last are not provisional.  */
504 	      length = j - i;
505 
506 	      /* If 1/4 of the lines in the run are provisional,
507 		 cancel discarding of all provisional lines in the run.  */
508 	      if (provisional * 4 > length)
509 		{
510 		  while (j > i)
511 		    if (discards[--j] == 2)
512 		      discards[j] = 0;
513 		}
514 	      else
515 		{
516 		  register lin consec;
517 		  lin minimum = 1;
518 		  lin tem = length >> 2;
519 
520 		  /* MINIMUM is approximate square root of LENGTH/4.
521 		     A subrun of two or more provisionals can stand
522 		     when LENGTH is at least 16.
523 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
524 		  while (0 < (tem >>= 2))
525 		    minimum <<= 1;
526 		  minimum++;
527 
528 		  /* Cancel any subrun of MINIMUM or more provisionals
529 		     within the larger run.  */
530 		  for (j = 0, consec = 0; j < length; j++)
531 		    if (discards[i + j] != 2)
532 		      consec = 0;
533 		    else if (minimum == ++consec)
534 		      /* Back up to start of subrun, to cancel it all.  */
535 		      j -= consec;
536 		    else if (minimum < consec)
537 		      discards[i + j] = 0;
538 
539 		  /* Scan from beginning of run
540 		     until we find 3 or more nonprovisionals in a row
541 		     or until the first nonprovisional at least 8 lines in.
542 		     Until that point, cancel any provisionals.  */
543 		  for (j = 0, consec = 0; j < length; j++)
544 		    {
545 		      if (j >= 8 && discards[i + j] == 1)
546 			break;
547 		      if (discards[i + j] == 2)
548 			consec = 0, discards[i + j] = 0;
549 		      else if (discards[i + j] == 0)
550 			consec = 0;
551 		      else
552 			consec++;
553 		      if (consec == 3)
554 			break;
555 		    }
556 
557 		  /* I advances to the last line of the run.  */
558 		  i += length - 1;
559 
560 		  /* Same thing, from end.  */
561 		  for (j = 0, consec = 0; j < length; j++)
562 		    {
563 		      if (j >= 8 && discards[i - j] == 1)
564 			break;
565 		      if (discards[i - j] == 2)
566 			consec = 0, discards[i - j] = 0;
567 		      else if (discards[i - j] == 0)
568 			consec = 0;
569 		      else
570 			consec++;
571 		      if (consec == 3)
572 			break;
573 		    }
574 		}
575 	    }
576 	}
577     }
578 
579   /* Actually discard the lines. */
580   for (f = 0; f < 2; f++)
581     {
582       char *discards = discarded[f];
583       lin end = filevec[f].buffered_lines;
584       lin j = 0;
585       for (i = 0; i < end; ++i)
586 	if (minimal || discards[i] == 0)
587 	  {
588 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
589 	    filevec[f].realindexes[j++] = i;
590 	  }
591 	else
592 	  filevec[f].changed[i] = 1;
593       filevec[f].nondiscarded_lines = j;
594     }
595 
596   free (discarded[0]);
597   free (equiv_count[0]);
598 }
599 
600 /* Adjust inserts/deletes of identical lines to join changes
601    as much as possible.
602 
603    We do something when a run of changed lines include a
604    line at one end and have an excluded, identical line at the other.
605    We are free to choose which identical line is included.
606    `compareseq' usually chooses the one at the beginning,
607    but usually it is cleaner to consider the following identical line
608    to be the "change".  */
609 
610 static void
shift_boundaries(struct file_data filevec[])611 shift_boundaries (struct file_data filevec[])
612 {
613   int f;
614 
615   for (f = 0; f < 2; f++)
616     {
617       bool *changed = filevec[f].changed;
618       bool const *other_changed = filevec[1 - f].changed;
619       lin const *equivs = filevec[f].equivs;
620       lin i = 0;
621       lin j = 0;
622       lin i_end = filevec[f].buffered_lines;
623 
624       while (1)
625 	{
626 	  lin runlength, start, corresponding;
627 
628 	  /* Scan forwards to find beginning of another run of changes.
629 	     Also keep track of the corresponding point in the other file.  */
630 
631 	  while (i < i_end && !changed[i])
632 	    {
633 	      while (other_changed[j++])
634 		continue;
635 	      i++;
636 	    }
637 
638 	  if (i == i_end)
639 	    break;
640 
641 	  start = i;
642 
643 	  /* Find the end of this run of changes.  */
644 
645 	  while (changed[++i])
646 	    continue;
647 	  while (other_changed[j])
648 	    j++;
649 
650 	  do
651 	    {
652 	      /* Record the length of this run of changes, so that
653 		 we can later determine whether the run has grown.  */
654 	      runlength = i - start;
655 
656 	      /* Move the changed region back, so long as the
657 		 previous unchanged line matches the last changed one.
658 		 This merges with previous changed regions.  */
659 
660 	      while (start && equivs[start - 1] == equivs[i - 1])
661 		{
662 		  changed[--start] = 1;
663 		  changed[--i] = 0;
664 		  while (changed[start - 1])
665 		    start--;
666 		  while (other_changed[--j])
667 		    continue;
668 		}
669 
670 	      /* Set CORRESPONDING to the end of the changed run, at the last
671 		 point where it corresponds to a changed run in the other file.
672 		 CORRESPONDING == I_END means no such point has been found.  */
673 	      corresponding = other_changed[j - 1] ? i : i_end;
674 
675 	      /* Move the changed region forward, so long as the
676 		 first changed line matches the following unchanged one.
677 		 This merges with following changed regions.
678 		 Do this second, so that if there are no merges,
679 		 the changed region is moved forward as far as possible.  */
680 
681 	      while (i != i_end && equivs[start] == equivs[i])
682 		{
683 		  changed[start++] = 0;
684 		  changed[i++] = 1;
685 		  while (changed[i])
686 		    i++;
687 		  while (other_changed[++j])
688 		    corresponding = i;
689 		}
690 	    }
691 	  while (runlength != i - start);
692 
693 	  /* If possible, move the fully-merged run of changes
694 	     back to a corresponding run in the other file.  */
695 
696 	  while (corresponding < i)
697 	    {
698 	      changed[--start] = 1;
699 	      changed[--i] = 0;
700 	      while (other_changed[--j])
701 		continue;
702 	    }
703 	}
704     }
705 }
706 
707 /* Cons an additional entry onto the front of an edit script OLD.
708    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
709    DELETED is the number of lines deleted here from file 0.
710    INSERTED is the number of lines inserted here in file 1.
711 
712    If DELETED is 0 then LINE0 is the number of the line before
713    which the insertion was done; vice versa for INSERTED and LINE1.  */
714 
715 static struct change *
add_change(lin line0,lin line1,lin deleted,lin inserted,struct change * old)716 add_change (lin line0, lin line1, lin deleted, lin inserted,
717 	    struct change *old)
718 {
719   struct change *new = xmalloc (sizeof *new);
720 
721   new->line0 = line0;
722   new->line1 = line1;
723   new->inserted = inserted;
724   new->deleted = deleted;
725   new->link = old;
726   return new;
727 }
728 
729 /* Scan the tables of which lines are inserted and deleted,
730    producing an edit script in reverse order.  */
731 
732 static struct change *
build_reverse_script(struct file_data const filevec[])733 build_reverse_script (struct file_data const filevec[])
734 {
735   struct change *script = 0;
736   bool *changed0 = filevec[0].changed;
737   bool *changed1 = filevec[1].changed;
738   lin len0 = filevec[0].buffered_lines;
739   lin len1 = filevec[1].buffered_lines;
740 
741   /* Note that changedN[len0] does exist, and is 0.  */
742 
743   lin i0 = 0, i1 = 0;
744 
745   while (i0 < len0 || i1 < len1)
746     {
747       if (changed0[i0] | changed1[i1])
748 	{
749 	  lin line0 = i0, line1 = i1;
750 
751 	  /* Find # lines changed here in each file.  */
752 	  while (changed0[i0]) ++i0;
753 	  while (changed1[i1]) ++i1;
754 
755 	  /* Record this change.  */
756 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
757 	}
758 
759       /* We have reached lines in the two files that match each other.  */
760       i0++, i1++;
761     }
762 
763   return script;
764 }
765 
766 /* Scan the tables of which lines are inserted and deleted,
767    producing an edit script in forward order.  */
768 
769 static struct change *
build_script(struct file_data const filevec[])770 build_script (struct file_data const filevec[])
771 {
772   struct change *script = 0;
773   bool *changed0 = filevec[0].changed;
774   bool *changed1 = filevec[1].changed;
775   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
776 
777   /* Note that changedN[-1] does exist, and is 0.  */
778 
779   while (i0 >= 0 || i1 >= 0)
780     {
781       if (changed0[i0 - 1] | changed1[i1 - 1])
782 	{
783 	  lin line0 = i0, line1 = i1;
784 
785 	  /* Find # lines changed here in each file.  */
786 	  while (changed0[i0 - 1]) --i0;
787 	  while (changed1[i1 - 1]) --i1;
788 
789 	  /* Record this change.  */
790 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
791 	}
792 
793       /* We have reached lines in the two files that match each other.  */
794       i0--, i1--;
795     }
796 
797   return script;
798 }
799 
800 /* If CHANGES, briefly report that two files differed.
801    Return 2 if trouble, CHANGES otherwise.  */
802 static int
briefly_report(int changes,struct file_data const filevec[])803 briefly_report (int changes, struct file_data const filevec[])
804 {
805   if (changes)
806     {
807       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
808       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
809 
810       if (brief)
811 	message ("Files %s and %s differ\n", label0, label1);
812       else
813 	{
814 	  message ("Binary files %s and %s differ\n", label0, label1);
815 	  changes = 2;
816 	}
817     }
818 
819   return changes;
820 }
821 
822 /* Report the differences of two files.  */
823 int
diff_2_files(struct comparison * cmp)824 diff_2_files (struct comparison *cmp)
825 {
826   lin diags;
827   int f;
828   struct change *e, *p;
829   struct change *script;
830   int changes;
831 
832 
833   /* If we have detected that either file is binary,
834      compare the two files as binary.  This can happen
835      only when the first chunk is read.
836      Also, --brief without any --ignore-* options means
837      we can speed things up by treating the files as binary.  */
838 
839   if (read_files (cmp->file, files_can_be_treated_as_binary))
840     {
841       /* Files with different lengths must be different.  */
842       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
843 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
844 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
845 	changes = 1;
846 
847       /* Standard input equals itself.  */
848       else if (cmp->file[0].desc == cmp->file[1].desc)
849 	changes = 0;
850 
851       else
852 	/* Scan both files, a buffer at a time, looking for a difference.  */
853 	{
854 	  /* Allocate same-sized buffers for both files.  */
855 	  size_t lcm_max = PTRDIFF_MAX - 1;
856 	  size_t buffer_size =
857 	    buffer_lcm (sizeof (word),
858 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
859 				    STAT_BLOCKSIZE (cmp->file[1].stat),
860 				    lcm_max),
861 			lcm_max);
862 	  for (f = 0; f < 2; f++)
863 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
864 
865 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
866 	    {
867 	      /* Read a buffer's worth from both files.  */
868 	      for (f = 0; f < 2; f++)
869 		if (0 <= cmp->file[f].desc)
870 		  file_block_read (&cmp->file[f],
871 				   buffer_size - cmp->file[f].buffered);
872 
873 	      /* If the buffers differ, the files differ.  */
874 	      if (cmp->file[0].buffered != cmp->file[1].buffered
875 		  || memcmp (cmp->file[0].buffer,
876 			     cmp->file[1].buffer,
877 			     cmp->file[0].buffered))
878 		{
879 		  changes = 1;
880 		  break;
881 		}
882 
883 	      /* If we reach end of file, the files are the same.  */
884 	      if (cmp->file[0].buffered != buffer_size)
885 		{
886 		  changes = 0;
887 		  break;
888 		}
889 	    }
890 	}
891 
892       changes = briefly_report (changes, cmp->file);
893     }
894   else
895     {
896       /* Allocate vectors for the results of comparison:
897 	 a flag for each line of each file, saying whether that line
898 	 is an insertion or deletion.
899 	 Allocate an extra element, always 0, at each end of each vector.  */
900 
901       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
902       bool *flag_space = zalloc (s * sizeof *flag_space);
903       cmp->file[0].changed = flag_space + 1;
904       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
905 
906       /* Some lines are obviously insertions or deletions
907 	 because they don't match anything.  Detect them now, and
908 	 avoid even thinking about them in the main comparison algorithm.  */
909 
910       discard_confusing_lines (cmp->file);
911 
912       /* Now do the main comparison algorithm, considering just the
913 	 undiscarded lines.  */
914 
915       xvec = cmp->file[0].undiscarded;
916       yvec = cmp->file[1].undiscarded;
917       diags = (cmp->file[0].nondiscarded_lines
918 	       + cmp->file[1].nondiscarded_lines + 3);
919       fdiag = xmalloc (diags * (2 * sizeof *fdiag));
920       bdiag = fdiag + diags;
921       fdiag += cmp->file[1].nondiscarded_lines + 1;
922       bdiag += cmp->file[1].nondiscarded_lines + 1;
923 
924       /* Set TOO_EXPENSIVE to be approximate square root of input size,
925 	 bounded below by 256.  */
926       too_expensive = 1;
927       for (;  diags != 0;  diags >>= 2)
928 	too_expensive <<= 1;
929       too_expensive = MAX (256, too_expensive);
930 
931       files[0] = cmp->file[0];
932       files[1] = cmp->file[1];
933 
934       compareseq (0, cmp->file[0].nondiscarded_lines,
935 		  0, cmp->file[1].nondiscarded_lines, minimal);
936 
937       free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
938 
939       /* Modify the results slightly to make them prettier
940 	 in cases where that can validly be done.  */
941 
942       shift_boundaries (cmp->file);
943 
944       /* Get the results of comparison in the form of a chain
945 	 of `struct change's -- an edit script.  */
946 
947       if (output_style == OUTPUT_ED)
948 	script = build_reverse_script (cmp->file);
949       else
950 	script = build_script (cmp->file);
951 
952       /* Set CHANGES if we had any diffs.
953 	 If some changes are ignored, we must scan the script to decide.  */
954       if (ignore_blank_lines || ignore_regexp.fastmap)
955 	{
956 	  struct change *next = script;
957 	  changes = 0;
958 
959 	  while (next && changes == 0)
960 	    {
961 	      struct change *this, *end;
962 	      lin first0, last0, first1, last1;
963 
964 	      /* Find a set of changes that belong together.  */
965 	      this = next;
966 	      end = find_change (next);
967 
968 	      /* Disconnect them from the rest of the changes, making them
969 		 a hunk, and remember the rest for next iteration.  */
970 	      next = end->link;
971 	      end->link = 0;
972 
973 	      /* Determine whether this hunk is really a difference.  */
974 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
975 		changes = 1;
976 
977 	      /* Reconnect the script so it will all be freed properly.  */
978 	      end->link = next;
979 	    }
980 	}
981       else
982 	changes = (script != 0);
983 
984       if (brief)
985 	changes = briefly_report (changes, cmp->file);
986       else
987 	{
988 	  if (changes | !no_diff_means_no_output)
989 	    {
990 	      /* Record info for starting up output,
991 		 to be used if and when we have some output to print.  */
992 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
993 			    file_label[1] ? file_label[1] : cmp->file[1].name,
994 			    cmp->parent != 0);
995 
996 	      switch (output_style)
997 		{
998 		case OUTPUT_CONTEXT:
999 		  print_context_script (script, 0);
1000 		  break;
1001 
1002 		case OUTPUT_UNIFIED:
1003 		  print_context_script (script, 1);
1004 		  break;
1005 
1006 		case OUTPUT_ED:
1007 		  print_ed_script (script);
1008 		  break;
1009 
1010 		case OUTPUT_FORWARD_ED:
1011 		  pr_forward_ed_script (script);
1012 		  break;
1013 
1014 		case OUTPUT_RCS:
1015 		  print_rcs_script (script);
1016 		  break;
1017 
1018 		case OUTPUT_NORMAL:
1019 		  print_normal_script (script);
1020 		  break;
1021 
1022 		case OUTPUT_IFDEF:
1023 		  print_ifdef_script (script);
1024 		  break;
1025 
1026 		case OUTPUT_SDIFF:
1027 		  print_sdiff_script (script);
1028 		  break;
1029 
1030 		default:
1031 		  abort ();
1032 		}
1033 
1034 	      finish_output ();
1035 	    }
1036 	}
1037 
1038       free (cmp->file[0].undiscarded);
1039 
1040       free (flag_space);
1041 
1042       for (f = 0; f < 2; f++)
1043 	{
1044 	  free (cmp->file[f].equivs);
1045 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1046 	}
1047 
1048       for (e = script; e; e = p)
1049 	{
1050 	  p = e->link;
1051 	  free (e);
1052 	}
1053 
1054       if (! ROBUST_OUTPUT_STYLE (output_style))
1055 	for (f = 0; f < 2; ++f)
1056 	  if (cmp->file[f].missing_newline)
1057 	    {
1058 	      error (0, 0, "%s: %s\n",
1059 		     file_label[f] ? file_label[f] : cmp->file[f].name,
1060 		     _("No newline at end of file"));
1061 	      changes = 2;
1062 	    }
1063     }
1064 
1065   if (cmp->file[0].buffer != cmp->file[1].buffer)
1066     free (cmp->file[0].buffer);
1067   free (cmp->file[1].buffer);
1068 
1069   return changes;
1070 }
1071