xref: /openbsd-src/gnu/usr.bin/cvs/diff/analyze.c (revision 62a742911104f98b9185b2c6b6007d9b1c36396c)
1 /* Analyze file differences for GNU DIFF.
2    Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc.
3 
4 This file is part of GNU DIFF.
5 
6 GNU DIFF is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU DIFF is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 */
17 
18 /* The basic algorithm is described in:
19    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
20    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
21    see especially section 4.2, which describes the variation used below.
22    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
23    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
24    at the price of producing suboptimal output for large inputs with
25    many differences.
26 
27    The basic algorithm was independently discovered as described in:
28    "Algorithms for Approximate String Matching", E. Ukkonen,
29    Information and Control Vol. 64, 1985, pp. 100-118.  */
30 
31 #include "diff.h"
32 #include "cmpbuf.h"
33 
34 extern int no_discards;
35 
36 static int *xvec, *yvec;	/* Vectors being compared. */
37 static int *fdiag;		/* Vector, indexed by diagonal, containing
38 				   1 + the X coordinate of the point furthest
39 				   along the given diagonal in the forward
40 				   search of the edit matrix. */
41 static int *bdiag;		/* Vector, indexed by diagonal, containing
42 				   the X coordinate of the point furthest
43 				   along the given diagonal in the backward
44 				   search of the edit matrix. */
45 static int too_expensive;	/* Edit scripts longer than this are too
46 				   expensive to compute.  */
47 
48 #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
49 
50 struct partition
51 {
52   int xmid, ymid;	/* Midpoints of this partition.  */
53   int lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
54   int hi_minimal;	/* Likewise for high half.  */
55 };
56 
57 static int diag PARAMS((int, int, int, int, int, struct partition *));
58 static struct change *add_change PARAMS((int, int, int, int, struct change *));
59 static struct change *build_reverse_script PARAMS((struct file_data const[]));
60 static struct change *build_script PARAMS((struct file_data const[]));
61 static void briefly_report PARAMS((int, struct file_data const[]));
62 static void compareseq PARAMS((int, int, int, int, int));
63 static void discard_confusing_lines PARAMS((struct file_data[]));
64 static void shift_boundaries PARAMS((struct file_data[]));
65 
66 /* Find the midpoint of the shortest edit script for a specified
67    portion of the two files.
68 
69    Scan from the beginnings of the files, and simultaneously from the ends,
70    doing a breadth-first search through the space of edit-sequence.
71    When the two searches meet, we have found the midpoint of the shortest
72    edit sequence.
73 
74    If MINIMAL is nonzero, find the minimal edit script regardless
75    of expense.  Otherwise, if the search is too expensive, use
76    heuristics to stop the search and report a suboptimal answer.
77 
78    Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal number
79    XMID - YMID equals the number of inserted lines minus the number
80    of deleted lines (counting only lines before the midpoint).
81    Return the approximate edit cost; this is the total number of
82    lines inserted or deleted (counting only lines before the midpoint),
83    unless a heuristic is used to terminate the search prematurely.
84 
85    Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the
86    left half of the partition is known; similarly for PART->RIGHT_MINIMAL.
87 
88    This function assumes that the first lines of the specified portions
89    of the two files do not match, and likewise that the last lines do not
90    match.  The caller must trim matching lines from the beginning and end
91    of the portions it is going to specify.
92 
93    If we return the "wrong" partitions,
94    the worst this can do is cause suboptimal diff output.
95    It cannot cause incorrect diff output.  */
96 
97 static int
98 diag (xoff, xlim, yoff, ylim, minimal, part)
99      int xoff, xlim, yoff, ylim, minimal;
100      struct partition *part;
101 {
102   int *const fd = fdiag;	/* Give the compiler a chance. */
103   int *const bd = bdiag;	/* Additional help for the compiler. */
104   int const *const xv = xvec;	/* Still more help for the compiler. */
105   int const *const yv = yvec;	/* And more and more . . . */
106   int const dmin = xoff - ylim;	/* Minimum valid diagonal. */
107   int const dmax = xlim - yoff;	/* Maximum valid diagonal. */
108   int const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
109   int const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
110   int fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
111   int bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
112   int c;			/* Cost. */
113   int odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
114 				   diagonal with respect to the northwest. */
115 
116   fd[fmid] = xoff;
117   bd[bmid] = xlim;
118 
119   for (c = 1;; ++c)
120     {
121       int d;			/* Active diagonal. */
122       int big_snake = 0;
123 
124       /* Extend the top-down search by an edit step in each diagonal. */
125       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
126       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
127       for (d = fmax; d >= fmin; d -= 2)
128 	{
129 	  int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
130 
131 	  if (tlo >= thi)
132 	    x = tlo + 1;
133 	  else
134 	    x = thi;
135 	  oldx = x;
136 	  y = x - d;
137 	  while (x < xlim && y < ylim && xv[x] == yv[y])
138 	    ++x, ++y;
139 	  if (x - oldx > SNAKE_LIMIT)
140 	    big_snake = 1;
141 	  fd[d] = x;
142 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
143 	    {
144 	      part->xmid = x;
145 	      part->ymid = y;
146 	      part->lo_minimal = part->hi_minimal = 1;
147 	      return 2 * c - 1;
148 	    }
149 	}
150 
151       /* Similarly extend the bottom-up search.  */
152       bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin;
153       bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax;
154       for (d = bmax; d >= bmin; d -= 2)
155 	{
156 	  int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
157 
158 	  if (tlo < thi)
159 	    x = tlo;
160 	  else
161 	    x = thi - 1;
162 	  oldx = x;
163 	  y = x - d;
164 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
165 	    --x, --y;
166 	  if (oldx - x > SNAKE_LIMIT)
167 	    big_snake = 1;
168 	  bd[d] = x;
169 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
170 	    {
171 	      part->xmid = x;
172 	      part->ymid = y;
173 	      part->lo_minimal = part->hi_minimal = 1;
174 	      return 2 * c;
175 	    }
176 	}
177 
178       if (minimal)
179 	continue;
180 
181       /* Heuristic: check occasionally for a diagonal that has made
182 	 lots of progress compared with the edit distance.
183 	 If we have any such, find the one that has made the most
184 	 progress and return it as if it had succeeded.
185 
186 	 With this heuristic, for files with a constant small density
187 	 of changes, the algorithm is linear in the file size.  */
188 
189       if (c > 200 && big_snake && heuristic)
190 	{
191 	  int best;
192 
193 	  best = 0;
194 	  for (d = fmax; d >= fmin; d -= 2)
195 	    {
196 	      int dd = d - fmid;
197 	      int x = fd[d];
198 	      int y = x - d;
199 	      int v = (x - xoff) * 2 - dd;
200 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
201 		{
202 		  if (v > best
203 		      && xoff + SNAKE_LIMIT <= x && x < xlim
204 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
205 		    {
206 		      /* We have a good enough best diagonal;
207 			 now insist that it end with a significant snake.  */
208 		      int k;
209 
210 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
211 			if (k == SNAKE_LIMIT)
212 			  {
213 			    best = v;
214 			    part->xmid = x;
215 			    part->ymid = y;
216 			    break;
217 			  }
218 		    }
219 		}
220 	    }
221 	  if (best > 0)
222 	    {
223 	      part->lo_minimal = 1;
224 	      part->hi_minimal = 0;
225 	      return 2 * c - 1;
226 	    }
227 
228 	  best = 0;
229 	  for (d = bmax; d >= bmin; d -= 2)
230 	    {
231 	      int dd = d - bmid;
232 	      int x = bd[d];
233 	      int y = x - d;
234 	      int v = (xlim - x) * 2 + dd;
235 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
236 		{
237 		  if (v > best
238 		      && xoff < x && x <= xlim - SNAKE_LIMIT
239 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
240 		    {
241 		      /* We have a good enough best diagonal;
242 			 now insist that it end with a significant snake.  */
243 		      int k;
244 
245 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
246 			if (k == SNAKE_LIMIT - 1)
247 			  {
248 			    best = v;
249 			    part->xmid = x;
250 			    part->ymid = y;
251 			    break;
252 			  }
253 		    }
254 		}
255 	    }
256 	  if (best > 0)
257 	    {
258 	      part->lo_minimal = 0;
259 	      part->hi_minimal = 1;
260 	      return 2 * c - 1;
261 	    }
262 	}
263 
264       /* Heuristic: if we've gone well beyond the call of duty,
265 	 give up and report halfway between our best results so far.  */
266       if (c >= too_expensive)
267 	{
268 	  int fxybest, fxbest;
269 	  int bxybest, bxbest;
270 
271 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
272 
273 	  /* Find forward diagonal that maximizes X + Y.  */
274 	  fxybest = -1;
275 	  for (d = fmax; d >= fmin; d -= 2)
276 	    {
277 	      int x = min (fd[d], xlim);
278 	      int y = x - d;
279 	      if (ylim < y)
280 		x = ylim + d, y = ylim;
281 	      if (fxybest < x + y)
282 		{
283 		  fxybest = x + y;
284 		  fxbest = x;
285 		}
286 	    }
287 
288 	  /* Find backward diagonal that minimizes X + Y.  */
289 	  bxybest = INT_MAX;
290 	  for (d = bmax; d >= bmin; d -= 2)
291 	    {
292 	      int x = max (xoff, bd[d]);
293 	      int y = x - d;
294 	      if (y < yoff)
295 		x = yoff + d, y = yoff;
296 	      if (x + y < bxybest)
297 		{
298 		  bxybest = x + y;
299 		  bxbest = x;
300 		}
301 	    }
302 
303 	  /* Use the better of the two diagonals.  */
304 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
305 	    {
306 	      part->xmid = fxbest;
307 	      part->ymid = fxybest - fxbest;
308 	      part->lo_minimal = 1;
309 	      part->hi_minimal = 0;
310 	    }
311 	  else
312 	    {
313 	      part->xmid = bxbest;
314 	      part->ymid = bxybest - bxbest;
315 	      part->lo_minimal = 0;
316 	      part->hi_minimal = 1;
317 	    }
318 	  return 2 * c - 1;
319 	}
320     }
321 }
322 
323 /* Compare in detail contiguous subsequences of the two files
324    which are known, as a whole, to match each other.
325 
326    The results are recorded in the vectors files[N].changed_flag, by
327    storing a 1 in the element for each line that is an insertion or deletion.
328 
329    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
330 
331    Note that XLIM, YLIM are exclusive bounds.
332    All line numbers are origin-0 and discarded lines are not counted.
333 
334    If MINIMAL is nonzero, find a minimal difference no matter how
335    expensive it is.  */
336 
337 static void
338 compareseq (xoff, xlim, yoff, ylim, minimal)
339      int xoff, xlim, yoff, ylim, minimal;
340 {
341   int * const xv = xvec; /* Help the compiler.  */
342   int * const yv = yvec;
343 
344   /* Slide down the bottom initial diagonal. */
345   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
346     ++xoff, ++yoff;
347   /* Slide up the top initial diagonal. */
348   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
349     --xlim, --ylim;
350 
351   /* Handle simple cases. */
352   if (xoff == xlim)
353     while (yoff < ylim)
354       files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
355   else if (yoff == ylim)
356     while (xoff < xlim)
357       files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
358   else
359     {
360       int c;
361       struct partition part;
362 
363       /* Find a point of correspondence in the middle of the files.  */
364 
365       c = diag (xoff, xlim, yoff, ylim, minimal, &part);
366 
367       if (c == 1)
368 	{
369 	  /* This should be impossible, because it implies that
370 	     one of the two subsequences is empty,
371 	     and that case was handled above without calling `diag'.
372 	     Let's verify that this is true.  */
373 	  abort ();
374 #if 0
375 	  /* The two subsequences differ by a single insert or delete;
376 	     record it and we are done.  */
377 	  if (part.xmid - part.ymid < xoff - yoff)
378 	    files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
379 	  else
380 	    files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
381 #endif
382 	}
383       else
384 	{
385 	  /* Use the partitions to split this problem into subproblems.  */
386 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
387 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
388 	}
389     }
390 }
391 
392 /* Discard lines from one file that have no matches in the other file.
393 
394    A line which is discarded will not be considered by the actual
395    comparison algorithm; it will be as if that line were not in the file.
396    The file's `realindexes' table maps virtual line numbers
397    (which don't count the discarded lines) into real line numbers;
398    this is how the actual comparison algorithm produces results
399    that are comprehensible when the discarded lines are counted.
400 
401    When we discard a line, we also mark it as a deletion or insertion
402    so that it will be printed in the output.  */
403 
404 static void
405 discard_confusing_lines (filevec)
406      struct file_data filevec[];
407 {
408   unsigned int f, i;
409   char *discarded[2];
410   int *equiv_count[2];
411   int *p;
412 
413   /* Allocate our results.  */
414   p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
415 		       * (2 * sizeof (int)));
416   for (f = 0; f < 2; f++)
417     {
418       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
419       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
420     }
421 
422   /* Set up equiv_count[F][I] as the number of lines in file F
423      that fall in equivalence class I.  */
424 
425   p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int)));
426   equiv_count[0] = p;
427   equiv_count[1] = p + filevec[0].equiv_max;
428   bzero (p, filevec[0].equiv_max * (2 * sizeof (int)));
429 
430   for (i = 0; i < filevec[0].buffered_lines; ++i)
431     ++equiv_count[0][filevec[0].equivs[i]];
432   for (i = 0; i < filevec[1].buffered_lines; ++i)
433     ++equiv_count[1][filevec[1].equivs[i]];
434 
435   /* Set up tables of which lines are going to be discarded.  */
436 
437   discarded[0] = xmalloc (sizeof (char)
438 			  * (filevec[0].buffered_lines
439 			     + filevec[1].buffered_lines));
440   discarded[1] = discarded[0] + filevec[0].buffered_lines;
441   bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines
442 					+ filevec[1].buffered_lines));
443 
444   /* Mark to be discarded each line that matches no line of the other file.
445      If a line matches many lines, mark it as provisionally discardable.  */
446 
447   for (f = 0; f < 2; f++)
448     {
449       unsigned int end = filevec[f].buffered_lines;
450       char *discards = discarded[f];
451       int *counts = equiv_count[1 - f];
452       int *equivs = filevec[f].equivs;
453       unsigned int many = 5;
454       unsigned int tem = end / 64;
455 
456       /* Multiply MANY by approximate square root of number of lines.
457 	 That is the threshold for provisionally discardable lines.  */
458       while ((tem = tem >> 2) > 0)
459 	many *= 2;
460 
461       for (i = 0; i < end; i++)
462 	{
463 	  int nmatch;
464 	  if (equivs[i] == 0)
465 	    continue;
466 	  nmatch = counts[equivs[i]];
467 	  if (nmatch == 0)
468 	    discards[i] = 1;
469 	  else if (nmatch > many)
470 	    discards[i] = 2;
471 	}
472     }
473 
474   /* Don't really discard the provisional lines except when they occur
475      in a run of discardables, with nonprovisionals at the beginning
476      and end.  */
477 
478   for (f = 0; f < 2; f++)
479     {
480       unsigned int end = filevec[f].buffered_lines;
481       register char *discards = discarded[f];
482 
483       for (i = 0; i < end; i++)
484 	{
485 	  /* Cancel provisional discards not in middle of run of discards.  */
486 	  if (discards[i] == 2)
487 	    discards[i] = 0;
488 	  else if (discards[i] != 0)
489 	    {
490 	      /* We have found a nonprovisional discard.  */
491 	      register int j;
492 	      unsigned int length;
493 	      unsigned int provisional = 0;
494 
495 	      /* Find end of this run of discardable lines.
496 		 Count how many are provisionally discardable.  */
497 	      for (j = i; j < end; j++)
498 		{
499 		  if (discards[j] == 0)
500 		    break;
501 		  if (discards[j] == 2)
502 		    ++provisional;
503 		}
504 
505 	      /* Cancel provisional discards at end, and shrink the run.  */
506 	      while (j > i && discards[j - 1] == 2)
507 		discards[--j] = 0, --provisional;
508 
509 	      /* Now we have the length of a run of discardable lines
510 		 whose first and last are not provisional.  */
511 	      length = j - i;
512 
513 	      /* If 1/4 of the lines in the run are provisional,
514 		 cancel discarding of all provisional lines in the run.  */
515 	      if (provisional * 4 > length)
516 		{
517 		  while (j > i)
518 		    if (discards[--j] == 2)
519 		      discards[j] = 0;
520 		}
521 	      else
522 		{
523 		  register unsigned int consec;
524 		  unsigned int minimum = 1;
525 		  unsigned int tem = length / 4;
526 
527 		  /* MINIMUM is approximate square root of LENGTH/4.
528 		     A subrun of two or more provisionals can stand
529 		     when LENGTH is at least 16.
530 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
531 		  while ((tem = tem >> 2) > 0)
532 		    minimum *= 2;
533 		  minimum++;
534 
535 		  /* Cancel any subrun of MINIMUM or more provisionals
536 		     within the larger run.  */
537 		  for (j = 0, consec = 0; j < length; j++)
538 		    if (discards[i + j] != 2)
539 		      consec = 0;
540 		    else if (minimum == ++consec)
541 		      /* Back up to start of subrun, to cancel it all.  */
542 		      j -= consec;
543 		    else if (minimum < consec)
544 		      discards[i + j] = 0;
545 
546 		  /* Scan from beginning of run
547 		     until we find 3 or more nonprovisionals in a row
548 		     or until the first nonprovisional at least 8 lines in.
549 		     Until that point, cancel any provisionals.  */
550 		  for (j = 0, consec = 0; j < length; j++)
551 		    {
552 		      if (j >= 8 && discards[i + j] == 1)
553 			break;
554 		      if (discards[i + j] == 2)
555 			consec = 0, discards[i + j] = 0;
556 		      else if (discards[i + j] == 0)
557 			consec = 0;
558 		      else
559 			consec++;
560 		      if (consec == 3)
561 			break;
562 		    }
563 
564 		  /* I advances to the last line of the run.  */
565 		  i += length - 1;
566 
567 		  /* Same thing, from end.  */
568 		  for (j = 0, consec = 0; j < length; j++)
569 		    {
570 		      if (j >= 8 && discards[i - j] == 1)
571 			break;
572 		      if (discards[i - j] == 2)
573 			consec = 0, discards[i - j] = 0;
574 		      else if (discards[i - j] == 0)
575 			consec = 0;
576 		      else
577 			consec++;
578 		      if (consec == 3)
579 			break;
580 		    }
581 		}
582 	    }
583 	}
584     }
585 
586   /* Actually discard the lines. */
587   for (f = 0; f < 2; f++)
588     {
589       char *discards = discarded[f];
590       unsigned int end = filevec[f].buffered_lines;
591       unsigned int j = 0;
592       for (i = 0; i < end; ++i)
593 	if (no_discards || discards[i] == 0)
594 	  {
595 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
596 	    filevec[f].realindexes[j++] = i;
597 	  }
598 	else
599 	  filevec[f].changed_flag[i] = 1;
600       filevec[f].nondiscarded_lines = j;
601     }
602 
603   free (discarded[0]);
604   free (equiv_count[0]);
605 }
606 
607 /* Adjust inserts/deletes of identical lines to join changes
608    as much as possible.
609 
610    We do something when a run of changed lines include a
611    line at one end and have an excluded, identical line at the other.
612    We are free to choose which identical line is included.
613    `compareseq' usually chooses the one at the beginning,
614    but usually it is cleaner to consider the following identical line
615    to be the "change".  */
616 
617 int inhibit;
618 
619 static void
620 shift_boundaries (filevec)
621      struct file_data filevec[];
622 {
623   int f;
624   int inhibit_hunk_merge = horizon_lines != context;
625 
626   for (f = 0; f < 2; f++)
627     {
628       char *changed = filevec[f].changed_flag;
629       char const *other_changed = filevec[1-f].changed_flag;
630       int const *equivs = filevec[f].equivs;
631       int i = 0;
632       int j = 0;
633       int i_end = filevec[f].buffered_lines;
634 
635       while (1)
636 	{
637 	  int runlength, start, corresponding;
638 
639 	  /* Scan forwards to find beginning of another run of changes.
640 	     Also keep track of the corresponding point in the other file.  */
641 
642 	  while (i < i_end && changed[i] == 0)
643 	    {
644 	      while (other_changed[j++])
645 		continue;
646 	      i++;
647 	    }
648 
649 	  if (i == i_end)
650 	    break;
651 
652 	  start = i;
653 
654 	  /* Find the end of this run of changes.  */
655 
656 	  while (changed[++i])
657 	    continue;
658 	  while (other_changed[j])
659 	    j++;
660 
661 	  do
662 	    {
663 	      /* Record the length of this run of changes, so that
664 		 we can later determine whether the run has grown.  */
665 	      runlength = i - start;
666 
667 	      if (! inhibit_hunk_merge)
668 		{
669 		  /* Move the changed region back, so long as the
670 		     previous unchanged line matches the last changed one.
671 		     This merges with previous changed regions.  */
672 
673 		  while (start && equivs[start - 1] == equivs[i - 1])
674 		    {
675 		      changed[--start] = 1;
676 		      changed[--i] = 0;
677 		      while (changed[start - 1])
678 			start--;
679 		      while (other_changed[--j])
680 			continue;
681 		    }
682 		}
683 
684 	      /* Set CORRESPONDING to the end of the changed run, at the last
685 		 point where it corresponds to a changed run in the other file.
686 		 CORRESPONDING == I_END means no such point has been found.  */
687 	      corresponding = other_changed[j - 1] ? i : i_end;
688 
689 	      /* Shift the changed region forward, so long as the
690 		 first changed line matches the following unchanged one,
691 		 but if INHIBIT_HUNK_MERGE is 1 do not shift if
692 		 this would merge with another changed region.
693 		 Do this second, so that if there are no merges,
694 		 the changed region is moved forward as far as possible.  */
695 
696 	      while (i != i_end && equivs[start] == equivs[i]
697 		     && ! (inhibit_hunk_merge & other_changed[j + 1]))
698 		{
699 		  changed[start++] = 0;
700 		  changed[i++] = 1;
701 		  while (changed[i])
702 		    i++;
703 		  while (other_changed[++j])
704 		    corresponding = i;
705 		}
706 	    }
707 	  while (runlength != i - start);
708 
709 	  /* If possible, move the fully-merged run of changes
710 	     back to a corresponding run in the other file.  */
711 
712 	  while (corresponding < i)
713 	    {
714 	      changed[--start] = 1;
715 	      changed[--i] = 0;
716 	      while (other_changed[--j])
717 		continue;
718 	    }
719 	}
720     }
721 }
722 
723 /* Cons an additional entry onto the front of an edit script OLD.
724    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
725    DELETED is the number of lines deleted here from file 0.
726    INSERTED is the number of lines inserted here in file 1.
727 
728    If DELETED is 0 then LINE0 is the number of the line before
729    which the insertion was done; vice versa for INSERTED and LINE1.  */
730 
731 static struct change *
732 add_change (line0, line1, deleted, inserted, old)
733      int line0, line1, deleted, inserted;
734      struct change *old;
735 {
736   struct change *new = (struct change *) xmalloc (sizeof (struct change));
737 
738   new->line0 = line0;
739   new->line1 = line1;
740   new->inserted = inserted;
741   new->deleted = deleted;
742   new->link = old;
743   return new;
744 }
745 
746 /* Scan the tables of which lines are inserted and deleted,
747    producing an edit script in reverse order.  */
748 
749 static struct change *
750 build_reverse_script (filevec)
751      struct file_data const filevec[];
752 {
753   struct change *script = 0;
754   char *changed0 = filevec[0].changed_flag;
755   char *changed1 = filevec[1].changed_flag;
756   int len0 = filevec[0].buffered_lines;
757   int len1 = filevec[1].buffered_lines;
758 
759   /* Note that changedN[len0] does exist, and contains 0.  */
760 
761   int i0 = 0, i1 = 0;
762 
763   while (i0 < len0 || i1 < len1)
764     {
765       if (changed0[i0] || changed1[i1])
766 	{
767 	  int line0 = i0, line1 = i1;
768 
769 	  /* Find # lines changed here in each file.  */
770 	  while (changed0[i0]) ++i0;
771 	  while (changed1[i1]) ++i1;
772 
773 	  /* Record this change.  */
774 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
775 	}
776 
777       /* We have reached lines in the two files that match each other.  */
778       i0++, i1++;
779     }
780 
781   return script;
782 }
783 
784 /* Scan the tables of which lines are inserted and deleted,
785    producing an edit script in forward order.  */
786 
787 static struct change *
788 build_script (filevec)
789      struct file_data const filevec[];
790 {
791   struct change *script = 0;
792   char *changed0 = filevec[0].changed_flag;
793   char *changed1 = filevec[1].changed_flag;
794   int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
795 
796   /* Note that changedN[-1] does exist, and contains 0.  */
797 
798   while (i0 >= 0 || i1 >= 0)
799     {
800       if (changed0[i0 - 1] || changed1[i1 - 1])
801 	{
802 	  int line0 = i0, line1 = i1;
803 
804 	  /* Find # lines changed here in each file.  */
805 	  while (changed0[i0 - 1]) --i0;
806 	  while (changed1[i1 - 1]) --i1;
807 
808 	  /* Record this change.  */
809 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
810 	}
811 
812       /* We have reached lines in the two files that match each other.  */
813       i0--, i1--;
814     }
815 
816   return script;
817 }
818 
819 /* If CHANGES, briefly report that two files differed.  */
820 static void
821 briefly_report (changes, filevec)
822      int changes;
823      struct file_data const filevec[];
824 {
825   if (changes)
826     message (no_details_flag ? "Files %s and %s differ\n"
827 	     : "Binary files %s and %s differ\n",
828 	     filevec[0].name, filevec[1].name);
829 }
830 
831 /* Report the differences of two files.  DEPTH is the current directory
832    depth. */
833 int
834 diff_2_files (filevec, depth)
835      struct file_data filevec[];
836      int depth;
837 {
838   int diags;
839   int i;
840   struct change *e, *p;
841   struct change *script;
842   int changes;
843 
844 
845   /* If we have detected that either file is binary,
846      compare the two files as binary.  This can happen
847      only when the first chunk is read.
848      Also, --brief without any --ignore-* options means
849      we can speed things up by treating the files as binary.  */
850 
851   if (read_files (filevec, no_details_flag & ~ignore_some_changes))
852     {
853       /* Files with different lengths must be different.  */
854       if (filevec[0].stat.st_size != filevec[1].stat.st_size
855 	  && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode))
856 	  && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode)))
857 	changes = 1;
858 
859       /* Standard input equals itself.  */
860       else if (filevec[0].desc == filevec[1].desc)
861 	changes = 0;
862 
863       else
864 	/* Scan both files, a buffer at a time, looking for a difference.  */
865 	{
866 	  /* Allocate same-sized buffers for both files.  */
867 	  size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat),
868 					   STAT_BLOCKSIZE (filevec[1].stat));
869 	  for (i = 0; i < 2; i++)
870 	    filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size);
871 
872 	  for (;;  filevec[0].buffered_chars = filevec[1].buffered_chars = 0)
873 	    {
874 	      /* Read a buffer's worth from both files.  */
875 	      for (i = 0; i < 2; i++)
876 		if (0 <= filevec[i].desc)
877 		  while (filevec[i].buffered_chars != buffer_size)
878 		    {
879 		      int r = read (filevec[i].desc,
880 				    filevec[i].buffer
881 				    + filevec[i].buffered_chars,
882 				    buffer_size - filevec[i].buffered_chars);
883 		      if (r == 0)
884 			break;
885 		      if (r < 0)
886 			pfatal_with_name (filevec[i].name);
887 		      filevec[i].buffered_chars += r;
888 		    }
889 
890 	      /* If the buffers differ, the files differ.  */
891 	      if (filevec[0].buffered_chars != filevec[1].buffered_chars
892 		  || (filevec[0].buffered_chars != 0
893 		      && memcmp (filevec[0].buffer,
894 				 filevec[1].buffer,
895 				 filevec[0].buffered_chars) != 0))
896 		{
897 		  changes = 1;
898 		  break;
899 		}
900 
901 	      /* If we reach end of file, the files are the same.  */
902 	      if (filevec[0].buffered_chars != buffer_size)
903 		{
904 		  changes = 0;
905 		  break;
906 		}
907 	    }
908 	}
909 
910       briefly_report (changes, filevec);
911     }
912   else
913     {
914       /* Allocate vectors for the results of comparison:
915 	 a flag for each line of each file, saying whether that line
916 	 is an insertion or deletion.
917 	 Allocate an extra element, always zero, at each end of each vector.  */
918 
919       size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
920       filevec[0].changed_flag = xmalloc (s);
921       bzero (filevec[0].changed_flag, s);
922       filevec[0].changed_flag++;
923       filevec[1].changed_flag = filevec[0].changed_flag
924 				+ filevec[0].buffered_lines + 2;
925 
926       /* Some lines are obviously insertions or deletions
927 	 because they don't match anything.  Detect them now, and
928 	 avoid even thinking about them in the main comparison algorithm.  */
929 
930       discard_confusing_lines (filevec);
931 
932       /* Now do the main comparison algorithm, considering just the
933 	 undiscarded lines.  */
934 
935       xvec = filevec[0].undiscarded;
936       yvec = filevec[1].undiscarded;
937       diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
938       fdiag = (int *) xmalloc (diags * (2 * sizeof (int)));
939       bdiag = fdiag + diags;
940       fdiag += filevec[1].nondiscarded_lines + 1;
941       bdiag += filevec[1].nondiscarded_lines + 1;
942 
943       /* Set TOO_EXPENSIVE to be approximate square root of input size,
944 	 bounded below by 256.  */
945       too_expensive = 1;
946       for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
947 	   i != 0; i >>= 2)
948 	too_expensive <<= 1;
949       too_expensive = max (256, too_expensive);
950 
951       files[0] = filevec[0];
952       files[1] = filevec[1];
953 
954       compareseq (0, filevec[0].nondiscarded_lines,
955 		  0, filevec[1].nondiscarded_lines, no_discards);
956 
957       free (fdiag - (filevec[1].nondiscarded_lines + 1));
958 
959       /* Modify the results slightly to make them prettier
960 	 in cases where that can validly be done.  */
961 
962       shift_boundaries (filevec);
963 
964       /* Get the results of comparison in the form of a chain
965 	 of `struct change's -- an edit script.  */
966 
967       if (output_style == OUTPUT_ED)
968 	script = build_reverse_script (filevec);
969       else
970 	script = build_script (filevec);
971 
972       /* Set CHANGES if we had any diffs.
973 	 If some changes are ignored, we must scan the script to decide.  */
974       if (ignore_blank_lines_flag || ignore_regexp_list)
975 	{
976 	  struct change *next = script;
977 	  changes = 0;
978 
979 	  while (next && changes == 0)
980 	    {
981 	      struct change *this, *end;
982 	      int first0, last0, first1, last1, deletes, inserts;
983 
984 	      /* Find a set of changes that belong together.  */
985 	      this = next;
986 	      end = find_change (next);
987 
988 	      /* Disconnect them from the rest of the changes, making them
989 		 a hunk, and remember the rest for next iteration.  */
990 	      next = end->link;
991 	      end->link = 0;
992 
993 	      /* Determine whether this hunk is really a difference.  */
994 	      analyze_hunk (this, &first0, &last0, &first1, &last1,
995 			    &deletes, &inserts);
996 
997 	      /* Reconnect the script so it will all be freed properly.  */
998 	      end->link = next;
999 
1000 	      if (deletes || inserts)
1001 		changes = 1;
1002 	    }
1003 	}
1004       else
1005 	changes = (script != 0);
1006 
1007       if (no_details_flag)
1008 	briefly_report (changes, filevec);
1009       else
1010 	{
1011 	  if (changes || ! no_diff_means_no_output)
1012 	    {
1013 	      /* Record info for starting up output,
1014 		 to be used if and when we have some output to print.  */
1015 	      setup_output (files[0].name, files[1].name, depth);
1016 
1017 	      switch (output_style)
1018 		{
1019 		case OUTPUT_CONTEXT:
1020 		  print_context_script (script, 0);
1021 		  break;
1022 
1023 		case OUTPUT_UNIFIED:
1024 		  print_context_script (script, 1);
1025 		  break;
1026 
1027 		case OUTPUT_ED:
1028 		  print_ed_script (script);
1029 		  break;
1030 
1031 		case OUTPUT_FORWARD_ED:
1032 		  pr_forward_ed_script (script);
1033 		  break;
1034 
1035 		case OUTPUT_RCS:
1036 		  print_rcs_script (script);
1037 		  break;
1038 
1039 		case OUTPUT_NORMAL:
1040 		  print_normal_script (script);
1041 		  break;
1042 
1043 		case OUTPUT_IFDEF:
1044 		  print_ifdef_script (script);
1045 		  break;
1046 
1047 		case OUTPUT_SDIFF:
1048 		  print_sdiff_script (script);
1049 		}
1050 
1051 	      finish_output ();
1052 	    }
1053 	}
1054 
1055       free (filevec[0].undiscarded);
1056 
1057       free (filevec[0].changed_flag - 1);
1058 
1059       for (i = 1; i >= 0; --i)
1060 	free (filevec[i].equivs);
1061 
1062       for (i = 0; i < 2; ++i)
1063 	free (filevec[i].linbuf + filevec[i].linbuf_base);
1064 
1065       for (e = script; e; e = p)
1066 	{
1067 	  p = e->link;
1068 	  free (e);
1069 	}
1070 
1071       if (! ROBUST_OUTPUT_STYLE (output_style))
1072 	for (i = 0; i < 2; ++i)
1073 	  if (filevec[i].missing_newline)
1074 	    {
1075 	      diff_error ("No newline at end of file %s", filevec[i].name, "");
1076 	      changes = 2;
1077 	    }
1078     }
1079 
1080   if (filevec[0].buffer != filevec[1].buffer)
1081     free (filevec[0].buffer);
1082   free (filevec[1].buffer);
1083 
1084   return changes;
1085 }
1086