1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
29
30 #include <assert.h>
31
32 #include <arraylist.h>
33 #include <diff_main.h>
34
35 #include "diff_internal.h"
36 #include "diff_debug.h"
37
38 inline enum diff_chunk_type
diff_chunk_type(const struct diff_chunk * chunk)39 diff_chunk_type(const struct diff_chunk *chunk)
40 {
41 if (!chunk->left_count && !chunk->right_count)
42 return CHUNK_EMPTY;
43 if (!chunk->solved)
44 return CHUNK_ERROR;
45 if (!chunk->right_count)
46 return CHUNK_MINUS;
47 if (!chunk->left_count)
48 return CHUNK_PLUS;
49 if (chunk->left_count != chunk->right_count)
50 return CHUNK_ERROR;
51 return CHUNK_SAME;
52 }
53
54 static int
read_at(FILE * f,off_t at_pos,unsigned char * buf,size_t len)55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56 {
57 ssize_t r;
58
59 if (fseeko(f, at_pos, SEEK_SET) == -1)
60 return errno;
61 r = fread(buf, sizeof(char), len, f);
62 if ((r == 0 || r < len) && ferror(f))
63 return EIO;
64 if (r != len)
65 return EIO;
66 return 0;
67 }
68
69 static int
buf_cmp(const unsigned char * left,size_t left_len,const unsigned char * right,size_t right_len,bool ignore_whitespace)70 buf_cmp(const unsigned char *left, size_t left_len,
71 const unsigned char *right, size_t right_len,
72 bool ignore_whitespace)
73 {
74 int cmp;
75
76 if (ignore_whitespace) {
77 int il = 0, ir = 0;
78 while (il < left_len && ir < right_len) {
79 unsigned char cl = left[il];
80 unsigned char cr = right[ir];
81
82 if (isspace((unsigned char)cl) && il < left_len) {
83 il++;
84 continue;
85 }
86 if (isspace((unsigned char)cr) && ir < right_len) {
87 ir++;
88 continue;
89 }
90
91 if (cl > cr)
92 return 1;
93 if (cr > cl)
94 return -1;
95 il++;
96 ir++;
97 }
98 while (il < left_len) {
99 unsigned char cl = left[il++];
100 if (!isspace((unsigned char)cl))
101 return 1;
102 }
103 while (ir < right_len) {
104 unsigned char cr = right[ir++];
105 if (!isspace((unsigned char)cr))
106 return -1;
107 }
108
109 return 0;
110 }
111
112 cmp = memcmp(left, right, MIN(left_len, right_len));
113 if (cmp)
114 return cmp;
115 if (left_len == right_len)
116 return 0;
117 return (left_len > right_len) ? 1 : -1;
118 }
119
120 int
diff_atom_cmp(int * cmp,const struct diff_atom * left,const struct diff_atom * right)121 diff_atom_cmp(int *cmp,
122 const struct diff_atom *left,
123 const struct diff_atom *right)
124 {
125 off_t remain_left, remain_right;
126 int flags = (left->root->diff_flags | right->root->diff_flags);
127 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
128
129 if (!left->len && !right->len) {
130 *cmp = 0;
131 return 0;
132 }
133 if (!ignore_whitespace) {
134 if (!right->len) {
135 *cmp = 1;
136 return 0;
137 }
138 if (!left->len) {
139 *cmp = -1;
140 return 0;
141 }
142 }
143
144 if (left->at != NULL && right->at != NULL) {
145 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
146 ignore_whitespace);
147 return 0;
148 }
149
150 remain_left = left->len;
151 remain_right = right->len;
152 while (remain_left > 0 || remain_right > 0) {
153 const size_t chunksz = 8192;
154 unsigned char buf_left[chunksz], buf_right[chunksz];
155 const uint8_t *p_left, *p_right;
156 off_t n_left, n_right;
157 int r;
158
159 if (!remain_right) {
160 *cmp = 1;
161 return 0;
162 }
163 if (!remain_left) {
164 *cmp = -1;
165 return 0;
166 }
167
168 n_left = MIN(chunksz, remain_left);
169 n_right = MIN(chunksz, remain_right);
170
171 if (left->at == NULL) {
172 r = read_at(left->root->f,
173 left->pos + (left->len - remain_left),
174 buf_left, n_left);
175 if (r) {
176 *cmp = 0;
177 return r;
178 }
179 p_left = buf_left;
180 } else {
181 p_left = left->at + (left->len - remain_left);
182 }
183
184 if (right->at == NULL) {
185 r = read_at(right->root->f,
186 right->pos + (right->len - remain_right),
187 buf_right, n_right);
188 if (r) {
189 *cmp = 0;
190 return r;
191 }
192 p_right = buf_right;
193 } else {
194 p_right = right->at + (right->len - remain_right);
195 }
196
197 r = buf_cmp(p_left, n_left, p_right, n_right,
198 ignore_whitespace);
199 if (r) {
200 *cmp = r;
201 return 0;
202 }
203
204 remain_left -= n_left;
205 remain_right -= n_right;
206 }
207
208 *cmp = 0;
209 return 0;
210 }
211
212 int
diff_atom_same(bool * same,const struct diff_atom * left,const struct diff_atom * right)213 diff_atom_same(bool *same,
214 const struct diff_atom *left,
215 const struct diff_atom *right)
216 {
217 int cmp;
218 int r;
219 if (left->hash != right->hash) {
220 *same = false;
221 return 0;
222 }
223 r = diff_atom_cmp(&cmp, left, right);
224 if (r) {
225 *same = true;
226 return r;
227 }
228 *same = (cmp == 0);
229 return 0;
230 }
231
232 static struct diff_chunk *
diff_state_add_solved_chunk(struct diff_state * state,const struct diff_chunk * chunk)233 diff_state_add_solved_chunk(struct diff_state *state,
234 const struct diff_chunk *chunk)
235 {
236 diff_chunk_arraylist_t *result;
237 struct diff_chunk *new_chunk;
238 enum diff_chunk_type last_t;
239 enum diff_chunk_type new_t;
240 struct diff_chunk *last;
241
242 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
243 * never directly follows a plus chunk. */
244 result = &state->result->chunks;
245
246 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
247 : CHUNK_EMPTY;
248 new_t = diff_chunk_type(chunk);
249
250 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
251 result->len);
252 debug("L\n");
253 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
254 debug("R\n");
255 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
256
257 if (result->len) {
258 last = &result->head[result->len - 1];
259 assert(chunk->left_start
260 == last->left_start + last->left_count);
261 assert(chunk->right_start
262 == last->right_start + last->right_count);
263 }
264
265 if (new_t == last_t) {
266 new_chunk = &result->head[result->len - 1];
267 new_chunk->left_count += chunk->left_count;
268 new_chunk->right_count += chunk->right_count;
269 debug(" - added chunk touches previous one of same type, joined:\n");
270 debug("L\n");
271 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
272 debug("R\n");
273 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
274 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
275 enum diff_chunk_type prev_last_t =
276 result->len > 1 ?
277 diff_chunk_type(&result->head[result->len - 2])
278 : CHUNK_EMPTY;
279 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
280 * Is the one before that also a minus? combine. */
281 if (prev_last_t == CHUNK_MINUS) {
282 new_chunk = &result->head[result->len - 2];
283 new_chunk->left_count += chunk->left_count;
284 new_chunk->right_count += chunk->right_count;
285
286 debug(" - added minus-chunk follows plus-chunk,"
287 " put before that plus-chunk and joined"
288 " with preceding minus-chunk:\n");
289 debug("L\n");
290 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
291 debug("R\n");
292 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
293 } else {
294 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
295 if (!new_chunk)
296 return NULL;
297 *new_chunk = *chunk;
298
299 /* The new minus chunk indicates to which position on
300 * the right it corresponds, even though it doesn't add
301 * any lines on the right. By moving above a plus chunk,
302 * that position on the right has shifted. */
303 last = &result->head[result->len - 1];
304 new_chunk->right_start = last->right_start;
305
306 debug(" - added minus-chunk follows plus-chunk,"
307 " put before that plus-chunk\n");
308 }
309
310 /* That last_t == CHUNK_PLUS indicates to which position on the
311 * left it corresponds, even though it doesn't add any lines on
312 * the left. By inserting/extending the prev_last_t ==
313 * CHUNK_MINUS, that position on the left has shifted. */
314 last = &result->head[result->len - 1];
315 last->left_start = new_chunk->left_start
316 + new_chunk->left_count;
317
318 } else {
319 ARRAYLIST_ADD(new_chunk, *result);
320 if (!new_chunk)
321 return NULL;
322 *new_chunk = *chunk;
323 }
324 return new_chunk;
325 }
326
327 /* Even if a left or right side is empty, diff output may need to know the
328 * position in that file.
329 * So left_start or right_start must never be NULL -- pass left_count or
330 * right_count as zero to indicate staying at that position without consuming
331 * any lines. */
332 struct diff_chunk *
diff_state_add_chunk(struct diff_state * state,bool solved,struct diff_atom * left_start,unsigned int left_count,struct diff_atom * right_start,unsigned int right_count)333 diff_state_add_chunk(struct diff_state *state, bool solved,
334 struct diff_atom *left_start, unsigned int left_count,
335 struct diff_atom *right_start, unsigned int right_count)
336 {
337 struct diff_chunk *new_chunk;
338 struct diff_chunk chunk = {
339 .solved = solved,
340 .left_start = left_start,
341 .left_count = left_count,
342 .right_start = right_start,
343 .right_count = right_count,
344 };
345
346 /* An unsolved chunk means store as intermediate result for later
347 * re-iteration.
348 * If there already are intermediate results, that means even a
349 * following solved chunk needs to go to intermediate results, so that
350 * it is later put in the final correct position in solved chunks.
351 */
352 if (!solved || state->temp_result.len) {
353 /* Append to temp_result */
354 debug("ADD %s chunk to temp result:\n",
355 chunk.solved ? "solved" : "UNSOLVED");
356 debug("L\n");
357 debug_dump_atoms(&state->left, left_start, left_count);
358 debug("R\n");
359 debug_dump_atoms(&state->right, right_start, right_count);
360 ARRAYLIST_ADD(new_chunk, state->temp_result);
361 if (!new_chunk)
362 return NULL;
363 *new_chunk = chunk;
364 return new_chunk;
365 }
366
367 return diff_state_add_solved_chunk(state, &chunk);
368 }
369
370 static void
diff_data_init_root(struct diff_data * d,FILE * f,const uint8_t * data,unsigned long long len,int diff_flags)371 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
372 unsigned long long len, int diff_flags)
373 {
374 *d = (struct diff_data){
375 .f = f,
376 .pos = 0,
377 .data = data,
378 .len = len,
379 .root = d,
380 .diff_flags = diff_flags,
381 };
382 }
383
384 void
diff_data_init_subsection(struct diff_data * d,struct diff_data * parent,struct diff_atom * from_atom,unsigned int atoms_count)385 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
386 struct diff_atom *from_atom, unsigned int atoms_count)
387 {
388 struct diff_atom *last_atom;
389
390 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
391 d, parent, from_atom, atoms_count);
392 debug(" from_atom ");
393 debug_dump_atom(parent, NULL, from_atom);
394
395 if (atoms_count == 0) {
396 *d = (struct diff_data){
397 .f = NULL,
398 .pos = 0,
399 .data = NULL,
400 .len = 0,
401 .root = parent->root,
402 .atoms.head = NULL,
403 .atoms.len = atoms_count,
404 };
405
406 return;
407 }
408
409 last_atom = from_atom + atoms_count - 1;
410 *d = (struct diff_data){
411 .f = NULL,
412 .pos = from_atom->pos,
413 .data = from_atom->at,
414 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
415 .root = parent->root,
416 .atoms.head = from_atom,
417 .atoms.len = atoms_count,
418 };
419
420 debug("subsection:\n");
421 debug_dump(d);
422 }
423
424 void
diff_data_free(struct diff_data * diff_data)425 diff_data_free(struct diff_data *diff_data)
426 {
427 if (!diff_data)
428 return;
429 if (diff_data->atoms.allocated)
430 ARRAYLIST_FREE(diff_data->atoms);
431 }
432
433 int
diff_algo_none(const struct diff_algo_config * algo_config,struct diff_state * state)434 diff_algo_none(const struct diff_algo_config *algo_config,
435 struct diff_state *state)
436 {
437 debug("\n** %s\n", __func__);
438 debug("left:\n");
439 debug_dump(&state->left);
440 debug("right:\n");
441 debug_dump(&state->right);
442 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
443 0);
444
445 /* Add a chunk of equal lines, if any */
446 struct diff_atom *l = state->left.atoms.head;
447 unsigned int l_len = state->left.atoms.len;
448 struct diff_atom *r = state->right.atoms.head;
449 unsigned int r_len = state->right.atoms.len;
450 unsigned int equal_atoms_start = 0;
451 unsigned int equal_atoms_end = 0;
452 unsigned int l_idx = 0;
453 unsigned int r_idx = 0;
454
455 while (equal_atoms_start < l_len
456 && equal_atoms_start < r_len) {
457 int err;
458 bool same;
459 err = diff_atom_same(&same, &l[equal_atoms_start],
460 &r[equal_atoms_start]);
461 if (err)
462 return err;
463 if (!same)
464 break;
465 equal_atoms_start++;
466 }
467 while (equal_atoms_end < (l_len - equal_atoms_start)
468 && equal_atoms_end < (r_len - equal_atoms_start)) {
469 int err;
470 bool same;
471 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
472 &r[r_len - 1 - equal_atoms_end]);
473 if (err)
474 return err;
475 if (!same)
476 break;
477 equal_atoms_end++;
478 }
479
480 /* Add a chunk of equal lines at the start */
481 if (equal_atoms_start) {
482 if (!diff_state_add_chunk(state, true,
483 l, equal_atoms_start,
484 r, equal_atoms_start))
485 return ENOMEM;
486 l_idx += equal_atoms_start;
487 r_idx += equal_atoms_start;
488 }
489
490 /* Add a "minus" chunk with all lines from the left. */
491 if (equal_atoms_start + equal_atoms_end < l_len) {
492 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
493 if (!diff_state_add_chunk(state, true,
494 &l[l_idx], add_len,
495 &r[r_idx], 0))
496 return ENOMEM;
497 l_idx += add_len;
498 }
499
500 /* Add a "plus" chunk with all lines from the right. */
501 if (equal_atoms_start + equal_atoms_end < r_len) {
502 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
503 if (!diff_state_add_chunk(state, true,
504 &l[l_idx], 0,
505 &r[r_idx], add_len))
506 return ENOMEM;
507 r_idx += add_len;
508 }
509
510 /* Add a chunk of equal lines at the end */
511 if (equal_atoms_end) {
512 if (!diff_state_add_chunk(state, true,
513 &l[l_idx], equal_atoms_end,
514 &r[r_idx], equal_atoms_end))
515 return ENOMEM;
516 }
517
518 return DIFF_RC_OK;
519 }
520
521 static int
diff_run_algo(const struct diff_algo_config * algo_config,struct diff_state * state)522 diff_run_algo(const struct diff_algo_config *algo_config,
523 struct diff_state *state)
524 {
525 int rc;
526
527 if (!algo_config || !algo_config->impl
528 || !state->recursion_depth_left
529 || !state->left.atoms.len || !state->right.atoms.len) {
530 debug("Fall back to diff_algo_none():%s%s%s\n",
531 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
532 (!state->recursion_depth_left) ? " max-depth" : "",
533 (!state->left.atoms.len || !state->right.atoms.len)?
534 " trivial" : "");
535 return diff_algo_none(algo_config, state);
536 }
537
538 ARRAYLIST_FREE(state->temp_result);
539 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
540 rc = algo_config->impl(algo_config, state);
541 switch (rc) {
542 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
543 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
544 algo_config->fallback_algo);
545 rc = diff_run_algo(algo_config->fallback_algo, state);
546 goto return_rc;
547
548 case DIFF_RC_OK:
549 /* continue below */
550 break;
551
552 default:
553 /* some error happened */
554 goto return_rc;
555 }
556
557 /* Pick up any diff chunks that are still unsolved and feed to
558 * inner_algo. inner_algo will solve unsolved chunks and append to
559 * result, and subsequent solved chunks on this level are then appended
560 * to result afterwards. */
561 int i;
562 for (i = 0; i < state->temp_result.len; i++) {
563 struct diff_chunk *c = &state->temp_result.head[i];
564 if (c->solved) {
565 diff_state_add_solved_chunk(state, c);
566 continue;
567 }
568
569 /* c is an unsolved chunk, feed to inner_algo */
570 struct diff_state inner_state = {
571 .result = state->result,
572 .recursion_depth_left = state->recursion_depth_left - 1,
573 .kd_buf = state->kd_buf,
574 .kd_buf_size = state->kd_buf_size,
575 };
576 diff_data_init_subsection(&inner_state.left, &state->left,
577 c->left_start, c->left_count);
578 diff_data_init_subsection(&inner_state.right, &state->right,
579 c->right_start, c->right_count);
580
581 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
582 state->kd_buf = inner_state.kd_buf;
583 state->kd_buf_size = inner_state.kd_buf_size;
584 if (rc != DIFF_RC_OK)
585 goto return_rc;
586 }
587
588 rc = DIFF_RC_OK;
589 return_rc:
590 ARRAYLIST_FREE(state->temp_result);
591 return rc;
592 }
593
594 int
diff_atomize_file(struct diff_data * d,const struct diff_config * config,FILE * f,const uint8_t * data,off_t len,int diff_flags)595 diff_atomize_file(struct diff_data *d,
596 const struct diff_config *config,
597 FILE *f, const uint8_t *data, off_t len, int diff_flags)
598 {
599 if (!config->atomize_func)
600 return EINVAL;
601
602 diff_data_init_root(d, f, data, len, diff_flags);
603
604 return config->atomize_func(config->atomize_func_data, d);
605
606 }
607
608 struct diff_result *
diff_main(const struct diff_config * config,struct diff_data * left,struct diff_data * right)609 diff_main(const struct diff_config *config, struct diff_data *left,
610 struct diff_data *right)
611 {
612 struct diff_result *result = malloc(sizeof(struct diff_result));
613 if (!result)
614 return NULL;
615
616 *result = (struct diff_result){};
617 result->left = left;
618 result->right = right;
619
620 struct diff_state state = {
621 .result = result,
622 .recursion_depth_left = config->max_recursion_depth ?
623 config->max_recursion_depth : UINT_MAX,
624 .kd_buf = NULL,
625 .kd_buf_size = 0,
626 };
627 diff_data_init_subsection(&state.left, left,
628 left->atoms.head,
629 left->atoms.len);
630 diff_data_init_subsection(&state.right, right,
631 right->atoms.head,
632 right->atoms.len);
633
634 result->rc = diff_run_algo(config->algo, &state);
635 free(state.kd_buf);
636
637 return result;
638 }
639
640 void
diff_result_free(struct diff_result * result)641 diff_result_free(struct diff_result *result)
642 {
643 if (!result)
644 return;
645 ARRAYLIST_FREE(result->chunks);
646 free(result);
647 }
648
649 int
diff_result_contains_printable_chunks(struct diff_result * result)650 diff_result_contains_printable_chunks(struct diff_result *result)
651 {
652 struct diff_chunk *c;
653 enum diff_chunk_type t;
654 int i;
655
656 for (i = 0; i < result->chunks.len; i++) {
657 c = &result->chunks.head[i];
658 t = diff_chunk_type(c);
659 if (t == CHUNK_MINUS || t == CHUNK_PLUS)
660 return 1;
661 }
662
663 return 0;
664 }
665