1*59c8e88eSDag-Erling Smørgrav /* Generic infrastructure to implement various diff algorithms. */ 2*59c8e88eSDag-Erling Smørgrav /* 3*59c8e88eSDag-Erling Smørgrav * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4*59c8e88eSDag-Erling Smørgrav * 5*59c8e88eSDag-Erling Smørgrav * Permission to use, copy, modify, and distribute this software for any 6*59c8e88eSDag-Erling Smørgrav * purpose with or without fee is hereby granted, provided that the above 7*59c8e88eSDag-Erling Smørgrav * copyright notice and this permission notice appear in all copies. 8*59c8e88eSDag-Erling Smørgrav * 9*59c8e88eSDag-Erling Smørgrav * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10*59c8e88eSDag-Erling Smørgrav * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11*59c8e88eSDag-Erling Smørgrav * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12*59c8e88eSDag-Erling Smørgrav * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13*59c8e88eSDag-Erling Smørgrav * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14*59c8e88eSDag-Erling Smørgrav * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15*59c8e88eSDag-Erling Smørgrav * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16*59c8e88eSDag-Erling Smørgrav */ 17*59c8e88eSDag-Erling Smørgrav 18*59c8e88eSDag-Erling Smørgrav struct diff_range { 19*59c8e88eSDag-Erling Smørgrav int start; 20*59c8e88eSDag-Erling Smørgrav int end; 21*59c8e88eSDag-Erling Smørgrav }; 22*59c8e88eSDag-Erling Smørgrav 23*59c8e88eSDag-Erling Smørgrav /* List of all possible return codes of a diff invocation. */ 24*59c8e88eSDag-Erling Smørgrav #define DIFF_RC_USE_DIFF_ALGO_FALLBACK -1 25*59c8e88eSDag-Erling Smørgrav #define DIFF_RC_OK 0 26*59c8e88eSDag-Erling Smørgrav /* Any positive return values are errno values from sys/errno.h */ 27*59c8e88eSDag-Erling Smørgrav 28*59c8e88eSDag-Erling Smørgrav struct diff_atom { 29*59c8e88eSDag-Erling Smørgrav struct diff_data *root; /* back pointer to root diff data */ 30*59c8e88eSDag-Erling Smørgrav 31*59c8e88eSDag-Erling Smørgrav off_t pos; /* set whether memory-mapped or not */ 32*59c8e88eSDag-Erling Smørgrav const uint8_t *at; /* only set if memory-mapped */ 33*59c8e88eSDag-Erling Smørgrav off_t len; 34*59c8e88eSDag-Erling Smørgrav 35*59c8e88eSDag-Erling Smørgrav /* This hash is just a very cheap speed up for finding *mismatching* 36*59c8e88eSDag-Erling Smørgrav * atoms. When hashes match, we still need to compare entire atoms to 37*59c8e88eSDag-Erling Smørgrav * find out whether they are indeed identical or not. 38*59c8e88eSDag-Erling Smørgrav * Calculated over all atom bytes with diff_atom_hash_update(). */ 39*59c8e88eSDag-Erling Smørgrav unsigned int hash; 40*59c8e88eSDag-Erling Smørgrav }; 41*59c8e88eSDag-Erling Smørgrav 42*59c8e88eSDag-Erling Smørgrav /* Mix another atom_byte into the provided hash value and return the result. 43*59c8e88eSDag-Erling Smørgrav * The hash value passed in for the first byte of the atom must be zero. */ 44*59c8e88eSDag-Erling Smørgrav unsigned int 45*59c8e88eSDag-Erling Smørgrav diff_atom_hash_update(unsigned int hash, unsigned char atom_byte); 46*59c8e88eSDag-Erling Smørgrav 47*59c8e88eSDag-Erling Smørgrav /* Compare two atoms for equality. Return 0 on success, or errno on failure. 48*59c8e88eSDag-Erling Smørgrav * Set cmp to -1, 0, or 1, just like strcmp(). */ 49*59c8e88eSDag-Erling Smørgrav int 50*59c8e88eSDag-Erling Smørgrav diff_atom_cmp(int *cmp, 51*59c8e88eSDag-Erling Smørgrav const struct diff_atom *left, 52*59c8e88eSDag-Erling Smørgrav const struct diff_atom *right); 53*59c8e88eSDag-Erling Smørgrav 54*59c8e88eSDag-Erling Smørgrav 55*59c8e88eSDag-Erling Smørgrav /* The atom's index in the entire file. For atoms divided by lines of text, this 56*59c8e88eSDag-Erling Smørgrav * yields the line number (starting with 0). Also works for diff_data that 57*59c8e88eSDag-Erling Smørgrav * reference only a subsection of a file, always reflecting the global position 58*59c8e88eSDag-Erling Smørgrav * in the file (and not the relative position within the subsection). */ 59*59c8e88eSDag-Erling Smørgrav #define diff_atom_root_idx(DIFF_DATA, ATOM) \ 60*59c8e88eSDag-Erling Smørgrav ((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \ 61*59c8e88eSDag-Erling Smørgrav ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \ 62*59c8e88eSDag-Erling Smørgrav : (DIFF_DATA)->root->atoms.len) 63*59c8e88eSDag-Erling Smørgrav 64*59c8e88eSDag-Erling Smørgrav /* The atom's index within DIFF_DATA. For atoms divided by lines of text, this 65*59c8e88eSDag-Erling Smørgrav * yields the line number (starting with 0). */ 66*59c8e88eSDag-Erling Smørgrav #define diff_atom_idx(DIFF_DATA, ATOM) \ 67*59c8e88eSDag-Erling Smørgrav ((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 68*59c8e88eSDag-Erling Smørgrav ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \ 69*59c8e88eSDag-Erling Smørgrav : (DIFF_DATA)->atoms.len) 70*59c8e88eSDag-Erling Smørgrav 71*59c8e88eSDag-Erling Smørgrav #define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ 72*59c8e88eSDag-Erling Smørgrav for ((ATOM) = (FIRST_ATOM); \ 73*59c8e88eSDag-Erling Smørgrav (ATOM) \ 74*59c8e88eSDag-Erling Smørgrav && ((ATOM) >= (FIRST_ATOM)) \ 75*59c8e88eSDag-Erling Smørgrav && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ 76*59c8e88eSDag-Erling Smørgrav (ATOM)++) 77*59c8e88eSDag-Erling Smørgrav 78*59c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom(ATOM, DIFF_DATA) \ 79*59c8e88eSDag-Erling Smørgrav foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) 80*59c8e88eSDag-Erling Smørgrav 81*59c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ 82*59c8e88eSDag-Erling Smørgrav for ((ATOM) = (FROM); \ 83*59c8e88eSDag-Erling Smørgrav (ATOM) \ 84*59c8e88eSDag-Erling Smørgrav && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 85*59c8e88eSDag-Erling Smørgrav && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ 86*59c8e88eSDag-Erling Smørgrav (ATOM)++) 87*59c8e88eSDag-Erling Smørgrav 88*59c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \ 89*59c8e88eSDag-Erling Smørgrav for ((ATOM) = (FROM); \ 90*59c8e88eSDag-Erling Smørgrav (ATOM) \ 91*59c8e88eSDag-Erling Smørgrav && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 92*59c8e88eSDag-Erling Smørgrav && ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \ 93*59c8e88eSDag-Erling Smørgrav (ATOM)--) 94*59c8e88eSDag-Erling Smørgrav 95*59c8e88eSDag-Erling Smørgrav /* For each file, there is a "root" struct diff_data referencing the entire 96*59c8e88eSDag-Erling Smørgrav * file, which the atoms are parsed from. In recursion of diff algorithm, there 97*59c8e88eSDag-Erling Smørgrav * may be "child" struct diff_data only referencing a subsection of the file, 98*59c8e88eSDag-Erling Smørgrav * re-using the atoms parsing. For "root" structs, atoms_allocated will be 99*59c8e88eSDag-Erling Smørgrav * nonzero, indicating that the array of atoms is owned by that struct. For 100*59c8e88eSDag-Erling Smørgrav * "child" structs, atoms_allocated == 0, to indicate that the struct is 101*59c8e88eSDag-Erling Smørgrav * referencing a subset of atoms. */ 102*59c8e88eSDag-Erling Smørgrav struct diff_data { 103*59c8e88eSDag-Erling Smørgrav FILE *f; /* if root diff_data and not memory-mapped */ 104*59c8e88eSDag-Erling Smørgrav off_t pos; /* if not memory-mapped */ 105*59c8e88eSDag-Erling Smørgrav const uint8_t *data; /* if memory-mapped */ 106*59c8e88eSDag-Erling Smørgrav off_t len; 107*59c8e88eSDag-Erling Smørgrav 108*59c8e88eSDag-Erling Smørgrav int atomizer_flags; 109*59c8e88eSDag-Erling Smørgrav ARRAYLIST(struct diff_atom) atoms; 110*59c8e88eSDag-Erling Smørgrav struct diff_data *root; 111*59c8e88eSDag-Erling Smørgrav struct diff_data *current; 112*59c8e88eSDag-Erling Smørgrav void *algo_data; 113*59c8e88eSDag-Erling Smørgrav 114*59c8e88eSDag-Erling Smørgrav int diff_flags; 115*59c8e88eSDag-Erling Smørgrav 116*59c8e88eSDag-Erling Smørgrav int err; 117*59c8e88eSDag-Erling Smørgrav }; 118*59c8e88eSDag-Erling Smørgrav 119*59c8e88eSDag-Erling Smørgrav /* Flags set by file atomizer. */ 120*59c8e88eSDag-Erling Smørgrav #define DIFF_ATOMIZER_FOUND_BINARY_DATA 0x00000001 121*59c8e88eSDag-Erling Smørgrav 122*59c8e88eSDag-Erling Smørgrav /* Flags set by caller of diff_main(). */ 123*59c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001 124*59c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_SHOW_PROTOTYPES 0x00000002 125*59c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_FORCE_TEXT_DATA 0x00000004 126*59c8e88eSDag-Erling Smørgrav 127*59c8e88eSDag-Erling Smørgrav void diff_data_free(struct diff_data *diff_data); 128*59c8e88eSDag-Erling Smørgrav 129*59c8e88eSDag-Erling Smørgrav struct diff_chunk; 130*59c8e88eSDag-Erling Smørgrav typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; 131*59c8e88eSDag-Erling Smørgrav 132*59c8e88eSDag-Erling Smørgrav struct diff_result { 133*59c8e88eSDag-Erling Smørgrav int rc; 134*59c8e88eSDag-Erling Smørgrav 135*59c8e88eSDag-Erling Smørgrav /* 136*59c8e88eSDag-Erling Smørgrav * Pointers to diff data passed in via diff_main. 137*59c8e88eSDag-Erling Smørgrav * Do not free these diff_data before freeing the diff_result struct. 138*59c8e88eSDag-Erling Smørgrav */ 139*59c8e88eSDag-Erling Smørgrav struct diff_data *left; 140*59c8e88eSDag-Erling Smørgrav struct diff_data *right; 141*59c8e88eSDag-Erling Smørgrav 142*59c8e88eSDag-Erling Smørgrav diff_chunk_arraylist_t chunks; 143*59c8e88eSDag-Erling Smørgrav }; 144*59c8e88eSDag-Erling Smørgrav 145*59c8e88eSDag-Erling Smørgrav enum diff_chunk_type { 146*59c8e88eSDag-Erling Smørgrav CHUNK_EMPTY, 147*59c8e88eSDag-Erling Smørgrav CHUNK_PLUS, 148*59c8e88eSDag-Erling Smørgrav CHUNK_MINUS, 149*59c8e88eSDag-Erling Smørgrav CHUNK_SAME, 150*59c8e88eSDag-Erling Smørgrav CHUNK_ERROR, 151*59c8e88eSDag-Erling Smørgrav }; 152*59c8e88eSDag-Erling Smørgrav 153*59c8e88eSDag-Erling Smørgrav enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c); 154*59c8e88eSDag-Erling Smørgrav 155*59c8e88eSDag-Erling Smørgrav struct diff_state; 156*59c8e88eSDag-Erling Smørgrav 157*59c8e88eSDag-Erling Smørgrav /* Signature of a utility function to divide a file into diff atoms. 158*59c8e88eSDag-Erling Smørgrav * An example is diff_atomize_text_by_line() in diff_atomize_text.c. 159*59c8e88eSDag-Erling Smørgrav * 160*59c8e88eSDag-Erling Smørgrav * func_data: context pointer (free to be used by implementation). 161*59c8e88eSDag-Erling Smørgrav * d: struct diff_data with d->data and d->len already set up, and 162*59c8e88eSDag-Erling Smørgrav * d->atoms to be created and d->atomizer_flags to be set up. 163*59c8e88eSDag-Erling Smørgrav */ 164*59c8e88eSDag-Erling Smørgrav typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d); 165*59c8e88eSDag-Erling Smørgrav 166*59c8e88eSDag-Erling Smørgrav extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d); 167*59c8e88eSDag-Erling Smørgrav 168*59c8e88eSDag-Erling Smørgrav struct diff_algo_config; 169*59c8e88eSDag-Erling Smørgrav typedef int (*diff_algo_impl_t)( 170*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 171*59c8e88eSDag-Erling Smørgrav 172*59c8e88eSDag-Erling Smørgrav /* Form a result with all left-side removed and all right-side added, i.e. no 173*59c8e88eSDag-Erling Smørgrav * actual diff algorithm involved. */ 174*59c8e88eSDag-Erling Smørgrav int diff_algo_none(const struct diff_algo_config *algo_config, 175*59c8e88eSDag-Erling Smørgrav struct diff_state *state); 176*59c8e88eSDag-Erling Smørgrav 177*59c8e88eSDag-Erling Smørgrav /* Myers Diff tracing from the start all the way through to the end, requiring 178*59c8e88eSDag-Erling Smørgrav * quadratic amounts of memory. This can fail if the required space surpasses 179*59c8e88eSDag-Erling Smørgrav * algo_config->permitted_state_size. */ 180*59c8e88eSDag-Erling Smørgrav extern int diff_algo_myers(const struct diff_algo_config *algo_config, 181*59c8e88eSDag-Erling Smørgrav struct diff_state *state); 182*59c8e88eSDag-Erling Smørgrav 183*59c8e88eSDag-Erling Smørgrav /* Myers "Divide et Impera": tracing forwards from the start and backwards from 184*59c8e88eSDag-Erling Smørgrav * the end to find a midpoint that divides the problem into smaller chunks. 185*59c8e88eSDag-Erling Smørgrav * Requires only linear amounts of memory. */ 186*59c8e88eSDag-Erling Smørgrav extern int diff_algo_myers_divide( 187*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 188*59c8e88eSDag-Erling Smørgrav 189*59c8e88eSDag-Erling Smørgrav /* Patience Diff algorithm, which divides a larger diff into smaller chunks. For 190*59c8e88eSDag-Erling Smørgrav * very specific scenarios, it may lead to a complete diff result by itself, but 191*59c8e88eSDag-Erling Smørgrav * needs a fallback algo to solve chunks that don't have common-unique atoms. */ 192*59c8e88eSDag-Erling Smørgrav extern int diff_algo_patience( 193*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 194*59c8e88eSDag-Erling Smørgrav 195*59c8e88eSDag-Erling Smørgrav /* Diff algorithms to use, possibly nested. For example: 196*59c8e88eSDag-Erling Smørgrav * 197*59c8e88eSDag-Erling Smørgrav * struct diff_algo_config myers, patience, myers_divide; 198*59c8e88eSDag-Erling Smørgrav * 199*59c8e88eSDag-Erling Smørgrav * myers = (struct diff_algo_config){ 200*59c8e88eSDag-Erling Smørgrav * .impl = diff_algo_myers, 201*59c8e88eSDag-Erling Smørgrav * .permitted_state_size = 32 * 1024 * 1024, 202*59c8e88eSDag-Erling Smørgrav * // When too large, do diff_algo_patience: 203*59c8e88eSDag-Erling Smørgrav * .fallback_algo = &patience, 204*59c8e88eSDag-Erling Smørgrav * }; 205*59c8e88eSDag-Erling Smørgrav * 206*59c8e88eSDag-Erling Smørgrav * const struct diff_algo_config patience = (struct diff_algo_config){ 207*59c8e88eSDag-Erling Smørgrav * .impl = diff_algo_patience, 208*59c8e88eSDag-Erling Smørgrav * // After subdivision, do Patience again: 209*59c8e88eSDag-Erling Smørgrav * .inner_algo = &patience, 210*59c8e88eSDag-Erling Smørgrav * // If subdivision failed, do Myers Divide et Impera: 211*59c8e88eSDag-Erling Smørgrav * .fallback_algo = &myers_then_myers_divide, 212*59c8e88eSDag-Erling Smørgrav * }; 213*59c8e88eSDag-Erling Smørgrav * 214*59c8e88eSDag-Erling Smørgrav * const struct diff_algo_config myers_divide = (struct diff_algo_config){ 215*59c8e88eSDag-Erling Smørgrav * .impl = diff_algo_myers_divide, 216*59c8e88eSDag-Erling Smørgrav * // When division succeeded, start from the top: 217*59c8e88eSDag-Erling Smørgrav * .inner_algo = &myers_then_myers_divide, 218*59c8e88eSDag-Erling Smørgrav * // (fallback_algo = NULL implies diff_algo_none). 219*59c8e88eSDag-Erling Smørgrav * }; 220*59c8e88eSDag-Erling Smørgrav * struct diff_config config = { 221*59c8e88eSDag-Erling Smørgrav * .algo = &myers, 222*59c8e88eSDag-Erling Smørgrav * ... 223*59c8e88eSDag-Erling Smørgrav * }; 224*59c8e88eSDag-Erling Smørgrav * diff_main(&config, ...); 225*59c8e88eSDag-Erling Smørgrav */ 226*59c8e88eSDag-Erling Smørgrav struct diff_algo_config { 227*59c8e88eSDag-Erling Smørgrav diff_algo_impl_t impl; 228*59c8e88eSDag-Erling Smørgrav 229*59c8e88eSDag-Erling Smørgrav /* Fail this algo if it would use more than this amount of memory, and 230*59c8e88eSDag-Erling Smørgrav * instead use fallback_algo (diff_algo_myers). permitted_state_size == 231*59c8e88eSDag-Erling Smørgrav * 0 means no limitation. */ 232*59c8e88eSDag-Erling Smørgrav size_t permitted_state_size; 233*59c8e88eSDag-Erling Smørgrav 234*59c8e88eSDag-Erling Smørgrav /* For algorithms that divide into smaller chunks, use this algorithm to 235*59c8e88eSDag-Erling Smørgrav * solve the divided chunks. */ 236*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *inner_algo; 237*59c8e88eSDag-Erling Smørgrav 238*59c8e88eSDag-Erling Smørgrav /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large 239*59c8e88eSDag-Erling Smørgrav * state, or diff_algo_patience can't find any common-unique atoms), 240*59c8e88eSDag-Erling Smørgrav * then use this algorithm instead. */ 241*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *fallback_algo; 242*59c8e88eSDag-Erling Smørgrav }; 243*59c8e88eSDag-Erling Smørgrav 244*59c8e88eSDag-Erling Smørgrav struct diff_config { 245*59c8e88eSDag-Erling Smørgrav diff_atomize_func_t atomize_func; 246*59c8e88eSDag-Erling Smørgrav void *atomize_func_data; 247*59c8e88eSDag-Erling Smørgrav 248*59c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo; 249*59c8e88eSDag-Erling Smørgrav 250*59c8e88eSDag-Erling Smørgrav /* How deep to step into subdivisions of a source file, a paranoia / 251*59c8e88eSDag-Erling Smørgrav * safety measure to guard against infinite loops through diff 252*59c8e88eSDag-Erling Smørgrav * algorithms. When the maximum recursion is reached, employ 253*59c8e88eSDag-Erling Smørgrav * diff_algo_none (i.e. remove all left atoms and add all right atoms). 254*59c8e88eSDag-Erling Smørgrav */ 255*59c8e88eSDag-Erling Smørgrav unsigned int max_recursion_depth; 256*59c8e88eSDag-Erling Smørgrav }; 257*59c8e88eSDag-Erling Smørgrav 258*59c8e88eSDag-Erling Smørgrav int diff_atomize_file(struct diff_data *d, const struct diff_config *config, 259*59c8e88eSDag-Erling Smørgrav FILE *f, const uint8_t *data, off_t len, int diff_flags); 260*59c8e88eSDag-Erling Smørgrav struct diff_result *diff_main(const struct diff_config *config, 261*59c8e88eSDag-Erling Smørgrav struct diff_data *left, 262*59c8e88eSDag-Erling Smørgrav struct diff_data *right); 263*59c8e88eSDag-Erling Smørgrav void diff_result_free(struct diff_result *result); 264*59c8e88eSDag-Erling Smørgrav int diff_result_contains_printable_chunks(struct diff_result *result); 265