159c8e88eSDag-Erling Smørgrav /* Generic infrastructure to implement various diff algorithms. */ 259c8e88eSDag-Erling Smørgrav /* 359c8e88eSDag-Erling Smørgrav * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 459c8e88eSDag-Erling Smørgrav * 559c8e88eSDag-Erling Smørgrav * Permission to use, copy, modify, and distribute this software for any 659c8e88eSDag-Erling Smørgrav * purpose with or without fee is hereby granted, provided that the above 759c8e88eSDag-Erling Smørgrav * copyright notice and this permission notice appear in all copies. 859c8e88eSDag-Erling Smørgrav * 959c8e88eSDag-Erling Smørgrav * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 1059c8e88eSDag-Erling Smørgrav * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 1159c8e88eSDag-Erling Smørgrav * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 1259c8e88eSDag-Erling Smørgrav * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1359c8e88eSDag-Erling Smørgrav * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 1459c8e88eSDag-Erling Smørgrav * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 1559c8e88eSDag-Erling Smørgrav * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 1659c8e88eSDag-Erling Smørgrav */ 1759c8e88eSDag-Erling Smørgrav 1859c8e88eSDag-Erling Smørgrav struct diff_range { 1959c8e88eSDag-Erling Smørgrav int start; 2059c8e88eSDag-Erling Smørgrav int end; 2159c8e88eSDag-Erling Smørgrav }; 2259c8e88eSDag-Erling Smørgrav 2359c8e88eSDag-Erling Smørgrav /* List of all possible return codes of a diff invocation. */ 2459c8e88eSDag-Erling Smørgrav #define DIFF_RC_USE_DIFF_ALGO_FALLBACK -1 2559c8e88eSDag-Erling Smørgrav #define DIFF_RC_OK 0 2659c8e88eSDag-Erling Smørgrav /* Any positive return values are errno values from sys/errno.h */ 2759c8e88eSDag-Erling Smørgrav 2859c8e88eSDag-Erling Smørgrav struct diff_atom { 2959c8e88eSDag-Erling Smørgrav struct diff_data *root; /* back pointer to root diff data */ 3059c8e88eSDag-Erling Smørgrav 3159c8e88eSDag-Erling Smørgrav off_t pos; /* set whether memory-mapped or not */ 3259c8e88eSDag-Erling Smørgrav const uint8_t *at; /* only set if memory-mapped */ 3359c8e88eSDag-Erling Smørgrav off_t len; 3459c8e88eSDag-Erling Smørgrav 3559c8e88eSDag-Erling Smørgrav /* This hash is just a very cheap speed up for finding *mismatching* 3659c8e88eSDag-Erling Smørgrav * atoms. When hashes match, we still need to compare entire atoms to 3759c8e88eSDag-Erling Smørgrav * find out whether they are indeed identical or not. 3859c8e88eSDag-Erling Smørgrav * Calculated over all atom bytes with diff_atom_hash_update(). */ 3959c8e88eSDag-Erling Smørgrav unsigned int hash; 4059c8e88eSDag-Erling Smørgrav }; 4159c8e88eSDag-Erling Smørgrav 4259c8e88eSDag-Erling Smørgrav /* Mix another atom_byte into the provided hash value and return the result. 4359c8e88eSDag-Erling Smørgrav * The hash value passed in for the first byte of the atom must be zero. */ 4459c8e88eSDag-Erling Smørgrav unsigned int 4559c8e88eSDag-Erling Smørgrav diff_atom_hash_update(unsigned int hash, unsigned char atom_byte); 4659c8e88eSDag-Erling Smørgrav 4759c8e88eSDag-Erling Smørgrav /* Compare two atoms for equality. Return 0 on success, or errno on failure. 4859c8e88eSDag-Erling Smørgrav * Set cmp to -1, 0, or 1, just like strcmp(). */ 4959c8e88eSDag-Erling Smørgrav int 5059c8e88eSDag-Erling Smørgrav diff_atom_cmp(int *cmp, 5159c8e88eSDag-Erling Smørgrav const struct diff_atom *left, 5259c8e88eSDag-Erling Smørgrav const struct diff_atom *right); 5359c8e88eSDag-Erling Smørgrav 5459c8e88eSDag-Erling Smørgrav 5559c8e88eSDag-Erling Smørgrav /* The atom's index in the entire file. For atoms divided by lines of text, this 5659c8e88eSDag-Erling Smørgrav * yields the line number (starting with 0). Also works for diff_data that 5759c8e88eSDag-Erling Smørgrav * reference only a subsection of a file, always reflecting the global position 5859c8e88eSDag-Erling Smørgrav * in the file (and not the relative position within the subsection). */ 5959c8e88eSDag-Erling Smørgrav #define diff_atom_root_idx(DIFF_DATA, ATOM) \ 6059c8e88eSDag-Erling Smørgrav ((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \ 6159c8e88eSDag-Erling Smørgrav ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \ 6259c8e88eSDag-Erling Smørgrav : (DIFF_DATA)->root->atoms.len) 6359c8e88eSDag-Erling Smørgrav 6459c8e88eSDag-Erling Smørgrav /* The atom's index within DIFF_DATA. For atoms divided by lines of text, this 6559c8e88eSDag-Erling Smørgrav * yields the line number (starting with 0). */ 6659c8e88eSDag-Erling Smørgrav #define diff_atom_idx(DIFF_DATA, ATOM) \ 6759c8e88eSDag-Erling Smørgrav ((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 6859c8e88eSDag-Erling Smørgrav ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \ 6959c8e88eSDag-Erling Smørgrav : (DIFF_DATA)->atoms.len) 7059c8e88eSDag-Erling Smørgrav 7159c8e88eSDag-Erling Smørgrav #define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ 7259c8e88eSDag-Erling Smørgrav for ((ATOM) = (FIRST_ATOM); \ 7359c8e88eSDag-Erling Smørgrav (ATOM) \ 7459c8e88eSDag-Erling Smørgrav && ((ATOM) >= (FIRST_ATOM)) \ 7559c8e88eSDag-Erling Smørgrav && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ 7659c8e88eSDag-Erling Smørgrav (ATOM)++) 7759c8e88eSDag-Erling Smørgrav 7859c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom(ATOM, DIFF_DATA) \ 7959c8e88eSDag-Erling Smørgrav foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) 8059c8e88eSDag-Erling Smørgrav 8159c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ 8259c8e88eSDag-Erling Smørgrav for ((ATOM) = (FROM); \ 8359c8e88eSDag-Erling Smørgrav (ATOM) \ 8459c8e88eSDag-Erling Smørgrav && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 8559c8e88eSDag-Erling Smørgrav && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ 8659c8e88eSDag-Erling Smørgrav (ATOM)++) 8759c8e88eSDag-Erling Smørgrav 8859c8e88eSDag-Erling Smørgrav #define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \ 8959c8e88eSDag-Erling Smørgrav for ((ATOM) = (FROM); \ 9059c8e88eSDag-Erling Smørgrav (ATOM) \ 9159c8e88eSDag-Erling Smørgrav && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 9259c8e88eSDag-Erling Smørgrav && ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \ 9359c8e88eSDag-Erling Smørgrav (ATOM)--) 9459c8e88eSDag-Erling Smørgrav 9559c8e88eSDag-Erling Smørgrav /* For each file, there is a "root" struct diff_data referencing the entire 9659c8e88eSDag-Erling Smørgrav * file, which the atoms are parsed from. In recursion of diff algorithm, there 9759c8e88eSDag-Erling Smørgrav * may be "child" struct diff_data only referencing a subsection of the file, 9859c8e88eSDag-Erling Smørgrav * re-using the atoms parsing. For "root" structs, atoms_allocated will be 9959c8e88eSDag-Erling Smørgrav * nonzero, indicating that the array of atoms is owned by that struct. For 10059c8e88eSDag-Erling Smørgrav * "child" structs, atoms_allocated == 0, to indicate that the struct is 10159c8e88eSDag-Erling Smørgrav * referencing a subset of atoms. */ 10259c8e88eSDag-Erling Smørgrav struct diff_data { 10359c8e88eSDag-Erling Smørgrav FILE *f; /* if root diff_data and not memory-mapped */ 10459c8e88eSDag-Erling Smørgrav off_t pos; /* if not memory-mapped */ 10559c8e88eSDag-Erling Smørgrav const uint8_t *data; /* if memory-mapped */ 10659c8e88eSDag-Erling Smørgrav off_t len; 10759c8e88eSDag-Erling Smørgrav 10859c8e88eSDag-Erling Smørgrav int atomizer_flags; 10959c8e88eSDag-Erling Smørgrav ARRAYLIST(struct diff_atom) atoms; 11059c8e88eSDag-Erling Smørgrav struct diff_data *root; 11159c8e88eSDag-Erling Smørgrav struct diff_data *current; 11259c8e88eSDag-Erling Smørgrav void *algo_data; 11359c8e88eSDag-Erling Smørgrav 11459c8e88eSDag-Erling Smørgrav int diff_flags; 11559c8e88eSDag-Erling Smørgrav 11659c8e88eSDag-Erling Smørgrav int err; 11759c8e88eSDag-Erling Smørgrav }; 11859c8e88eSDag-Erling Smørgrav 11959c8e88eSDag-Erling Smørgrav /* Flags set by file atomizer. */ 12059c8e88eSDag-Erling Smørgrav #define DIFF_ATOMIZER_FOUND_BINARY_DATA 0x00000001 121*974ea6b2SDag-Erling Smørgrav #define DIFF_ATOMIZER_FILE_TRUNCATED 0x00000002 12259c8e88eSDag-Erling Smørgrav 12359c8e88eSDag-Erling Smørgrav /* Flags set by caller of diff_main(). */ 12459c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001 12559c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_SHOW_PROTOTYPES 0x00000002 12659c8e88eSDag-Erling Smørgrav #define DIFF_FLAG_FORCE_TEXT_DATA 0x00000004 12759c8e88eSDag-Erling Smørgrav 12859c8e88eSDag-Erling Smørgrav void diff_data_free(struct diff_data *diff_data); 12959c8e88eSDag-Erling Smørgrav 13059c8e88eSDag-Erling Smørgrav struct diff_chunk; 13159c8e88eSDag-Erling Smørgrav typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; 13259c8e88eSDag-Erling Smørgrav 13359c8e88eSDag-Erling Smørgrav struct diff_result { 13459c8e88eSDag-Erling Smørgrav int rc; 13559c8e88eSDag-Erling Smørgrav 13659c8e88eSDag-Erling Smørgrav /* 13759c8e88eSDag-Erling Smørgrav * Pointers to diff data passed in via diff_main. 13859c8e88eSDag-Erling Smørgrav * Do not free these diff_data before freeing the diff_result struct. 13959c8e88eSDag-Erling Smørgrav */ 14059c8e88eSDag-Erling Smørgrav struct diff_data *left; 14159c8e88eSDag-Erling Smørgrav struct diff_data *right; 14259c8e88eSDag-Erling Smørgrav 14359c8e88eSDag-Erling Smørgrav diff_chunk_arraylist_t chunks; 14459c8e88eSDag-Erling Smørgrav }; 14559c8e88eSDag-Erling Smørgrav 14659c8e88eSDag-Erling Smørgrav enum diff_chunk_type { 14759c8e88eSDag-Erling Smørgrav CHUNK_EMPTY, 14859c8e88eSDag-Erling Smørgrav CHUNK_PLUS, 14959c8e88eSDag-Erling Smørgrav CHUNK_MINUS, 15059c8e88eSDag-Erling Smørgrav CHUNK_SAME, 15159c8e88eSDag-Erling Smørgrav CHUNK_ERROR, 15259c8e88eSDag-Erling Smørgrav }; 15359c8e88eSDag-Erling Smørgrav 15459c8e88eSDag-Erling Smørgrav enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c); 15559c8e88eSDag-Erling Smørgrav 15659c8e88eSDag-Erling Smørgrav struct diff_state; 15759c8e88eSDag-Erling Smørgrav 15859c8e88eSDag-Erling Smørgrav /* Signature of a utility function to divide a file into diff atoms. 15959c8e88eSDag-Erling Smørgrav * An example is diff_atomize_text_by_line() in diff_atomize_text.c. 16059c8e88eSDag-Erling Smørgrav * 16159c8e88eSDag-Erling Smørgrav * func_data: context pointer (free to be used by implementation). 16259c8e88eSDag-Erling Smørgrav * d: struct diff_data with d->data and d->len already set up, and 16359c8e88eSDag-Erling Smørgrav * d->atoms to be created and d->atomizer_flags to be set up. 16459c8e88eSDag-Erling Smørgrav */ 16559c8e88eSDag-Erling Smørgrav typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d); 16659c8e88eSDag-Erling Smørgrav 16759c8e88eSDag-Erling Smørgrav extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d); 16859c8e88eSDag-Erling Smørgrav 16959c8e88eSDag-Erling Smørgrav struct diff_algo_config; 17059c8e88eSDag-Erling Smørgrav typedef int (*diff_algo_impl_t)( 17159c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 17259c8e88eSDag-Erling Smørgrav 17359c8e88eSDag-Erling Smørgrav /* Form a result with all left-side removed and all right-side added, i.e. no 17459c8e88eSDag-Erling Smørgrav * actual diff algorithm involved. */ 17559c8e88eSDag-Erling Smørgrav int diff_algo_none(const struct diff_algo_config *algo_config, 17659c8e88eSDag-Erling Smørgrav struct diff_state *state); 17759c8e88eSDag-Erling Smørgrav 17859c8e88eSDag-Erling Smørgrav /* Myers Diff tracing from the start all the way through to the end, requiring 17959c8e88eSDag-Erling Smørgrav * quadratic amounts of memory. This can fail if the required space surpasses 18059c8e88eSDag-Erling Smørgrav * algo_config->permitted_state_size. */ 18159c8e88eSDag-Erling Smørgrav extern int diff_algo_myers(const struct diff_algo_config *algo_config, 18259c8e88eSDag-Erling Smørgrav struct diff_state *state); 18359c8e88eSDag-Erling Smørgrav 18459c8e88eSDag-Erling Smørgrav /* Myers "Divide et Impera": tracing forwards from the start and backwards from 18559c8e88eSDag-Erling Smørgrav * the end to find a midpoint that divides the problem into smaller chunks. 18659c8e88eSDag-Erling Smørgrav * Requires only linear amounts of memory. */ 18759c8e88eSDag-Erling Smørgrav extern int diff_algo_myers_divide( 18859c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 18959c8e88eSDag-Erling Smørgrav 19059c8e88eSDag-Erling Smørgrav /* Patience Diff algorithm, which divides a larger diff into smaller chunks. For 19159c8e88eSDag-Erling Smørgrav * very specific scenarios, it may lead to a complete diff result by itself, but 19259c8e88eSDag-Erling Smørgrav * needs a fallback algo to solve chunks that don't have common-unique atoms. */ 19359c8e88eSDag-Erling Smørgrav extern int diff_algo_patience( 19459c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo_config, struct diff_state *state); 19559c8e88eSDag-Erling Smørgrav 19659c8e88eSDag-Erling Smørgrav /* Diff algorithms to use, possibly nested. For example: 19759c8e88eSDag-Erling Smørgrav * 19859c8e88eSDag-Erling Smørgrav * struct diff_algo_config myers, patience, myers_divide; 19959c8e88eSDag-Erling Smørgrav * 20059c8e88eSDag-Erling Smørgrav * myers = (struct diff_algo_config){ 20159c8e88eSDag-Erling Smørgrav * .impl = diff_algo_myers, 20259c8e88eSDag-Erling Smørgrav * .permitted_state_size = 32 * 1024 * 1024, 20359c8e88eSDag-Erling Smørgrav * // When too large, do diff_algo_patience: 20459c8e88eSDag-Erling Smørgrav * .fallback_algo = &patience, 20559c8e88eSDag-Erling Smørgrav * }; 20659c8e88eSDag-Erling Smørgrav * 20759c8e88eSDag-Erling Smørgrav * const struct diff_algo_config patience = (struct diff_algo_config){ 20859c8e88eSDag-Erling Smørgrav * .impl = diff_algo_patience, 20959c8e88eSDag-Erling Smørgrav * // After subdivision, do Patience again: 21059c8e88eSDag-Erling Smørgrav * .inner_algo = &patience, 21159c8e88eSDag-Erling Smørgrav * // If subdivision failed, do Myers Divide et Impera: 21259c8e88eSDag-Erling Smørgrav * .fallback_algo = &myers_then_myers_divide, 21359c8e88eSDag-Erling Smørgrav * }; 21459c8e88eSDag-Erling Smørgrav * 21559c8e88eSDag-Erling Smørgrav * const struct diff_algo_config myers_divide = (struct diff_algo_config){ 21659c8e88eSDag-Erling Smørgrav * .impl = diff_algo_myers_divide, 21759c8e88eSDag-Erling Smørgrav * // When division succeeded, start from the top: 21859c8e88eSDag-Erling Smørgrav * .inner_algo = &myers_then_myers_divide, 21959c8e88eSDag-Erling Smørgrav * // (fallback_algo = NULL implies diff_algo_none). 22059c8e88eSDag-Erling Smørgrav * }; 22159c8e88eSDag-Erling Smørgrav * struct diff_config config = { 22259c8e88eSDag-Erling Smørgrav * .algo = &myers, 22359c8e88eSDag-Erling Smørgrav * ... 22459c8e88eSDag-Erling Smørgrav * }; 22559c8e88eSDag-Erling Smørgrav * diff_main(&config, ...); 22659c8e88eSDag-Erling Smørgrav */ 22759c8e88eSDag-Erling Smørgrav struct diff_algo_config { 22859c8e88eSDag-Erling Smørgrav diff_algo_impl_t impl; 22959c8e88eSDag-Erling Smørgrav 23059c8e88eSDag-Erling Smørgrav /* Fail this algo if it would use more than this amount of memory, and 23159c8e88eSDag-Erling Smørgrav * instead use fallback_algo (diff_algo_myers). permitted_state_size == 23259c8e88eSDag-Erling Smørgrav * 0 means no limitation. */ 23359c8e88eSDag-Erling Smørgrav size_t permitted_state_size; 23459c8e88eSDag-Erling Smørgrav 23559c8e88eSDag-Erling Smørgrav /* For algorithms that divide into smaller chunks, use this algorithm to 23659c8e88eSDag-Erling Smørgrav * solve the divided chunks. */ 23759c8e88eSDag-Erling Smørgrav const struct diff_algo_config *inner_algo; 23859c8e88eSDag-Erling Smørgrav 23959c8e88eSDag-Erling Smørgrav /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large 24059c8e88eSDag-Erling Smørgrav * state, or diff_algo_patience can't find any common-unique atoms), 24159c8e88eSDag-Erling Smørgrav * then use this algorithm instead. */ 24259c8e88eSDag-Erling Smørgrav const struct diff_algo_config *fallback_algo; 24359c8e88eSDag-Erling Smørgrav }; 24459c8e88eSDag-Erling Smørgrav 24559c8e88eSDag-Erling Smørgrav struct diff_config { 24659c8e88eSDag-Erling Smørgrav diff_atomize_func_t atomize_func; 24759c8e88eSDag-Erling Smørgrav void *atomize_func_data; 24859c8e88eSDag-Erling Smørgrav 24959c8e88eSDag-Erling Smørgrav const struct diff_algo_config *algo; 25059c8e88eSDag-Erling Smørgrav 25159c8e88eSDag-Erling Smørgrav /* How deep to step into subdivisions of a source file, a paranoia / 25259c8e88eSDag-Erling Smørgrav * safety measure to guard against infinite loops through diff 25359c8e88eSDag-Erling Smørgrav * algorithms. When the maximum recursion is reached, employ 25459c8e88eSDag-Erling Smørgrav * diff_algo_none (i.e. remove all left atoms and add all right atoms). 25559c8e88eSDag-Erling Smørgrav */ 25659c8e88eSDag-Erling Smørgrav unsigned int max_recursion_depth; 25759c8e88eSDag-Erling Smørgrav }; 25859c8e88eSDag-Erling Smørgrav 25959c8e88eSDag-Erling Smørgrav int diff_atomize_file(struct diff_data *d, const struct diff_config *config, 26059c8e88eSDag-Erling Smørgrav FILE *f, const uint8_t *data, off_t len, int diff_flags); 26159c8e88eSDag-Erling Smørgrav struct diff_result *diff_main(const struct diff_config *config, 26259c8e88eSDag-Erling Smørgrav struct diff_data *left, 26359c8e88eSDag-Erling Smørgrav struct diff_data *right); 26459c8e88eSDag-Erling Smørgrav void diff_result_free(struct diff_result *result); 26559c8e88eSDag-Erling Smørgrav int diff_result_contains_printable_chunks(struct diff_result *result); 266