Lines Matching +full:ouput +full:- +full:only
9 * or https://opensource.org/licenses/CDDL-1.0.
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
25 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
36 * - we define this pragma to silence gcc's -Wframe-larger-than= warning
39 #pragma GCC diagnostic ignored "-Wframe-larger-than="
66 memcpy(ctx->cv, key, BLAKE3_KEY_LEN); in chunk_state_init()
67 ctx->chunk_counter = 0; in chunk_state_init()
68 memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); in chunk_state_init()
69 ctx->buf_len = 0; in chunk_state_init()
70 ctx->blocks_compressed = 0; in chunk_state_init()
71 ctx->flags = flags; in chunk_state_init()
77 memcpy(ctx->cv, key, BLAKE3_KEY_LEN); in chunk_state_reset()
78 ctx->chunk_counter = chunk_counter; in chunk_state_reset()
79 ctx->blocks_compressed = 0; in chunk_state_reset()
80 memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); in chunk_state_reset()
81 ctx->buf_len = 0; in chunk_state_reset()
86 return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + in chunk_state_len()
87 ((size_t)ctx->buf_len); in chunk_state_len()
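For scale: with two 64-byte blocks already compressed and 10 bytes buffered, chunk_state_len() reports 2 * 64 + 10 = 138 bytes of chunk input consumed so far.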
93 size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); in chunk_state_fill_buf()
97 uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); in chunk_state_fill_buf()
99 ctx->buf_len += (uint8_t)take; in chunk_state_fill_buf()
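The clamp of take against the caller's input_len sits on lines the query did not match; read in full, the helper presumably amounts to this sketch (struct tag assumed to be blake3_chunk_state_t as in the OpenZFS headers):

	static size_t
	chunk_state_fill_buf_sketch(blake3_chunk_state_t *ctx,
	    const uint8_t *input, size_t input_len)
	{
		size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len);

		/* never copy more bytes than the caller supplied */
		if (take > input_len)
			take = input_len;
		memcpy(ctx->buf + ((size_t)ctx->buf_len), input, take);
		ctx->buf_len += (uint8_t)take;
		return (take);
	}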
105 if (ctx->blocks_compressed == 0) { in chunk_state_maybe_start_flag()
127 * interface) are represented as words. This avoids unnecessary bytes<->words
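The output_t under discussion pairs an input chaining value with the final block it will compress. As a reference point, the struct in the upstream BLAKE3 C code that this file is based on looks like the following; treat the exact field list as an assumption rather than a quote of this file:

	typedef struct {
		uint32_t input_cv[8];	/* chaining value, kept as words */
		uint64_t counter;	/* chunk counter; 0 for parent nodes */
		uint8_t block[BLAKE3_BLOCK_LEN]; /* final 64-byte block */
		uint8_t block_len;
		uint8_t flags;		/* CHUNK_START/END, PARENT, ROOT */
	} output_t;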
137 memcpy(cv_words, ctx->input_cv, 32); in output_chaining_value()
138 ops->compress_in_place(cv_words, ctx->block, ctx->block_len, in output_chaining_value()
139 ctx->counter, ctx->flags); in output_chaining_value()
150 ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, in output_root_bytes()
151 output_block_counter, ctx->flags | ROOT, wide_buf); in output_root_bytes()
152 size_t available_bytes = 64 - offset_within_block; in output_root_bytes()
161 out_len -= memcpy_len; in output_root_bytes()
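output_block_counter and offset_within_block are presumably derived from the byte-level seek position just above these lines, since XOF output is generated one 64-byte compression at a time; a minimal sketch of that mapping:

	/* sketch: translate a byte offset in the XOF stream to a block */
	static void
	seek_to_block(uint64_t seek, uint64_t *output_block_counter,
	    size_t *offset_within_block)
	{
		*output_block_counter = seek / 64;
		*offset_within_block = (size_t)(seek % 64);
	}

After the first partial block, the loop advances the counter, resets the offset to zero, and keeps copying up to 64 bytes per compression until out_len is exhausted.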
170 if (ctx->buf_len > 0) { in chunk_state_update()
173 input_len -= take; in chunk_state_update()
175 ops->compress_in_place(ctx->cv, ctx->buf, in chunk_state_update()
176 BLAKE3_BLOCK_LEN, ctx->chunk_counter, in chunk_state_update()
177 ctx->flags | chunk_state_maybe_start_flag(ctx)); in chunk_state_update()
178 ctx->blocks_compressed += 1; in chunk_state_update()
179 ctx->buf_len = 0; in chunk_state_update()
180 memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); in chunk_state_update()
185 ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, in chunk_state_update()
186 ctx->chunk_counter, in chunk_state_update()
187 ctx->flags | chunk_state_maybe_start_flag(ctx)); in chunk_state_update()
188 ctx->blocks_compressed += 1; in chunk_state_update()
190 input_len -= BLAKE3_BLOCK_LEN; in chunk_state_update()
199 ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; in chunk_state_output()
200 return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, in chunk_state_output()
212 * should go in the left subtree. This is the largest power-of-2 number of
222 size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; in left_len()
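full_chunks is then rounded down to a power of two and scaled back to bytes; the upstream code does this via a helper named round_down_to_power_of_2. One straightforward way to implement that rounding, offered here as a sketch rather than the file's actual code:

	/* largest power of 2 less than or equal to x, for x >= 1 */
	static size_t
	round_down_to_power_of_2(size_t x)
	{
		size_t p = 1;

		while ((p << 1) != 0 && (p << 1) <= x)
			p <<= 1;
		return (p);
	}

For an input of 5 chunks plus 1 byte, full_chunks is 5, so the left subtree takes 4 chunks (4096 bytes) and the remaining chunk plus the odd byte go to the right.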
239 while (input_len - input_position >= BLAKE3_CHUNK_LEN) { in compress_chunks_parallel()
245 ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / in compress_chunks_parallel()
259 input_len - input_position); in compress_chunks_parallel()
283 while (num_chaining_values - (2 * parents_array_len) >= 2) { in compress_parents_parallel()
289 ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, in compress_parents_parallel()
314 * wouldn't be able to implement extendable output.) Note that this function is
315 * not used when the whole input is only 1 chunk long; that's a different
320 * multi-threading parallelism for that update().
328 * to 2 when it is 1. If this implementation adds multi-threading in in blake3_compress_subtree_wide()
329 * the future, this gives us the option of multi-threading even the in blake3_compress_subtree_wide()
330 * 2-chunk case, which can help performance on smaller platforms. in blake3_compress_subtree_wide()
332 if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { in blake3_compress_subtree_wide()
341 * only optimal as long as the SIMD degree is a power of 2. If we ever in blake3_compress_subtree_wide()
346 size_t right_input_len = input_len - left_input_len; in blake3_compress_subtree_wide()
357 size_t degree = ops->degree; in blake3_compress_subtree_wide()
364 * 1-chunk-input case is a different codepath.) in blake3_compress_subtree_wide()
371 * Recurse! If this implementation adds multi-threading support in the in blake3_compress_subtree_wide()
433 memcpy(ctx->key, key, BLAKE3_KEY_LEN); in hasher_init_base()
434 chunk_state_init(&ctx->chunk, key, flags); in hasher_init_base()
435 ctx->cv_stack_len = 0; in hasher_init_base()
436 ctx->ops = blake3_get_ops(); in hasher_init_base()
444 * any power-of-two number of chunks, as long as the smaller-above-larger
445 * stack order is maintained. Instead of the "count the trailing 0-bits"
447 * 1-bits" variant that doesn't require us to retain the subtree size of the
449 * remain in the stack is represented by a 1-bit in the total number of chunks
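Concretely, after some total number of chunks has been hashed, the stack should retain exactly one CV per 1-bit of that count, so the merge target is a population count; a minimal sketch, with a hypothetical helper name and a GCC builtin standing in for whatever popcount primitive the kernel build uses:

	/* hypothetical: target stack depth = popcount(total chunks) */
	static size_t
	cv_stack_target_len(uint64_t total_chunks)
	{
		return ((size_t)__builtin_popcountll(total_chunks));
	}

After 11 chunks (binary 1011), for instance, the stack holds CVs for completed subtrees of 8, 2, and 1 chunks: three entries, matching popcount(11).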
455 while (ctx->cv_stack_len > post_merge_stack_len) { in hasher_merge_cv_stack()
457 &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; in hasher_merge_cv_stack()
459 parent_output(parent_node, ctx->key, ctx->chunk.flags); in hasher_merge_cv_stack()
460 output_chaining_value(ctx->ops, &output, parent_node); in hasher_merge_cv_stack()
461 ctx->cv_stack_len -= 1; in hasher_merge_cv_stack()
476 * 1) This 64 KiB input might be the only call that ever gets made to update.
497 * hashing an input all-at-once.)
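Concretely: if one complete chunk turns out to be the entire input, its output must be finalized with the ROOT flag, so its CV cannot be eagerly compressed into a parent node; the merge has to wait until further input proves the node is non-root.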
503 memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, in hasher_push_cv()
505 ctx->cv_stack_len += 1; in hasher_push_cv()
541 if (chunk_state_len(&ctx->chunk) > 0) { in Blake3_Update2()
542 size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); in Blake3_Update2()
546 chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); in Blake3_Update2()
548 input_len -= take; in Blake3_Update2()
555 output_t output = chunk_state_output(&ctx->chunk); in Blake3_Update2()
557 output_chaining_value(ctx->ops, &output, chunk_cv); in Blake3_Update2()
558 hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); in Blake3_Update2()
559 chunk_state_reset(&ctx->chunk, ctx->key, in Blake3_Update2()
560 ctx->chunk.chunk_counter + 1); in Blake3_Update2()
570 * (and maybe in the future, multi-threading) parallelism. Two in Blake3_Update2()
572 * - The subtree has to be a power-of-2 number of chunks. Only in Blake3_Update2()
575 * - The subtree must evenly divide the total number of chunks up in Blake3_Update2()
577 * subtree is only waiting for 1 more chunk, we can't hash a subtree in Blake3_Update2()
585 ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; in Blake3_Update2()
591 * power-of-2 inputs of the same size, as is hopefully in Blake3_Update2()
599 * and we might get to use 2-way SIMD parallelism. The problem in Blake3_Update2()
604 * https://github.com/BLAKE3-team/BLAKE3/issues/69. in Blake3_Update2()
606 while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { in Blake3_Update2()
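Worked example: with count_so_far at 3 chunks (3072 bytes) and a 4-chunk (4096-byte) subtree proposed, (4096 - 1) & 3072 is nonzero, so the subtree halves to 2048; (2048 - 1) & 3072 is still nonzero, halving to 1024; (1024 - 1) & 3072 is zero, so a single chunk is hashed, which completes a 4-chunk boundary for the next iteration.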
617 chunk_state_init(&chunk_state, ctx->key, in Blake3_Update2()
618 ctx->chunk.flags); in Blake3_Update2()
619 chunk_state.chunk_counter = ctx->chunk.chunk_counter; in Blake3_Update2()
620 chunk_state_update(ctx->ops, &chunk_state, input_bytes, in Blake3_Update2()
624 output_chaining_value(ctx->ops, &output, cv); in Blake3_Update2()
628 * This is the high-performance happy path, though in Blake3_Update2()
633 compress_subtree_to_parent_node(ctx->ops, input_bytes, in Blake3_Update2()
634 subtree_len, ctx->key, ctx->chunk.chunk_counter, in Blake3_Update2()
635 ctx->chunk.flags, cv_pair); in Blake3_Update2()
636 hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); in Blake3_Update2()
638 ctx->chunk.chunk_counter + (subtree_chunks / 2)); in Blake3_Update2()
640 ctx->chunk.chunk_counter += subtree_chunks; in Blake3_Update2()
642 input_len -= subtree_len; in Blake3_Update2()
649 * remaining input means we know these merges are non-root. This merge in Blake3_Update2()
655 chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, in Blake3_Update2()
657 hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); in Blake3_Update2()
673 todo -= block; in Blake3_Update()
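The public Blake3_Update() is a thin wrapper that feeds Blake3_Update2() in bounded slices, keeping the stack usage of the SIMD paths in check; a sketch of that loop, assuming the 64 KiB slice size suggested by the comment above (exact signature of Blake3_Update2() assumed):

	void
	Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo)
	{
		const size_t block_max = 64 * 1024;	/* assumed slice size */
		const uint8_t *data = input;
		size_t done = 0;

		while (todo != 0) {
			size_t block = (todo > block_max) ? block_max : todo;

			Blake3_Update2(ctx, data + done, block);
			done += block;
			todo -= block;
		}
	}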
697 if (ctx->cv_stack_len == 0) { in Blake3_FinalSeek()
698 output_t output = chunk_state_output(&ctx->chunk); in Blake3_FinalSeek()
699 output_root_bytes(ctx->ops, &output, seek, out, out_len); in Blake3_FinalSeek()
704 * do a roll-up merge between that chunk hash and every subtree in the in Blake3_FinalSeek()
713 if (chunk_state_len(&ctx->chunk) > 0) { in Blake3_FinalSeek()
714 cvs_remaining = ctx->cv_stack_len; in Blake3_FinalSeek()
715 output = chunk_state_output(&ctx->chunk); in Blake3_FinalSeek()
718 cvs_remaining = ctx->cv_stack_len - 2; in Blake3_FinalSeek()
719 output = parent_output(&ctx->cv_stack[cvs_remaining * 32], in Blake3_FinalSeek()
720 ctx->key, ctx->chunk.flags); in Blake3_FinalSeek()
723 cvs_remaining -= 1; in Blake3_FinalSeek()
725 memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); in Blake3_FinalSeek()
726 output_chaining_value(ctx->ops, &output, &parent_block[32]); in Blake3_FinalSeek()
727 output = parent_output(parent_block, ctx->key, in Blake3_FinalSeek()
728 ctx->chunk.flags); in Blake3_FinalSeek()
730 output_root_bytes(ctx->ops, &output, seek, out, out_len); in Blake3_FinalSeek()
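For orientation, a caller-side sketch of this API as consumers would use it via sys/blake3.h; the prototypes are inferred from the function names above and should be treated as assumptions:

	#include <sys/blake3.h>

	static void
	example_digest(const void *data, size_t len,
	    uint8_t out[BLAKE3_OUT_LEN])
	{
		BLAKE3_CTX ctx;

		Blake3_Init(&ctx);
		Blake3_Update(&ctx, data, len);
		/* seek == 0 starts the output stream at the beginning */
		Blake3_FinalSeek(&ctx, 0, out, BLAKE3_OUT_LEN);
	}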