xref: /dflybsd-src/contrib/xz/src/liblzma/common/memcmplen.h (revision b5feb3da7c498482b19d14ac6f2b1901005f7d94)
115ab8c86SJohn Marino ///////////////////////////////////////////////////////////////////////////////
215ab8c86SJohn Marino //
315ab8c86SJohn Marino /// \file       memcmplen.h
415ab8c86SJohn Marino /// \brief      Optimized comparison of two buffers
515ab8c86SJohn Marino //
615ab8c86SJohn Marino //  Author:     Lasse Collin
715ab8c86SJohn Marino //
815ab8c86SJohn Marino //  This file has been put into the public domain.
915ab8c86SJohn Marino //  You can do whatever you want with this file.
1015ab8c86SJohn Marino //
1115ab8c86SJohn Marino ///////////////////////////////////////////////////////////////////////////////
1215ab8c86SJohn Marino 
1315ab8c86SJohn Marino #ifndef LZMA_MEMCMPLEN_H
1415ab8c86SJohn Marino #define LZMA_MEMCMPLEN_H
1515ab8c86SJohn Marino 
1615ab8c86SJohn Marino #include "common.h"
1715ab8c86SJohn Marino 
1815ab8c86SJohn Marino #ifdef HAVE_IMMINTRIN_H
1915ab8c86SJohn Marino #	include <immintrin.h>
2015ab8c86SJohn Marino #endif
2115ab8c86SJohn Marino 
2215ab8c86SJohn Marino 
2315ab8c86SJohn Marino /// Find out how many equal bytes the two buffers have.
2415ab8c86SJohn Marino ///
2515ab8c86SJohn Marino /// \param      buf1    First buffer
2615ab8c86SJohn Marino /// \param      buf2    Second buffer
2715ab8c86SJohn Marino /// \param      len     How many bytes have already been compared and will
2815ab8c86SJohn Marino ///                     be assumed to match
2915ab8c86SJohn Marino /// \param      limit   How many bytes to compare at most, including the
3015ab8c86SJohn Marino ///                     already-compared bytes. This must be significantly
3115ab8c86SJohn Marino ///                     smaller than UINT32_MAX to avoid integer overflows.
3215ab8c86SJohn Marino ///                     Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
3315ab8c86SJohn Marino ///                     the specified limit from both buf1 and buf2.
3415ab8c86SJohn Marino ///
3515ab8c86SJohn Marino /// \return     Number of equal bytes in the buffers is returned.
3615ab8c86SJohn Marino ///             This is always at least len and at most limit.
3715ab8c86SJohn Marino ///
3815ab8c86SJohn Marino /// \note       LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
3915ab8c86SJohn Marino ///             It's rounded up to 2^n. This extra amount needs to be
4015ab8c86SJohn Marino ///             allocated in the buffers being used. It needs to be
4115ab8c86SJohn Marino ///             initialized too to keep Valgrind quiet.
uint32_t(__always_inline__)4215ab8c86SJohn Marino static inline uint32_t lzma_attribute((__always_inline__))
4315ab8c86SJohn Marino lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
4415ab8c86SJohn Marino 		uint32_t len, uint32_t limit)
4515ab8c86SJohn Marino {
4615ab8c86SJohn Marino 	assert(len <= limit);
4715ab8c86SJohn Marino 	assert(limit <= UINT32_MAX / 2);
4815ab8c86SJohn Marino 
4915ab8c86SJohn Marino #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
5015ab8c86SJohn Marino 		&& ((TUKLIB_GNUC_REQ(3, 4) && defined(__x86_64__)) \
5115ab8c86SJohn Marino 			|| (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
5215ab8c86SJohn Marino 			|| (defined(__INTEL_COMPILER) && defined(_M_X64)) \
5315ab8c86SJohn Marino 			|| (defined(_MSC_VER) && defined(_M_X64)))
5415ab8c86SJohn Marino 	// NOTE: This will use 64-bit unaligned access which
5515ab8c86SJohn Marino 	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit, but
5615ab8c86SJohn Marino 	// it's convenient here at least as long as it's x86-64 only.
5715ab8c86SJohn Marino 	//
5815ab8c86SJohn Marino 	// I keep this x86-64 only for now since that's where I know this
5915ab8c86SJohn Marino 	// to be a good method. This may be fine on other 64-bit CPUs too.
6015ab8c86SJohn Marino 	// On big endian one should use xor instead of subtraction and switch
6115ab8c86SJohn Marino 	// to __builtin_clzll().
6215ab8c86SJohn Marino #define LZMA_MEMCMPLEN_EXTRA 8
6315ab8c86SJohn Marino 	while (len < limit) {
64*e151908bSDaniel Fojt 		const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
6515ab8c86SJohn Marino 		if (x != 0) {
6615ab8c86SJohn Marino #	if defined(_M_X64) // MSVC or Intel C compiler on Windows
6715ab8c86SJohn Marino 			unsigned long tmp;
6815ab8c86SJohn Marino 			_BitScanForward64(&tmp, x);
6915ab8c86SJohn Marino 			len += (uint32_t)tmp >> 3;
7015ab8c86SJohn Marino #	else // GCC, clang, or Intel C compiler
7115ab8c86SJohn Marino 			len += (uint32_t)__builtin_ctzll(x) >> 3;
7215ab8c86SJohn Marino #	endif
7315ab8c86SJohn Marino 			return my_min(len, limit);
7415ab8c86SJohn Marino 		}
7515ab8c86SJohn Marino 
7615ab8c86SJohn Marino 		len += 8;
7715ab8c86SJohn Marino 	}
7815ab8c86SJohn Marino 
7915ab8c86SJohn Marino 	return limit;
8015ab8c86SJohn Marino 
8115ab8c86SJohn Marino #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
8215ab8c86SJohn Marino 		&& defined(HAVE__MM_MOVEMASK_EPI8) \
8315ab8c86SJohn Marino 		&& ((defined(__GNUC__) && defined(__SSE2_MATH__)) \
8415ab8c86SJohn Marino 			|| (defined(__INTEL_COMPILER) && defined(__SSE2__)) \
8515ab8c86SJohn Marino 			|| (defined(_MSC_VER) && defined(_M_IX86_FP) \
8615ab8c86SJohn Marino 				&& _M_IX86_FP >= 2))
8715ab8c86SJohn Marino 	// NOTE: Like above, this will use 128-bit unaligned access which
8815ab8c86SJohn Marino 	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit.
8915ab8c86SJohn Marino 	//
9015ab8c86SJohn Marino 	// SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
9115ab8c86SJohn Marino 	// version is sometimes significantly faster and sometimes
9215ab8c86SJohn Marino 	// slightly slower than this SSE2 version, so this SSE2
9315ab8c86SJohn Marino 	// version isn't used on x86-64.
9415ab8c86SJohn Marino #	define LZMA_MEMCMPLEN_EXTRA 16
9515ab8c86SJohn Marino 	while (len < limit) {
9615ab8c86SJohn Marino 		const uint32_t x = 0xFFFF ^ _mm_movemask_epi8(_mm_cmpeq_epi8(
9715ab8c86SJohn Marino 			_mm_loadu_si128((const __m128i *)(buf1 + len)),
9815ab8c86SJohn Marino 			_mm_loadu_si128((const __m128i *)(buf2 + len))));
9915ab8c86SJohn Marino 
10015ab8c86SJohn Marino 		if (x != 0) {
101*e151908bSDaniel Fojt 			len += ctz32(x);
10215ab8c86SJohn Marino 			return my_min(len, limit);
10315ab8c86SJohn Marino 		}
10415ab8c86SJohn Marino 
10515ab8c86SJohn Marino 		len += 16;
10615ab8c86SJohn Marino 	}
10715ab8c86SJohn Marino 
10815ab8c86SJohn Marino 	return limit;
10915ab8c86SJohn Marino 
11015ab8c86SJohn Marino #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
11115ab8c86SJohn Marino 	// Generic 32-bit little endian method
11215ab8c86SJohn Marino #	define LZMA_MEMCMPLEN_EXTRA 4
11315ab8c86SJohn Marino 	while (len < limit) {
114*e151908bSDaniel Fojt 		uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
11515ab8c86SJohn Marino 		if (x != 0) {
11615ab8c86SJohn Marino 			if ((x & 0xFFFF) == 0) {
11715ab8c86SJohn Marino 				len += 2;
11815ab8c86SJohn Marino 				x >>= 16;
11915ab8c86SJohn Marino 			}
12015ab8c86SJohn Marino 
12115ab8c86SJohn Marino 			if ((x & 0xFF) == 0)
12215ab8c86SJohn Marino 				++len;
12315ab8c86SJohn Marino 
12415ab8c86SJohn Marino 			return my_min(len, limit);
12515ab8c86SJohn Marino 		}
12615ab8c86SJohn Marino 
12715ab8c86SJohn Marino 		len += 4;
12815ab8c86SJohn Marino 	}
12915ab8c86SJohn Marino 
13015ab8c86SJohn Marino 	return limit;
13115ab8c86SJohn Marino 
13215ab8c86SJohn Marino #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
13315ab8c86SJohn Marino 	// Generic 32-bit big endian method
13415ab8c86SJohn Marino #	define LZMA_MEMCMPLEN_EXTRA 4
13515ab8c86SJohn Marino 	while (len < limit) {
136*e151908bSDaniel Fojt 		uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len);
13715ab8c86SJohn Marino 		if (x != 0) {
13815ab8c86SJohn Marino 			if ((x & 0xFFFF0000) == 0) {
13915ab8c86SJohn Marino 				len += 2;
14015ab8c86SJohn Marino 				x <<= 16;
14115ab8c86SJohn Marino 			}
14215ab8c86SJohn Marino 
14315ab8c86SJohn Marino 			if ((x & 0xFF000000) == 0)
14415ab8c86SJohn Marino 				++len;
14515ab8c86SJohn Marino 
14615ab8c86SJohn Marino 			return my_min(len, limit);
14715ab8c86SJohn Marino 		}
14815ab8c86SJohn Marino 
14915ab8c86SJohn Marino 		len += 4;
15015ab8c86SJohn Marino 	}
15115ab8c86SJohn Marino 
15215ab8c86SJohn Marino 	return limit;
15315ab8c86SJohn Marino 
15415ab8c86SJohn Marino #else
15515ab8c86SJohn Marino 	// Simple portable version that doesn't use unaligned access.
15615ab8c86SJohn Marino #	define LZMA_MEMCMPLEN_EXTRA 0
15715ab8c86SJohn Marino 	while (len < limit && buf1[len] == buf2[len])
15815ab8c86SJohn Marino 		++len;
15915ab8c86SJohn Marino 
16015ab8c86SJohn Marino 	return len;
16115ab8c86SJohn Marino #endif
16215ab8c86SJohn Marino }
16315ab8c86SJohn Marino 
16415ab8c86SJohn Marino #endif
165