xref: /freebsd-src/contrib/xz/src/liblzma/check/crc32_arm64.h (revision 3b35e7ee8de9b0260149a2b77e87a2b9c7a36244)
1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI 
3*3b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
4*3b35e7eeSXin LI //
5*3b35e7eeSXin LI /// \file       crc32_arm64.h
6*3b35e7eeSXin LI /// \brief      CRC32 calculation with ARM64 optimization
7*3b35e7eeSXin LI //
8*3b35e7eeSXin LI //  Authors:    Chenxi Mao
9*3b35e7eeSXin LI //              Jia Tan
10*3b35e7eeSXin LI //              Hans Jansen
11*3b35e7eeSXin LI //
12*3b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
13*3b35e7eeSXin LI 
14*3b35e7eeSXin LI #ifndef LZMA_CRC32_ARM64_H
15*3b35e7eeSXin LI #define LZMA_CRC32_ARM64_H
16*3b35e7eeSXin LI 
17*3b35e7eeSXin LI // MSVC always has the CRC intrinsics available when building for ARM64,
18*3b35e7eeSXin LI // so there is no need to include any header files.
19*3b35e7eeSXin LI #ifndef _MSC_VER
20*3b35e7eeSXin LI #	include <arm_acle.h>
21*3b35e7eeSXin LI #endif
22*3b35e7eeSXin LI 
23*3b35e7eeSXin LI // If both versions are going to be built, we need runtime detection
24*3b35e7eeSXin LI // to check if the instructions are supported.
25*3b35e7eeSXin LI #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
26*3b35e7eeSXin LI #	if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
27*3b35e7eeSXin LI #		include <sys/auxv.h>
28*3b35e7eeSXin LI #	elif defined(_WIN32)
29*3b35e7eeSXin LI #		include <processthreadsapi.h>
30*3b35e7eeSXin LI #	elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
31*3b35e7eeSXin LI #		include <sys/sysctl.h>
32*3b35e7eeSXin LI #	endif
33*3b35e7eeSXin LI #endif
34*3b35e7eeSXin LI 
35*3b35e7eeSXin LI // Some EDG-based compilers support ARM64 and define __GNUC__
36*3b35e7eeSXin LI // (such as Nvidia's nvcc), but do not support function attributes.
37*3b35e7eeSXin LI //
38*3b35e7eeSXin LI // NOTE: Build systems check for this too, keep them in sync with this.
39*3b35e7eeSXin LI #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
40*3b35e7eeSXin LI #	define crc_attr_target __attribute__((__target__("+crc")))
41*3b35e7eeSXin LI #else
42*3b35e7eeSXin LI #	define crc_attr_target
43*3b35e7eeSXin LI #endif
44*3b35e7eeSXin LI 
45*3b35e7eeSXin LI 
46*3b35e7eeSXin LI crc_attr_target
47*3b35e7eeSXin LI static uint32_t
crc32_arch_optimized(const uint8_t * buf,size_t size,uint32_t crc)48*3b35e7eeSXin LI crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
49*3b35e7eeSXin LI {
50*3b35e7eeSXin LI 	crc = ~crc;
51*3b35e7eeSXin LI 
52*3b35e7eeSXin LI 	// Align the input buffer because this was shown to be
53*3b35e7eeSXin LI 	// significantly faster than unaligned accesses.
54*3b35e7eeSXin LI 	const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
55*3b35e7eeSXin LI 
56*3b35e7eeSXin LI 	for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
57*3b35e7eeSXin LI 		crc = __crc32b(crc, *buf);
58*3b35e7eeSXin LI 
59*3b35e7eeSXin LI 	size -= align_amount;
60*3b35e7eeSXin LI 
61*3b35e7eeSXin LI 	// Process 8 bytes at a time. The end point is determined by
62*3b35e7eeSXin LI 	// ignoring the least significant three bits of size to ensure
63*3b35e7eeSXin LI 	// we do not process past the bounds of the buffer. This guarantees
64*3b35e7eeSXin LI 	// that limit is a multiple of 8 and is strictly less than size.
65*3b35e7eeSXin LI 	for (const uint8_t *limit = buf + (size & ~(size_t)7);
66*3b35e7eeSXin LI 			buf < limit; buf += 8)
67*3b35e7eeSXin LI 		crc = __crc32d(crc, aligned_read64le(buf));
68*3b35e7eeSXin LI 
69*3b35e7eeSXin LI 	// Process the remaining bytes that are not 8 byte aligned.
70*3b35e7eeSXin LI 	for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
71*3b35e7eeSXin LI 		crc = __crc32b(crc, *buf);
72*3b35e7eeSXin LI 
73*3b35e7eeSXin LI 	return ~crc;
74*3b35e7eeSXin LI }
75*3b35e7eeSXin LI 
76*3b35e7eeSXin LI 
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
// Runtime check for the ARM64 CRC32 instructions.
//
// This is only built when both CRC32_GENERIC and CRC32_ARCH_OPTIMIZED
// are defined. The detection method is selected at compile time from
// getauxval(), elf_aux_info(), IsProcessorFeaturePresent() (Windows)
// or sysctlbyname() (Apple), in that order of preference.
//
// Returns true if the selected method reports that the CRC32
// instructions are available. Returns false if it reports that they
// are not, or if the elf_aux_info() or sysctlbyname() query itself fails.
static inline bool
is_arch_extension_supported(void)
{
#if defined(HAVE_GETAUXVAL)
	const unsigned long hwcap = getauxval(AT_HWCAP);
	return (hwcap & HWCAP_CRC32) != 0;

#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hwcap;
	const int status = elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));

	// hwcap is only examined when the query succeeded; the
	// short-circuit keeps the unset value from being read on failure.
	return status == 0 && (hwcap & HWCAP_CRC32) != 0;

#elif defined(_WIN32)
	return IsProcessorFeaturePresent(
			PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;

#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
	int flag = 0;
	size_t flag_size = sizeof(flag);

	// sysctlbyname() identifies the CPU feature by name. Apple's
	// documentation lists "hw.optional.armv8_crc32" for CRC32:
	// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619
	const int status = sysctlbyname("hw.optional.armv8_crc32", &flag,
			&flag_size, NULL, 0);

	// A failed query is treated the same as "not supported".
	return status == 0 && flag != 0;

#else
	// Without a runtime detection method this must fail at compile
	// time. The checks in crc_common.h should ensure that a detection
	// method is always available whenever this function is built.
	// Returning false here would also work, but it would waste binary
	// size and runtime because only the generic method could ever
	// be used.
#	error Runtime detection method unavailable.
#endif
}
#endif
121*3b35e7eeSXin LI 
122*3b35e7eeSXin LI #endif // LZMA_CRC32_ARM64_H
123