1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI
3*3b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
4*3b35e7eeSXin LI //
5*3b35e7eeSXin LI /// \file crc32_arm64.h
6*3b35e7eeSXin LI /// \brief CRC32 calculation with ARM64 optimization
7*3b35e7eeSXin LI //
8*3b35e7eeSXin LI // Authors: Chenxi Mao
9*3b35e7eeSXin LI // Jia Tan
10*3b35e7eeSXin LI // Hans Jansen
11*3b35e7eeSXin LI //
12*3b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
13*3b35e7eeSXin LI
14*3b35e7eeSXin LI #ifndef LZMA_CRC32_ARM64_H
15*3b35e7eeSXin LI #define LZMA_CRC32_ARM64_H
16*3b35e7eeSXin LI
// MSVC always has the CRC intrinsics available when building for ARM64,
// so there is no need to include any header files.
19*3b35e7eeSXin LI #ifndef _MSC_VER
20*3b35e7eeSXin LI # include <arm_acle.h>
21*3b35e7eeSXin LI #endif
22*3b35e7eeSXin LI
23*3b35e7eeSXin LI // If both versions are going to be built, we need runtime detection
24*3b35e7eeSXin LI // to check if the instructions are supported.
25*3b35e7eeSXin LI #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
26*3b35e7eeSXin LI # if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
27*3b35e7eeSXin LI # include <sys/auxv.h>
28*3b35e7eeSXin LI # elif defined(_WIN32)
29*3b35e7eeSXin LI # include <processthreadsapi.h>
30*3b35e7eeSXin LI # elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
31*3b35e7eeSXin LI # include <sys/sysctl.h>
32*3b35e7eeSXin LI # endif
33*3b35e7eeSXin LI #endif
34*3b35e7eeSXin LI
// crc_attr_target is the attribute placed in front of
// crc32_arch_optimized(). With GCC-compatible compilers it expands to a
// function-level target attribute requesting the "+crc" extension.
//
// Compilers based on the EDG front end (Nvidia's nvcc, for example) can
// target ARM64 and define __GNUC__ without supporting function attributes,
// so for them, and for every compiler that is neither GCC nor Clang
// compatible, the macro expands to nothing.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if defined(__EDG__) || !(defined(__GNUC__) || defined(__clang__))
#	define crc_attr_target
#else
#	define crc_attr_target __attribute__((__target__("+crc")))
#endif
44*3b35e7eeSXin LI
45*3b35e7eeSXin LI
46*3b35e7eeSXin LI crc_attr_target
47*3b35e7eeSXin LI static uint32_t
crc32_arch_optimized(const uint8_t * buf,size_t size,uint32_t crc)48*3b35e7eeSXin LI crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
49*3b35e7eeSXin LI {
50*3b35e7eeSXin LI crc = ~crc;
51*3b35e7eeSXin LI
52*3b35e7eeSXin LI // Align the input buffer because this was shown to be
53*3b35e7eeSXin LI // significantly faster than unaligned accesses.
54*3b35e7eeSXin LI const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7);
55*3b35e7eeSXin LI
56*3b35e7eeSXin LI for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
57*3b35e7eeSXin LI crc = __crc32b(crc, *buf);
58*3b35e7eeSXin LI
59*3b35e7eeSXin LI size -= align_amount;
60*3b35e7eeSXin LI
61*3b35e7eeSXin LI // Process 8 bytes at a time. The end point is determined by
62*3b35e7eeSXin LI // ignoring the least significant three bits of size to ensure
63*3b35e7eeSXin LI // we do not process past the bounds of the buffer. This guarantees
64*3b35e7eeSXin LI // that limit is a multiple of 8 and is strictly less than size.
65*3b35e7eeSXin LI for (const uint8_t *limit = buf + (size & ~(size_t)7);
66*3b35e7eeSXin LI buf < limit; buf += 8)
67*3b35e7eeSXin LI crc = __crc32d(crc, aligned_read64le(buf));
68*3b35e7eeSXin LI
69*3b35e7eeSXin LI // Process the remaining bytes that are not 8 byte aligned.
70*3b35e7eeSXin LI for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
71*3b35e7eeSXin LI crc = __crc32b(crc, *buf);
72*3b35e7eeSXin LI
73*3b35e7eeSXin LI return ~crc;
74*3b35e7eeSXin LI }
75*3b35e7eeSXin LI
76*3b35e7eeSXin LI
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
/// \brief      Ask the operating system whether the CPU has the CRC32
///             instructions
///
/// Only built when both the generic and the arch-optimized versions are
/// compiled in. The query mechanism is picked at compile time.
///
/// \return     true if the platform reports CRC32 support; false if it
///             does not or if the query itself fails
static inline bool
is_arch_extension_supported(void)
{
#if defined(HAVE_GETAUXVAL)
	// The CRC32 capability is a bit in the AT_HWCAP auxiliary vector entry.
	const unsigned long hwcap = getauxval(AT_HWCAP);
	return (hwcap & HWCAP_CRC32) != 0;

#elif defined(HAVE_ELF_AUX_INFO)
	// Same AT_HWCAP bit as above, but elf_aux_info() reports failure
	// through its return value, so check that first.
	unsigned long hwcap;
	const int status = elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));

	if (status != 0)
		return false;

	return (hwcap & HWCAP_CRC32) != 0;

#elif defined(_WIN32)
	return IsProcessorFeaturePresent(
			PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);

#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
	int supported = 0;
	size_t supported_size = sizeof(supported);

	// sysctlbyname() identifies the CPU feature by name. Apple's
	// documentation lists "hw.optional.armv8_crc32" for CRC32:
	// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619
	const int status = sysctlbyname("hw.optional.armv8_crc32",
			&supported, &supported_size, NULL, 0);

	if (status != 0)
		return false;

	return supported;

#else
	// Not having any runtime detection method is a compile time error.
	// The checks in crc_common.h should ensure that a method is always
	// available whenever this function is built. Returning false here
	// would also work, but it would waste binary size and runtime
	// because only the generic method could ever be used.
#	error Runtime detection method unavailable.
#endif
}
#endif
121*3b35e7eeSXin LI
122*3b35e7eeSXin LI #endif // LZMA_CRC32_ARM64_H
123