/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_

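/* _bzhi_u32: zero the bits of __X at and above bit position __Y, i.e. keep
   only the low __Y bits.  For example, _bzhi_u32(0xFFFFFFFF, 8) == 0xFF.
   Note that the shift counts here are well defined only for
   1 <= __Y <= 32.  */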
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  return ((__X << (32 - __Y)) >> (32 - __Y));
}

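/* _mulx_u32: unsigned 32 x 32 -> 64-bit multiply.  The high 32 bits of the
   product are stored in *__P and the low 32 bits are returned, e.g.
   _mulx_u32(0x80000000, 4, &__hi) returns 0 and sets __hi to 2.  */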
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long)__X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
}

#ifdef __PPC64__
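/* _bzhi_u64: 64-bit form of _bzhi_u32; keep only the low __Y bits of __X,
   e.g. _bzhi_u64(~0ULL, 20) == 0xFFFFF.  The shifts are well defined only
   for 1 <= __Y <= 64.  */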
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
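/* _mulx_u64: unsigned 64 x 64 -> 128-bit multiply.  The high 64 bits of the
   product are stored in *__P and the low 64 bits are returned, e.g.
   _mulx_u64(1ULL << 32, 1ULL << 32, &__hi) returns 0 and sets __hi to 1.  */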
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__res >> 64);
  return (unsigned long long)__res;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
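/* _pdep_u64 (parallel bits deposit): take the low-order bits of __X and
   deposit them, in order, into the positions of the '1' bits of the mask
   __M; every other result bit is zero.  For example,
   _pdep_u64(0x5, 0xF0) == 0x50: the low four source bits 0b0101 are
   scattered into bit positions 4..7.  */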
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
   source to process.  It is also needed to shift bits from the
   source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl(__M);

  /* The loop runs once for each '1' bit in the mask, clearing
   each mask bit as it is processed.  */
  while (__m != 0) {
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c);
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}

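/* _pext_u64 (parallel bits extract): the inverse of _pdep_u64.  Gather the
   bits of __X that sit under the '1' bits of the mask __M and pack them
   into the low-order bits of the result, e.g.
   _pext_u64(0x50, 0xF0) == 0x5.  */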
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or less, we can use
   the bpermd (bit-permute doubleword) instruction.  */
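  /* bpermd interprets each byte of its first operand as a bit index into
   its second operand, counted from the most-significant bit (the same
   numbering __builtin_clzl produces); an index of 64 or more selects a
   zero bit, which is why __p starts out with 64 (0x40) in every byte.
   The eight selected bits are returned in the low-order byte.  */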
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* If the pext mask is constant, then its popcount is also
     constant, so the following loop can be evaluated at compile
     time and a constant bit-permute control word used.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but combined with
     -funroll-loops it can expand to a lot of code.  The while
     loop avoids unrolling and the compiler commons the xor
     from clearing the mask bit with the (m != 0) test.  The
     result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}

/* These 32-bit implementations depend on the 64-bit pdep/pext,
   which in turn depend on _ARCH_PWR7.  */
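/* The 32-bit arguments are zero-extended, so the deposited/extracted bits
   all fall within the low 32 bits of the 64-bit result and the truncation
   back to unsigned int is exact.  */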
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return _pdep_u64(__X, __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* BMI2INTRIN_H_ */