1*81ad6265SDimitry Andric /*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== 2*81ad6265SDimitry Andric * 3*81ad6265SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*81ad6265SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 5*81ad6265SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*81ad6265SDimitry Andric * 7*81ad6265SDimitry Andric *===-----------------------------------------------------------------------=== 8*81ad6265SDimitry Andric */ 9*81ad6265SDimitry Andric 10*81ad6265SDimitry Andric #if !defined X86GPRINTRIN_H_ 11*81ad6265SDimitry Andric #error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." 12*81ad6265SDimitry Andric #endif 13*81ad6265SDimitry Andric 14*81ad6265SDimitry Andric #ifndef BMI2INTRIN_H_ 15*81ad6265SDimitry Andric #define BMI2INTRIN_H_ 16*81ad6265SDimitry Andric 17*81ad6265SDimitry Andric extern __inline unsigned int 18*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _bzhi_u32(unsigned int __X,unsigned int __Y)19*81ad6265SDimitry Andric _bzhi_u32(unsigned int __X, unsigned int __Y) { 20*81ad6265SDimitry Andric return ((__X << (32 - __Y)) >> (32 - __Y)); 21*81ad6265SDimitry Andric } 22*81ad6265SDimitry Andric 23*81ad6265SDimitry Andric extern __inline unsigned int 24*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mulx_u32(unsigned int __X,unsigned int __Y,unsigned int * __P)25*81ad6265SDimitry Andric _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { 26*81ad6265SDimitry Andric unsigned long long __res = (unsigned long long)__X * __Y; 27*81ad6265SDimitry Andric *__P = (unsigned int)(__res >> 32); 28*81ad6265SDimitry Andric return (unsigned int)__res; 29*81ad6265SDimitry Andric } 30*81ad6265SDimitry Andric 31*81ad6265SDimitry Andric #ifdef __PPC64__ 32*81ad6265SDimitry Andric extern __inline unsigned long long 33*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _bzhi_u64(unsigned long long __X,unsigned long long __Y)34*81ad6265SDimitry Andric _bzhi_u64(unsigned long long __X, unsigned long long __Y) { 35*81ad6265SDimitry Andric return ((__X << (64 - __Y)) >> (64 - __Y)); 36*81ad6265SDimitry Andric } 37*81ad6265SDimitry Andric 38*81ad6265SDimitry Andric /* __int128 requires base 64-bit. */ 39*81ad6265SDimitry Andric extern __inline unsigned long long 40*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mulx_u64(unsigned long long __X,unsigned long long __Y,unsigned long long * __P)41*81ad6265SDimitry Andric _mulx_u64(unsigned long long __X, unsigned long long __Y, 42*81ad6265SDimitry Andric unsigned long long *__P) { 43*81ad6265SDimitry Andric unsigned __int128 __res = (unsigned __int128)__X * __Y; 44*81ad6265SDimitry Andric *__P = (unsigned long long)(__res >> 64); 45*81ad6265SDimitry Andric return (unsigned long long)__res; 46*81ad6265SDimitry Andric } 47*81ad6265SDimitry Andric 48*81ad6265SDimitry Andric #ifdef _ARCH_PWR7 49*81ad6265SDimitry Andric /* popcount and bpermd require power7 minimum. */ 50*81ad6265SDimitry Andric extern __inline unsigned long long 51*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pdep_u64(unsigned long long __X,unsigned long long __M)52*81ad6265SDimitry Andric _pdep_u64(unsigned long long __X, unsigned long long __M) { 53*81ad6265SDimitry Andric unsigned long __result = 0x0UL; 54*81ad6265SDimitry Andric const unsigned long __mask = 0x8000000000000000UL; 55*81ad6265SDimitry Andric unsigned long __m = __M; 56*81ad6265SDimitry Andric unsigned long __c, __t; 57*81ad6265SDimitry Andric unsigned long __p; 58*81ad6265SDimitry Andric 59*81ad6265SDimitry Andric /* The pop-count of the mask gives the number of the bits from 60*81ad6265SDimitry Andric source to process. This is also needed to shift bits from the 61*81ad6265SDimitry Andric source into the correct position for the result. */ 62*81ad6265SDimitry Andric __p = 64 - __builtin_popcountl(__M); 63*81ad6265SDimitry Andric 64*81ad6265SDimitry Andric /* The loop is for the number of '1' bits in the mask and clearing 65*81ad6265SDimitry Andric each mask bit as it is processed. */ 66*81ad6265SDimitry Andric while (__m != 0) { 67*81ad6265SDimitry Andric __c = __builtin_clzl(__m); 68*81ad6265SDimitry Andric __t = __X << (__p - __c); 69*81ad6265SDimitry Andric __m ^= (__mask >> __c); 70*81ad6265SDimitry Andric __result |= (__t & (__mask >> __c)); 71*81ad6265SDimitry Andric __p++; 72*81ad6265SDimitry Andric } 73*81ad6265SDimitry Andric return __result; 74*81ad6265SDimitry Andric } 75*81ad6265SDimitry Andric 76*81ad6265SDimitry Andric extern __inline unsigned long long 77*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pext_u64(unsigned long long __X,unsigned long long __M)78*81ad6265SDimitry Andric _pext_u64(unsigned long long __X, unsigned long long __M) { 79*81ad6265SDimitry Andric unsigned long __p = 0x4040404040404040UL; // initial bit permute control 80*81ad6265SDimitry Andric const unsigned long __mask = 0x8000000000000000UL; 81*81ad6265SDimitry Andric unsigned long __m = __M; 82*81ad6265SDimitry Andric unsigned long __c; 83*81ad6265SDimitry Andric unsigned long __result; 84*81ad6265SDimitry Andric 85*81ad6265SDimitry Andric /* if the mask is constant and selects 8 bits or less we can use 86*81ad6265SDimitry Andric the Power8 Bit permute instruction. */ 87*81ad6265SDimitry Andric if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { 88*81ad6265SDimitry Andric /* Also if the pext mask is constant, then the popcount is 89*81ad6265SDimitry Andric constant, we can evaluate the following loop at compile 90*81ad6265SDimitry Andric time and use a constant bit permute vector. */ 91*81ad6265SDimitry Andric long __i; 92*81ad6265SDimitry Andric for (__i = 0; __i < __builtin_popcountl(__M); __i++) { 93*81ad6265SDimitry Andric __c = __builtin_clzl(__m); 94*81ad6265SDimitry Andric __p = (__p << 8) | __c; 95*81ad6265SDimitry Andric __m ^= (__mask >> __c); 96*81ad6265SDimitry Andric } 97*81ad6265SDimitry Andric __result = __builtin_bpermd(__p, __X); 98*81ad6265SDimitry Andric } else { 99*81ad6265SDimitry Andric __p = 64 - __builtin_popcountl(__M); 100*81ad6265SDimitry Andric __result = 0; 101*81ad6265SDimitry Andric /* We could a use a for loop here, but that combined with 102*81ad6265SDimitry Andric -funroll-loops can expand to a lot of code. The while 103*81ad6265SDimitry Andric loop avoids unrolling and the compiler commons the xor 104*81ad6265SDimitry Andric from clearing the mask bit with the (m != 0) test. The 105*81ad6265SDimitry Andric result is a more compact loop setup and body. */ 106*81ad6265SDimitry Andric while (__m != 0) { 107*81ad6265SDimitry Andric unsigned long __t; 108*81ad6265SDimitry Andric __c = __builtin_clzl(__m); 109*81ad6265SDimitry Andric __t = (__X & (__mask >> __c)) >> (__p - __c); 110*81ad6265SDimitry Andric __m ^= (__mask >> __c); 111*81ad6265SDimitry Andric __result |= (__t); 112*81ad6265SDimitry Andric __p++; 113*81ad6265SDimitry Andric } 114*81ad6265SDimitry Andric } 115*81ad6265SDimitry Andric return __result; 116*81ad6265SDimitry Andric } 117*81ad6265SDimitry Andric 118*81ad6265SDimitry Andric /* these 32-bit implementations depend on 64-bit pdep/pext 119*81ad6265SDimitry Andric which depend on _ARCH_PWR7. */ 120*81ad6265SDimitry Andric extern __inline unsigned int 121*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pdep_u32(unsigned int __X,unsigned int __Y)122*81ad6265SDimitry Andric _pdep_u32(unsigned int __X, unsigned int __Y) { 123*81ad6265SDimitry Andric return _pdep_u64(__X, __Y); 124*81ad6265SDimitry Andric } 125*81ad6265SDimitry Andric 126*81ad6265SDimitry Andric extern __inline unsigned int 127*81ad6265SDimitry Andric __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _pext_u32(unsigned int __X,unsigned int __Y)128*81ad6265SDimitry Andric _pext_u32(unsigned int __X, unsigned int __Y) { 129*81ad6265SDimitry Andric return _pext_u64(__X, __Y); 130*81ad6265SDimitry Andric } 131*81ad6265SDimitry Andric #endif /* _ARCH_PWR7 */ 132*81ad6265SDimitry Andric #endif /* __PPC64__ */ 133*81ad6265SDimitry Andric 134*81ad6265SDimitry Andric #endif /* BMI2INTRIN_H_ */ 135