1*f3087befSAndrew Turner /* 2*f3087befSAndrew Turner * Vector math abstractions. 3*f3087befSAndrew Turner * 4*f3087befSAndrew Turner * Copyright (c) 2019-2024, Arm Limited. 5*f3087befSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6*f3087befSAndrew Turner */ 7*f3087befSAndrew Turner 8*f3087befSAndrew Turner #ifndef _V_MATH_H 9*f3087befSAndrew Turner #define _V_MATH_H 10*f3087befSAndrew Turner 11*f3087befSAndrew Turner #if !__aarch64__ 12*f3087befSAndrew Turner # error "Cannot build without AArch64" 13*f3087befSAndrew Turner #endif 14*f3087befSAndrew Turner 15*f3087befSAndrew Turner #define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) 16*f3087befSAndrew Turner 17*f3087befSAndrew Turner #define V_NAME_F1(fun) _ZGVnN4v_##fun##f 18*f3087befSAndrew Turner #define V_NAME_D1(fun) _ZGVnN2v_##fun 19*f3087befSAndrew Turner #define V_NAME_F2(fun) _ZGVnN4vv_##fun##f 20*f3087befSAndrew Turner #define V_NAME_D2(fun) _ZGVnN2vv_##fun 21*f3087befSAndrew Turner #define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f 22*f3087befSAndrew Turner #define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun 23*f3087befSAndrew Turner 24*f3087befSAndrew Turner #if USE_GLIBC_ABI 25*f3087befSAndrew Turner 26*f3087befSAndrew Turner # define HALF_WIDTH_ALIAS_F1(fun) \ 27*f3087befSAndrew Turner float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \ 28*f3087befSAndrew Turner { \ 29*f3087befSAndrew Turner return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \ 30*f3087befSAndrew Turner } 31*f3087befSAndrew Turner 32*f3087befSAndrew Turner # define HALF_WIDTH_ALIAS_F2(fun) \ 33*f3087befSAndrew Turner float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \ 34*f3087befSAndrew Turner { \ 35*f3087befSAndrew Turner return vget_low_f32 ( \ 36*f3087befSAndrew Turner _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \ 37*f3087befSAndrew Turner } 38*f3087befSAndrew Turner 39*f3087befSAndrew Turner #else 40*f3087befSAndrew Turner # define HALF_WIDTH_ALIAS_F1(fun) 41*f3087befSAndrew Turner # define HALF_WIDTH_ALIAS_F2(fun) 42*f3087befSAndrew Turner #endif 43*f3087befSAndrew Turner 44*f3087befSAndrew Turner #include <stdint.h> 45*f3087befSAndrew Turner #include "math_config.h" 46*f3087befSAndrew Turner #include <arm_neon.h> 47*f3087befSAndrew Turner 48*f3087befSAndrew Turner /* Shorthand helpers for declaring constants. */ 49*f3087befSAndrew Turner #define V2(X) \ 50*f3087befSAndrew Turner { \ 51*f3087befSAndrew Turner X, X \ 52*f3087befSAndrew Turner } 53*f3087befSAndrew Turner #define V4(X) \ 54*f3087befSAndrew Turner { \ 55*f3087befSAndrew Turner X, X, X, X \ 56*f3087befSAndrew Turner } 57*f3087befSAndrew Turner #define V8(X) \ 58*f3087befSAndrew Turner { \ 59*f3087befSAndrew Turner X, X, X, X, X, X, X, X \ 60*f3087befSAndrew Turner } 61*f3087befSAndrew Turner 62*f3087befSAndrew Turner static inline int 63*f3087befSAndrew Turner v_any_u16h (uint16x4_t x) 64*f3087befSAndrew Turner { 65*f3087befSAndrew Turner return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; 66*f3087befSAndrew Turner } 67*f3087befSAndrew Turner 68*f3087befSAndrew Turner static inline int 69*f3087befSAndrew Turner v_lanes32 (void) 70*f3087befSAndrew Turner { 71*f3087befSAndrew Turner return 4; 72*f3087befSAndrew Turner } 73*f3087befSAndrew Turner 74*f3087befSAndrew Turner static inline float32x4_t 75*f3087befSAndrew Turner v_f32 (float x) 76*f3087befSAndrew Turner { 77*f3087befSAndrew Turner return (float32x4_t) V4 (x); 78*f3087befSAndrew Turner } 79*f3087befSAndrew Turner static inline uint32x4_t 80*f3087befSAndrew Turner v_u32 (uint32_t x) 81*f3087befSAndrew Turner { 82*f3087befSAndrew Turner return (uint32x4_t) V4 (x); 83*f3087befSAndrew Turner } 84*f3087befSAndrew Turner static inline int32x4_t 85*f3087befSAndrew Turner v_s32 (int32_t x) 86*f3087befSAndrew Turner { 87*f3087befSAndrew Turner return (int32x4_t) V4 (x); 88*f3087befSAndrew Turner } 89*f3087befSAndrew Turner 90*f3087befSAndrew Turner /* true if any elements of a v_cond result is non-zero. */ 91*f3087befSAndrew Turner static inline int 92*f3087befSAndrew Turner v_any_u32 (uint32x4_t x) 93*f3087befSAndrew Turner { 94*f3087befSAndrew Turner /* assume elements in x are either 0 or -1u. */ 95*f3087befSAndrew Turner return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; 96*f3087befSAndrew Turner } 97*f3087befSAndrew Turner static inline int 98*f3087befSAndrew Turner v_any_u32h (uint32x2_t x) 99*f3087befSAndrew Turner { 100*f3087befSAndrew Turner return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; 101*f3087befSAndrew Turner } 102*f3087befSAndrew Turner static inline float32x4_t 103*f3087befSAndrew Turner v_lookup_f32 (const float *tab, uint32x4_t idx) 104*f3087befSAndrew Turner { 105*f3087befSAndrew Turner return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; 106*f3087befSAndrew Turner } 107*f3087befSAndrew Turner static inline uint32x4_t 108*f3087befSAndrew Turner v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) 109*f3087befSAndrew Turner { 110*f3087befSAndrew Turner return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; 111*f3087befSAndrew Turner } 112*f3087befSAndrew Turner static inline float32x4_t 113*f3087befSAndrew Turner v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) 114*f3087befSAndrew Turner { 115*f3087befSAndrew Turner return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], 116*f3087befSAndrew Turner p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] }; 117*f3087befSAndrew Turner } 118*f3087befSAndrew Turner static inline float32x4_t 119*f3087befSAndrew Turner v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, 120*f3087befSAndrew Turner float32x4_t y, uint32x4_t p) 121*f3087befSAndrew Turner { 122*f3087befSAndrew Turner return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0], 123*f3087befSAndrew Turner p[1] ? f (x1[1], x2[1]) : y[1], 124*f3087befSAndrew Turner p[2] ? f (x1[2], x2[2]) : y[2], 125*f3087befSAndrew Turner p[3] ? f (x1[3], x2[3]) : y[3] }; 126*f3087befSAndrew Turner } 127*f3087befSAndrew Turner static inline float32x4_t 128*f3087befSAndrew Turner v_zerofy_f32 (float32x4_t x, uint32x4_t mask) 129*f3087befSAndrew Turner { 130*f3087befSAndrew Turner return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); 131*f3087befSAndrew Turner } 132*f3087befSAndrew Turner 133*f3087befSAndrew Turner static inline int 134*f3087befSAndrew Turner v_lanes64 (void) 135*f3087befSAndrew Turner { 136*f3087befSAndrew Turner return 2; 137*f3087befSAndrew Turner } 138*f3087befSAndrew Turner static inline float64x2_t 139*f3087befSAndrew Turner v_f64 (double x) 140*f3087befSAndrew Turner { 141*f3087befSAndrew Turner return (float64x2_t) V2 (x); 142*f3087befSAndrew Turner } 143*f3087befSAndrew Turner static inline uint64x2_t 144*f3087befSAndrew Turner v_u64 (uint64_t x) 145*f3087befSAndrew Turner { 146*f3087befSAndrew Turner return (uint64x2_t) V2 (x); 147*f3087befSAndrew Turner } 148*f3087befSAndrew Turner static inline int64x2_t 149*f3087befSAndrew Turner v_s64 (int64_t x) 150*f3087befSAndrew Turner { 151*f3087befSAndrew Turner return (int64x2_t) V2 (x); 152*f3087befSAndrew Turner } 153*f3087befSAndrew Turner 154*f3087befSAndrew Turner /* true if any elements of a v_cond result is non-zero. */ 155*f3087befSAndrew Turner static inline int 156*f3087befSAndrew Turner v_any_u64 (uint64x2_t x) 157*f3087befSAndrew Turner { 158*f3087befSAndrew Turner /* assume elements in x are either 0 or -1u. */ 159*f3087befSAndrew Turner return vpaddd_u64 (x) != 0; 160*f3087befSAndrew Turner } 161*f3087befSAndrew Turner static inline float64x2_t 162*f3087befSAndrew Turner v_lookup_f64 (const double *tab, uint64x2_t idx) 163*f3087befSAndrew Turner { 164*f3087befSAndrew Turner return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; 165*f3087befSAndrew Turner } 166*f3087befSAndrew Turner static inline uint64x2_t 167*f3087befSAndrew Turner v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) 168*f3087befSAndrew Turner { 169*f3087befSAndrew Turner return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; 170*f3087befSAndrew Turner } 171*f3087befSAndrew Turner static inline float64x2_t 172*f3087befSAndrew Turner v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) 173*f3087befSAndrew Turner { 174*f3087befSAndrew Turner double p1 = p[1]; 175*f3087befSAndrew Turner double x1 = x[1]; 176*f3087befSAndrew Turner if (likely (p[0])) 177*f3087befSAndrew Turner y[0] = f (x[0]); 178*f3087befSAndrew Turner if (likely (p1)) 179*f3087befSAndrew Turner y[1] = f (x1); 180*f3087befSAndrew Turner return y; 181*f3087befSAndrew Turner } 182*f3087befSAndrew Turner 183*f3087befSAndrew Turner static inline float64x2_t 184*f3087befSAndrew Turner v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, 185*f3087befSAndrew Turner float64x2_t y, uint64x2_t p) 186*f3087befSAndrew Turner { 187*f3087befSAndrew Turner double p1 = p[1]; 188*f3087befSAndrew Turner double x1h = x1[1]; 189*f3087befSAndrew Turner double x2h = x2[1]; 190*f3087befSAndrew Turner if (likely (p[0])) 191*f3087befSAndrew Turner y[0] = f (x1[0], x2[0]); 192*f3087befSAndrew Turner if (likely (p1)) 193*f3087befSAndrew Turner y[1] = f (x1h, x2h); 194*f3087befSAndrew Turner return y; 195*f3087befSAndrew Turner } 196*f3087befSAndrew Turner static inline float64x2_t 197*f3087befSAndrew Turner v_zerofy_f64 (float64x2_t x, uint64x2_t mask) 198*f3087befSAndrew Turner { 199*f3087befSAndrew Turner return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); 200*f3087befSAndrew Turner } 201*f3087befSAndrew Turner 202*f3087befSAndrew Turner #endif 203