1*f3087befSAndrew Turner /* 2*f3087befSAndrew Turner * Single-precision vector hypot(x) function. 3*f3087befSAndrew Turner * 4*f3087befSAndrew Turner * Copyright (c) 2023-2024, Arm Limited. 5*f3087befSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6*f3087befSAndrew Turner */ 7*f3087befSAndrew Turner 8*f3087befSAndrew Turner #include "v_math.h" 9*f3087befSAndrew Turner #include "test_sig.h" 10*f3087befSAndrew Turner #include "test_defs.h" 11*f3087befSAndrew Turner 12*f3087befSAndrew Turner #if WANT_SIMD_EXCEPT 13*f3087befSAndrew Turner static const struct data 14*f3087befSAndrew Turner { 15*f3087befSAndrew Turner uint32x4_t tiny_bound, thres; 16*f3087befSAndrew Turner } data = { 17*f3087befSAndrew Turner .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ 18*f3087befSAndrew Turner .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ 19*f3087befSAndrew Turner }; 20*f3087befSAndrew Turner #else 21*f3087befSAndrew Turner static const struct data 22*f3087befSAndrew Turner { 23*f3087befSAndrew Turner uint32x4_t tiny_bound; 24*f3087befSAndrew Turner uint16x8_t thres; 25*f3087befSAndrew Turner } data = { 26*f3087befSAndrew Turner .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ 27*f3087befSAndrew Turner .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ 28*f3087befSAndrew Turner }; 29*f3087befSAndrew Turner #endif 30*f3087befSAndrew Turner 31*f3087befSAndrew Turner static float32x4_t VPCS_ATTR NOINLINE 32*f3087befSAndrew Turner special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, 33*f3087befSAndrew Turner uint16x4_t special) 34*f3087befSAndrew Turner { 35*f3087befSAndrew Turner return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); 36*f3087befSAndrew Turner } 37*f3087befSAndrew Turner 38*f3087befSAndrew Turner /* Vector implementation of single-precision hypot. 39*f3087befSAndrew Turner Maximum error observed is 1.21 ULP: 40*f3087befSAndrew Turner _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 41*f3087befSAndrew Turner want 0x1.6a41dp-13. */ 42*f3087befSAndrew Turner #if WANT_SIMD_EXCEPT 43*f3087befSAndrew Turner 44*f3087befSAndrew Turner float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) 45*f3087befSAndrew Turner { 46*f3087befSAndrew Turner const struct data *d = ptr_barrier (&data); 47*f3087befSAndrew Turner 48*f3087befSAndrew Turner float32x4_t ax = vabsq_f32 (x); 49*f3087befSAndrew Turner float32x4_t ay = vabsq_f32 (y); 50*f3087befSAndrew Turner 51*f3087befSAndrew Turner uint32x4_t ix = vreinterpretq_u32_f32 (ax); 52*f3087befSAndrew Turner uint32x4_t iy = vreinterpretq_u32_f32 (ay); 53*f3087befSAndrew Turner 54*f3087befSAndrew Turner /* Extreme values, NaNs, and infinities should be handled by the scalar 55*f3087befSAndrew Turner fallback for correct flag handling. */ 56*f3087befSAndrew Turner uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); 57*f3087befSAndrew Turner uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); 58*f3087befSAndrew Turner ax = v_zerofy_f32 (ax, specialx); 59*f3087befSAndrew Turner ay = v_zerofy_f32 (ay, specialy); 60*f3087befSAndrew Turner uint16x4_t special = vaddhn_u32 (specialx, specialy); 61*f3087befSAndrew Turner 62*f3087befSAndrew Turner float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); 63*f3087befSAndrew Turner 64*f3087befSAndrew Turner if (unlikely (v_any_u16h (special))) 65*f3087befSAndrew Turner return special_case (x, y, sqsum, special); 66*f3087befSAndrew Turner 67*f3087befSAndrew Turner return vsqrtq_f32 (sqsum); 68*f3087befSAndrew Turner } 69*f3087befSAndrew Turner #else 70*f3087befSAndrew Turner 71*f3087befSAndrew Turner float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) 72*f3087befSAndrew Turner { 73*f3087befSAndrew Turner const struct data *d = ptr_barrier (&data); 74*f3087befSAndrew Turner 75*f3087befSAndrew Turner float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); 76*f3087befSAndrew Turner 77*f3087befSAndrew Turner uint16x4_t special 78*f3087befSAndrew Turner = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), 79*f3087befSAndrew Turner vget_low_u16 (d->thres)); 80*f3087befSAndrew Turner 81*f3087befSAndrew Turner if (unlikely (v_any_u16h (special))) 82*f3087befSAndrew Turner return special_case (x, y, sqsum, special); 83*f3087befSAndrew Turner 84*f3087befSAndrew Turner return vsqrtq_f32 (sqsum); 85*f3087befSAndrew Turner } 86*f3087befSAndrew Turner #endif 87*f3087befSAndrew Turner 88*f3087befSAndrew Turner HALF_WIDTH_ALIAS_F2 (hypot) 89*f3087befSAndrew Turner 90*f3087befSAndrew Turner TEST_SIG (V, F, 2, hypot, -10.0, 10.0) 91*f3087befSAndrew Turner TEST_ULP (V_NAME_F2 (hypot), 1.21) 92*f3087befSAndrew Turner TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) 93*f3087befSAndrew Turner TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) 94*f3087befSAndrew Turner TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) 95*f3087befSAndrew Turner TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) 96*f3087befSAndrew Turner TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) 97