1*f3087befSAndrew Turner /* 2*f3087befSAndrew Turner * Generic helpers for evaluating polynomials with various schemes. 3*f3087befSAndrew Turner * 4*f3087befSAndrew Turner * Copyright (c) 2023-2024, Arm Limited. 5*f3087befSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6*f3087befSAndrew Turner */ 7*f3087befSAndrew Turner 8*f3087befSAndrew Turner #ifndef VTYPE 9*f3087befSAndrew Turner # error Cannot use poly_generic without defining VTYPE 10*f3087befSAndrew Turner #endif 11*f3087befSAndrew Turner #ifndef VWRAP 12*f3087befSAndrew Turner # error Cannot use poly_generic without defining VWRAP 13*f3087befSAndrew Turner #endif 14*f3087befSAndrew Turner #ifndef FMA 15*f3087befSAndrew Turner # error Cannot use poly_generic without defining FMA 16*f3087befSAndrew Turner #endif 17*f3087befSAndrew Turner 18*f3087befSAndrew Turner static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2, 19*f3087befSAndrew Turner const VTYPE *poly) 20*f3087befSAndrew Turner { 21*f3087befSAndrew Turner /* At order 3, Estrin and Pairwise Horner are identical. */ 22*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 23*f3087befSAndrew Turner VTYPE p23 = FMA (poly[3], x, poly[2]); 24*f3087befSAndrew Turner return FMA (p23, x2, p01); 25*f3087befSAndrew Turner } 26*f3087befSAndrew Turner 27*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4, 28*f3087befSAndrew Turner const VTYPE *poly) 29*f3087befSAndrew Turner { 30*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); 31*f3087befSAndrew Turner return FMA (poly[4], x4, p03); 32*f3087befSAndrew Turner } 33*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4, 34*f3087befSAndrew Turner const VTYPE *poly) 35*f3087befSAndrew Turner { 36*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); 37*f3087befSAndrew Turner VTYPE p45 = FMA (poly[5], x, poly[4]); 38*f3087befSAndrew Turner return FMA (p45, x4, p03); 39*f3087befSAndrew Turner } 40*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4, 41*f3087befSAndrew Turner const VTYPE *poly) 42*f3087befSAndrew Turner { 43*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); 44*f3087befSAndrew Turner VTYPE p45 = FMA (poly[5], x, poly[4]); 45*f3087befSAndrew Turner VTYPE p46 = FMA (poly[6], x2, p45); 46*f3087befSAndrew Turner return FMA (p46, x4, p03); 47*f3087befSAndrew Turner } 48*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4, 49*f3087befSAndrew Turner const VTYPE *poly) 50*f3087befSAndrew Turner { 51*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); 52*f3087befSAndrew Turner VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4); 53*f3087befSAndrew Turner return FMA (p47, x4, p03); 54*f3087befSAndrew Turner } 55*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 56*f3087befSAndrew Turner const VTYPE *poly) 57*f3087befSAndrew Turner { 58*f3087befSAndrew Turner return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly)); 59*f3087befSAndrew Turner } 60*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 61*f3087befSAndrew Turner const VTYPE *poly) 62*f3087befSAndrew Turner { 63*f3087befSAndrew Turner VTYPE p89 = FMA (poly[9], x, poly[8]); 64*f3087befSAndrew Turner return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly)); 65*f3087befSAndrew Turner } 66*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 67*f3087befSAndrew Turner const VTYPE *poly) 68*f3087befSAndrew Turner { 69*f3087befSAndrew Turner VTYPE p89 = FMA (poly[9], x, poly[8]); 70*f3087befSAndrew Turner VTYPE p8_10 = FMA (poly[10], x2, p89); 71*f3087befSAndrew Turner return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly)); 72*f3087befSAndrew Turner } 73*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 74*f3087befSAndrew Turner const VTYPE *poly) 75*f3087befSAndrew Turner { 76*f3087befSAndrew Turner VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8); 77*f3087befSAndrew Turner return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly)); 78*f3087befSAndrew Turner } 79*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 80*f3087befSAndrew Turner const VTYPE *poly) 81*f3087befSAndrew Turner { 82*f3087befSAndrew Turner return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8, 83*f3087befSAndrew Turner VWRAP (estrin_7) (x, x2, x4, poly)); 84*f3087befSAndrew Turner } 85*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 86*f3087befSAndrew Turner const VTYPE *poly) 87*f3087befSAndrew Turner { 88*f3087befSAndrew Turner return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8, 89*f3087befSAndrew Turner VWRAP (estrin_7) (x, x2, x4, poly)); 90*f3087befSAndrew Turner } 91*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 92*f3087befSAndrew Turner const VTYPE *poly) 93*f3087befSAndrew Turner { 94*f3087befSAndrew Turner return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8, 95*f3087befSAndrew Turner VWRAP (estrin_7) (x, x2, x4, poly)); 96*f3087befSAndrew Turner } 97*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 98*f3087befSAndrew Turner const VTYPE *poly) 99*f3087befSAndrew Turner { 100*f3087befSAndrew Turner return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8, 101*f3087befSAndrew Turner VWRAP (estrin_7) (x, x2, x4, poly)); 102*f3087befSAndrew Turner } 103*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 104*f3087befSAndrew Turner VTYPE x16, const VTYPE *poly) 105*f3087befSAndrew Turner { 106*f3087befSAndrew Turner return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); 107*f3087befSAndrew Turner } 108*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 109*f3087befSAndrew Turner VTYPE x16, const VTYPE *poly) 110*f3087befSAndrew Turner { 111*f3087befSAndrew Turner VTYPE p16_17 = FMA (poly[17], x, poly[16]); 112*f3087befSAndrew Turner return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); 113*f3087befSAndrew Turner } 114*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 115*f3087befSAndrew Turner VTYPE x16, const VTYPE *poly) 116*f3087befSAndrew Turner { 117*f3087befSAndrew Turner VTYPE p16_17 = FMA (poly[17], x, poly[16]); 118*f3087befSAndrew Turner VTYPE p16_18 = FMA (poly[18], x2, p16_17); 119*f3087befSAndrew Turner return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); 120*f3087befSAndrew Turner } 121*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, 122*f3087befSAndrew Turner VTYPE x16, const VTYPE *poly) 123*f3087befSAndrew Turner { 124*f3087befSAndrew Turner VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16); 125*f3087befSAndrew Turner return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); 126*f3087befSAndrew Turner } 127*f3087befSAndrew Turner 128*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly) 129*f3087befSAndrew Turner { 130*f3087befSAndrew Turner VTYPE p = FMA (poly[2], x, poly[1]); 131*f3087befSAndrew Turner return FMA (x, p, poly[0]); 132*f3087befSAndrew Turner } 133*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly) 134*f3087befSAndrew Turner { 135*f3087befSAndrew Turner VTYPE p = FMA (poly[3], x, poly[2]); 136*f3087befSAndrew Turner p = FMA (x, p, poly[1]); 137*f3087befSAndrew Turner p = FMA (x, p, poly[0]); 138*f3087befSAndrew Turner return p; 139*f3087befSAndrew Turner } 140*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly) 141*f3087befSAndrew Turner { 142*f3087befSAndrew Turner VTYPE p = FMA (poly[4], x, poly[3]); 143*f3087befSAndrew Turner p = FMA (x, p, poly[2]); 144*f3087befSAndrew Turner p = FMA (x, p, poly[1]); 145*f3087befSAndrew Turner p = FMA (x, p, poly[0]); 146*f3087befSAndrew Turner return p; 147*f3087befSAndrew Turner } 148*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly) 149*f3087befSAndrew Turner { 150*f3087befSAndrew Turner return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]); 151*f3087befSAndrew Turner } 152*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly) 153*f3087befSAndrew Turner { 154*f3087befSAndrew Turner return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]); 155*f3087befSAndrew Turner } 156*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly) 157*f3087befSAndrew Turner { 158*f3087befSAndrew Turner return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]); 159*f3087befSAndrew Turner } 160*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly) 161*f3087befSAndrew Turner { 162*f3087befSAndrew Turner return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]); 163*f3087befSAndrew Turner } 164*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly) 165*f3087befSAndrew Turner { 166*f3087befSAndrew Turner return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]); 167*f3087befSAndrew Turner } 168*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly) 169*f3087befSAndrew Turner { 170*f3087befSAndrew Turner return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]); 171*f3087befSAndrew Turner } 172*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly) 173*f3087befSAndrew Turner { 174*f3087befSAndrew Turner return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]); 175*f3087befSAndrew Turner } 176*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly) 177*f3087befSAndrew Turner { 178*f3087befSAndrew Turner return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]); 179*f3087befSAndrew Turner } 180*f3087befSAndrew Turner 181*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly) 182*f3087befSAndrew Turner { 183*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 184*f3087befSAndrew Turner VTYPE p23 = FMA (poly[3], x, poly[2]); 185*f3087befSAndrew Turner VTYPE p; 186*f3087befSAndrew Turner p = FMA (x2, poly[4], p23); 187*f3087befSAndrew Turner p = FMA (x2, p, p01); 188*f3087befSAndrew Turner return p; 189*f3087befSAndrew Turner } 190*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly) 191*f3087befSAndrew Turner { 192*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 193*f3087befSAndrew Turner VTYPE p23 = FMA (poly[3], x, poly[2]); 194*f3087befSAndrew Turner VTYPE p45 = FMA (poly[5], x, poly[4]); 195*f3087befSAndrew Turner VTYPE p; 196*f3087befSAndrew Turner p = FMA (x2, p45, p23); 197*f3087befSAndrew Turner p = FMA (x2, p, p01); 198*f3087befSAndrew Turner return p; 199*f3087befSAndrew Turner } 200*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly) 201*f3087befSAndrew Turner { 202*f3087befSAndrew Turner VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2); 203*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 204*f3087befSAndrew Turner return FMA (x2, p26, p01); 205*f3087befSAndrew Turner } 206*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly) 207*f3087befSAndrew Turner { 208*f3087befSAndrew Turner VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2); 209*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 210*f3087befSAndrew Turner return FMA (x2, p27, p01); 211*f3087befSAndrew Turner } 212*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly) 213*f3087befSAndrew Turner { 214*f3087befSAndrew Turner VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2); 215*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 216*f3087befSAndrew Turner return FMA (x2, p28, p01); 217*f3087befSAndrew Turner } 218*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly) 219*f3087befSAndrew Turner { 220*f3087befSAndrew Turner VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2); 221*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 222*f3087befSAndrew Turner return FMA (x2, p29, p01); 223*f3087befSAndrew Turner } 224*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly) 225*f3087befSAndrew Turner { 226*f3087befSAndrew Turner VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2); 227*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 228*f3087befSAndrew Turner return FMA (x2, p2_10, p01); 229*f3087befSAndrew Turner } 230*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly) 231*f3087befSAndrew Turner { 232*f3087befSAndrew Turner VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2); 233*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 234*f3087befSAndrew Turner return FMA (x2, p2_11, p01); 235*f3087befSAndrew Turner } 236*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly) 237*f3087befSAndrew Turner { 238*f3087befSAndrew Turner VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2); 239*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 240*f3087befSAndrew Turner return FMA (x2, p2_12, p01); 241*f3087befSAndrew Turner } 242*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly) 243*f3087befSAndrew Turner { 244*f3087befSAndrew Turner VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2); 245*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 246*f3087befSAndrew Turner return FMA (x2, p2_13, p01); 247*f3087befSAndrew Turner } 248*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly) 249*f3087befSAndrew Turner { 250*f3087befSAndrew Turner VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2); 251*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 252*f3087befSAndrew Turner return FMA (x2, p2_14, p01); 253*f3087befSAndrew Turner } 254*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly) 255*f3087befSAndrew Turner { 256*f3087befSAndrew Turner VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2); 257*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 258*f3087befSAndrew Turner return FMA (x2, p2_15, p01); 259*f3087befSAndrew Turner } 260*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly) 261*f3087befSAndrew Turner { 262*f3087befSAndrew Turner VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2); 263*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 264*f3087befSAndrew Turner return FMA (x2, p2_16, p01); 265*f3087befSAndrew Turner } 266*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly) 267*f3087befSAndrew Turner { 268*f3087befSAndrew Turner VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2); 269*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 270*f3087befSAndrew Turner return FMA (x2, p2_17, p01); 271*f3087befSAndrew Turner } 272*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly) 273*f3087befSAndrew Turner { 274*f3087befSAndrew Turner VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2); 275*f3087befSAndrew Turner VTYPE p01 = FMA (poly[1], x, poly[0]); 276*f3087befSAndrew Turner return FMA (x2, p2_18, p01); 277*f3087befSAndrew Turner } 278