1*99a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
2*99a2dd95SBruce Richardson * Copyright(c) 2017 Cavium, Inc
3*99a2dd95SBruce Richardson */
4*99a2dd95SBruce Richardson /*
5*99a2dd95SBruce Richardson * Reciprocal divide
6*99a2dd95SBruce Richardson *
7*99a2dd95SBruce Richardson * Used with permission from original authors
8*99a2dd95SBruce Richardson * Hannes Frederic Sowa and Daniel Borkmann
9*99a2dd95SBruce Richardson *
10*99a2dd95SBruce Richardson * This algorithm is based on the paper "Division by Invariant
11*99a2dd95SBruce Richardson * Integers Using Multiplication" by Torbjörn Granlund and Peter
12*99a2dd95SBruce Richardson * L. Montgomery.
13*99a2dd95SBruce Richardson *
14*99a2dd95SBruce Richardson * The assembler implementation from Agner Fog, which this code is
15*99a2dd95SBruce Richardson * based on, can be found here:
16*99a2dd95SBruce Richardson * http://www.agner.org/optimize/asmlib.zip
17*99a2dd95SBruce Richardson *
18*99a2dd95SBruce Richardson * This optimization for A/B is helpful if the divisor B is mostly
19*99a2dd95SBruce Richardson * runtime invariant. The reciprocal of B is calculated in the
20*99a2dd95SBruce Richardson * slow-path with reciprocal_value(). The fast-path can then just use
21*99a2dd95SBruce Richardson * a much faster multiplication operation with a variable dividend A
22*99a2dd95SBruce Richardson * to calculate the division A/B.
23*99a2dd95SBruce Richardson */
24*99a2dd95SBruce Richardson
25*99a2dd95SBruce Richardson #ifndef _RTE_RECIPROCAL_H_
26*99a2dd95SBruce Richardson #define _RTE_RECIPROCAL_H_
27*99a2dd95SBruce Richardson
28*99a2dd95SBruce Richardson #include <stdint.h>
29*99a2dd95SBruce Richardson
30*99a2dd95SBruce Richardson #include <rte_common.h>
31*99a2dd95SBruce Richardson
32*99a2dd95SBruce Richardson #ifdef __cplusplus
33*99a2dd95SBruce Richardson extern "C" {
34*99a2dd95SBruce Richardson #endif
35*99a2dd95SBruce Richardson
36*99a2dd95SBruce Richardson struct rte_reciprocal {
37*99a2dd95SBruce Richardson uint32_t m;
38*99a2dd95SBruce Richardson uint8_t sh1, sh2;
39*99a2dd95SBruce Richardson };
40*99a2dd95SBruce Richardson
41*99a2dd95SBruce Richardson struct rte_reciprocal_u64 {
42*99a2dd95SBruce Richardson uint64_t m;
43*99a2dd95SBruce Richardson uint8_t sh1, sh2;
44*99a2dd95SBruce Richardson };
45*99a2dd95SBruce Richardson
rte_reciprocal_divide(uint32_t a,struct rte_reciprocal R)46*99a2dd95SBruce Richardson static inline uint32_t rte_reciprocal_divide(uint32_t a, struct rte_reciprocal R)
47*99a2dd95SBruce Richardson {
48*99a2dd95SBruce Richardson uint32_t t = (uint32_t)(((uint64_t)a * R.m) >> 32);
49*99a2dd95SBruce Richardson
50*99a2dd95SBruce Richardson return (t + ((a - t) >> R.sh1)) >> R.sh2;
51*99a2dd95SBruce Richardson }
52*99a2dd95SBruce Richardson
53*99a2dd95SBruce Richardson static __rte_always_inline uint64_t
mullhi_u64(uint64_t x,uint64_t y)54*99a2dd95SBruce Richardson mullhi_u64(uint64_t x, uint64_t y)
55*99a2dd95SBruce Richardson {
56*99a2dd95SBruce Richardson #ifdef __SIZEOF_INT128__
57*99a2dd95SBruce Richardson __uint128_t xl = x;
58*99a2dd95SBruce Richardson __uint128_t rl = xl * y;
59*99a2dd95SBruce Richardson
60*99a2dd95SBruce Richardson return (rl >> 64);
61*99a2dd95SBruce Richardson #else
62*99a2dd95SBruce Richardson uint64_t u0, u1, v0, v1, k, t;
63*99a2dd95SBruce Richardson uint64_t w1, w2;
64*99a2dd95SBruce Richardson uint64_t whi;
65*99a2dd95SBruce Richardson
66*99a2dd95SBruce Richardson u1 = x >> 32; u0 = x & 0xFFFFFFFF;
67*99a2dd95SBruce Richardson v1 = y >> 32; v0 = y & 0xFFFFFFFF;
68*99a2dd95SBruce Richardson
69*99a2dd95SBruce Richardson t = u0*v0;
70*99a2dd95SBruce Richardson k = t >> 32;
71*99a2dd95SBruce Richardson
72*99a2dd95SBruce Richardson t = u1*v0 + k;
73*99a2dd95SBruce Richardson w1 = t & 0xFFFFFFFF;
74*99a2dd95SBruce Richardson w2 = t >> 32;
75*99a2dd95SBruce Richardson
76*99a2dd95SBruce Richardson t = u0*v1 + w1;
77*99a2dd95SBruce Richardson k = t >> 32;
78*99a2dd95SBruce Richardson
79*99a2dd95SBruce Richardson whi = u1*v1 + w2 + k;
80*99a2dd95SBruce Richardson
81*99a2dd95SBruce Richardson return whi;
82*99a2dd95SBruce Richardson #endif
83*99a2dd95SBruce Richardson }
84*99a2dd95SBruce Richardson
85*99a2dd95SBruce Richardson static __rte_always_inline uint64_t
rte_reciprocal_divide_u64(uint64_t a,const struct rte_reciprocal_u64 * R)86*99a2dd95SBruce Richardson rte_reciprocal_divide_u64(uint64_t a, const struct rte_reciprocal_u64 *R)
87*99a2dd95SBruce Richardson {
88*99a2dd95SBruce Richardson uint64_t t = mullhi_u64(a, R->m);
89*99a2dd95SBruce Richardson
90*99a2dd95SBruce Richardson return (t + ((a - t) >> R->sh1)) >> R->sh2;
91*99a2dd95SBruce Richardson }
92*99a2dd95SBruce Richardson
93*99a2dd95SBruce Richardson struct rte_reciprocal rte_reciprocal_value(uint32_t d);
94*99a2dd95SBruce Richardson struct rte_reciprocal_u64 rte_reciprocal_value_u64(uint64_t d);
95*99a2dd95SBruce Richardson
96*99a2dd95SBruce Richardson #ifdef __cplusplus
97*99a2dd95SBruce Richardson }
98*99a2dd95SBruce Richardson #endif
99*99a2dd95SBruce Richardson
100*99a2dd95SBruce Richardson #endif /* _RTE_RECIPROCAL_H_ */
101