/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_vect.h>
#include <rte_fib.h>

#include "dir24_8.h"
#include "dir24_8_avx512.h"

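/*
 * Lookup 16 IPv4 addresses at once.
 * size is the width of a next hop entry in bytes (1, 2 or 4);
 * be_addr is true when the addresses are in network byte order.
 */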
static __rte_always_inline void
dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
	uint64_t *next_hops, int size, bool be_addr)
{
	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
	__mmask16 msk_ext;
	__mmask16 exp_msk = 0x5555;
	__m512i ip_vec, idxes, res, bytes;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
	__m512i tmp1, tmp2, res_msk;
	__m256i tmp256;
	/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
	if (size == sizeof(uint8_t))
		res_msk = _mm512_set1_epi32(UINT8_MAX);
	else if (size == sizeof(uint16_t))
		res_msk = _mm512_set1_epi32(UINT16_MAX);

	ip_vec = _mm512_loadu_si512(ips);
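	/* addresses are big-endian: byte-swap each 32-bit lane before indexing */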
	if (be_addr) {
		const __m512i bswap32 = _mm512_set_epi32(
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203
		);
		ip_vec = _mm512_shuffle_epi8(ip_vec, bswap32);
	}

	/* use the 24 most significant bits as the tbl24 index */
	idxes = _mm512_srli_epi32(ip_vec, 8);

	/**
	 * lookup in tbl24
	 * Put it inside a branch to make the compiler happy with -O0
	 */
	if (size == sizeof(uint8_t)) {
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
		res = _mm512_and_epi32(res, res_msk);
	} else if (size == sizeof(uint16_t)) {
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
		res = _mm512_and_epi32(res, res_msk);
	} else
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);

	/* find entries that point into tbl8 (lowest bit set) */
	msk_ext = _mm512_test_epi32_mask(res, lsb);

	if (msk_ext != 0) {
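		/* tbl8 index = group number (entry >> 1) * 256 + least significant address byte */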
		idxes = _mm512_srli_epi32(res, 1);
		idxes = _mm512_slli_epi32(idxes, 8);
		bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
		if (size == sizeof(uint8_t)) {
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 1);
			idxes = _mm512_and_epi32(idxes, res_msk);
		} else if (size == sizeof(uint16_t)) {
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 2);
			idxes = _mm512_and_epi32(idxes, res_msk);
		} else
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 4);

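		/* keep tbl24 results for non-extended entries, tbl8 results otherwise */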
		res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
	}

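	/* drop the extended-entry flag bit and widen each 32-bit result to a 64-bit next hop */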
	res = _mm512_srli_epi32(res, 1);
	tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
	tmp256 = _mm512_extracti32x8_epi32(res, 1);
	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
		_mm512_castsi256_si512(tmp256));
	_mm512_storeu_si512(next_hops, tmp1);
	_mm512_storeu_si512(next_hops + 8, tmp2);
}

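/*
 * Lookup 8 IPv4 addresses at once; next hop entries are 8 bytes wide,
 * so tbl24/tbl8 are gathered with 64-bit elements.
 */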
static __rte_always_inline void
dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, bool be_addr)
{
	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
	const __m512i lsb = _mm512_set1_epi64(1);
	__m512i res, idxes, bytes;
	__m256i idxes_256, ip_vec;
	__mmask8 msk_ext;

	ip_vec = _mm256_loadu_si256((const void *)ips);
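	/* addresses are big-endian: byte-swap each 32-bit lane before indexing */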
	if (be_addr) {
		const __m256i bswap32 = _mm256_set_epi8(
			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3,
			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
		);
		ip_vec = _mm256_shuffle_epi8(ip_vec, bswap32);
	}
	/* use the 24 most significant bits as the tbl24 index */
	idxes_256 = _mm256_srli_epi32(ip_vec, 8);

	/* lookup in tbl24 */
	res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);

	/* find entries that point into tbl8 (lowest bit set) */
	msk_ext = _mm512_test_epi64_mask(res, lsb);

	if (msk_ext != 0) {
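		/* tbl8 index = group number (entry >> 1) * 256 + least significant address byte */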
		bytes = _mm512_cvtepi32_epi64(ip_vec);
		idxes = _mm512_srli_epi64(res, 1);
		idxes = _mm512_slli_epi64(idxes, 8);
		bytes = _mm512_and_epi64(bytes, lsbyte_msk);
		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
		idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
			(const void *)dp->tbl8, 8);

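		/* keep tbl24 results for non-extended entries, tbl8 results otherwise */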
		res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
	}

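	/* drop the extended-entry flag bit from each 64-bit result */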
	res = _mm512_srli_epi64(res, 1);
	_mm512_storeu_si512(next_hops, res);
}

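/*
 * Generate a bulk lookup function: process addresses 16 at a time with the
 * vector path, then let the non-vector bulk lookup handle the remainder.
 */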
#define DECLARE_VECTOR_FN(suffix, nh_type, be_addr) \
void \
rte_dir24_8_vec_lookup_bulk_##suffix(void *p, const uint32_t *ips, uint64_t *next_hops, \
	const unsigned int n) \
{ \
	uint32_t i; \
	for (i = 0; i < (n / 16); i++) \
		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, sizeof(nh_type), \
			be_addr); \
	dir24_8_lookup_bulk_##suffix(p, ips + i * 16, next_hops + i * 16, n - i * 16); \
}

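/* 1/2/4-byte next hop variants, in host and network (be) address byte order */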
DECLARE_VECTOR_FN(1b, uint8_t, false)
DECLARE_VECTOR_FN(1b_be, uint8_t, true)
DECLARE_VECTOR_FN(2b, uint16_t, false)
DECLARE_VECTOR_FN(2b_be, uint16_t, true)
DECLARE_VECTOR_FN(4b, uint32_t, false)
DECLARE_VECTOR_FN(4b_be, uint32_t, true)

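/*
 * 8-byte next hops do not fit the 32-bit gather path used above,
 * so these two go through the dedicated x8 routine.
 */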
void
rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;
	for (i = 0; i < (n / 8); i++)
		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8, false);
	dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
}

void
rte_dir24_8_vec_lookup_bulk_8b_be(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;
	for (i = 0; i < (n / 8); i++)
		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8, true);
	dir24_8_lookup_bulk_8b_be(p, ips + i * 8, next_hops + i * 8, n - i * 8);
}
172