xref: /dpdk/lib/fib/dir24_8_avx512.c (revision e194f3cd5685d5b16c8561a715395a5f579c1bf3)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #include <rte_vect.h>
699a2dd95SBruce Richardson #include <rte_fib.h>
799a2dd95SBruce Richardson 
899a2dd95SBruce Richardson #include "dir24_8.h"
999a2dd95SBruce Richardson #include "dir24_8_avx512.h"
1099a2dd95SBruce Richardson 
1199a2dd95SBruce Richardson static __rte_always_inline void
1299a2dd95SBruce Richardson dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
13*e194f3cdSVladimir Medvedkin 	uint64_t *next_hops, int size, bool be_addr)
1499a2dd95SBruce Richardson {
1599a2dd95SBruce Richardson 	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
1699a2dd95SBruce Richardson 	__mmask16 msk_ext;
1799a2dd95SBruce Richardson 	__mmask16 exp_msk = 0x5555;
1899a2dd95SBruce Richardson 	__m512i ip_vec, idxes, res, bytes;
1999a2dd95SBruce Richardson 	const __m512i zero = _mm512_set1_epi32(0);
2099a2dd95SBruce Richardson 	const __m512i lsb = _mm512_set1_epi32(1);
2199a2dd95SBruce Richardson 	const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
2299a2dd95SBruce Richardson 	__m512i tmp1, tmp2, res_msk;
2399a2dd95SBruce Richardson 	__m256i tmp256;
2499a2dd95SBruce Richardson 	/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
2599a2dd95SBruce Richardson 	if (size == sizeof(uint8_t))
2699a2dd95SBruce Richardson 		res_msk = _mm512_set1_epi32(UINT8_MAX);
2799a2dd95SBruce Richardson 	else if (size == sizeof(uint16_t))
2899a2dd95SBruce Richardson 		res_msk = _mm512_set1_epi32(UINT16_MAX);
2999a2dd95SBruce Richardson 
3099a2dd95SBruce Richardson 	ip_vec = _mm512_loadu_si512(ips);
31*e194f3cdSVladimir Medvedkin 	if (be_addr) {
32*e194f3cdSVladimir Medvedkin 		const __m512i bswap32 = _mm512_set_epi32(
33*e194f3cdSVladimir Medvedkin 			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
34*e194f3cdSVladimir Medvedkin 			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
35*e194f3cdSVladimir Medvedkin 			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
36*e194f3cdSVladimir Medvedkin 			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203
37*e194f3cdSVladimir Medvedkin 		);
38*e194f3cdSVladimir Medvedkin 		ip_vec = _mm512_shuffle_epi8(ip_vec, bswap32);
39*e194f3cdSVladimir Medvedkin 	}
40*e194f3cdSVladimir Medvedkin 
4199a2dd95SBruce Richardson 	/* mask 24 most significant bits */
4299a2dd95SBruce Richardson 	idxes = _mm512_srli_epi32(ip_vec, 8);
4399a2dd95SBruce Richardson 
4499a2dd95SBruce Richardson 	/**
4599a2dd95SBruce Richardson 	 * lookup in tbl24
4699a2dd95SBruce Richardson 	 * Put it inside branch to make compiler happy with -O0
4799a2dd95SBruce Richardson 	 */
4899a2dd95SBruce Richardson 	if (size == sizeof(uint8_t)) {
4999a2dd95SBruce Richardson 		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
5099a2dd95SBruce Richardson 		res = _mm512_and_epi32(res, res_msk);
5199a2dd95SBruce Richardson 	} else if (size == sizeof(uint16_t)) {
5299a2dd95SBruce Richardson 		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
5399a2dd95SBruce Richardson 		res = _mm512_and_epi32(res, res_msk);
5499a2dd95SBruce Richardson 	} else
5599a2dd95SBruce Richardson 		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
5699a2dd95SBruce Richardson 
5799a2dd95SBruce Richardson 	/* get extended entries indexes */
5899a2dd95SBruce Richardson 	msk_ext = _mm512_test_epi32_mask(res, lsb);
5999a2dd95SBruce Richardson 
6099a2dd95SBruce Richardson 	if (msk_ext != 0) {
6199a2dd95SBruce Richardson 		idxes = _mm512_srli_epi32(res, 1);
6299a2dd95SBruce Richardson 		idxes = _mm512_slli_epi32(idxes, 8);
6399a2dd95SBruce Richardson 		bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
6499a2dd95SBruce Richardson 		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
6599a2dd95SBruce Richardson 		if (size == sizeof(uint8_t)) {
6699a2dd95SBruce Richardson 			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
6799a2dd95SBruce Richardson 				idxes, (const int *)dp->tbl8, 1);
6899a2dd95SBruce Richardson 			idxes = _mm512_and_epi32(idxes, res_msk);
6999a2dd95SBruce Richardson 		} else if (size == sizeof(uint16_t)) {
7099a2dd95SBruce Richardson 			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
7199a2dd95SBruce Richardson 				idxes, (const int *)dp->tbl8, 2);
7299a2dd95SBruce Richardson 			idxes = _mm512_and_epi32(idxes, res_msk);
7399a2dd95SBruce Richardson 		} else
7499a2dd95SBruce Richardson 			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
7599a2dd95SBruce Richardson 				idxes, (const int *)dp->tbl8, 4);
7699a2dd95SBruce Richardson 
7799a2dd95SBruce Richardson 		res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
7899a2dd95SBruce Richardson 	}
7999a2dd95SBruce Richardson 
8099a2dd95SBruce Richardson 	res = _mm512_srli_epi32(res, 1);
8199a2dd95SBruce Richardson 	tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
8299a2dd95SBruce Richardson 	tmp256 = _mm512_extracti32x8_epi32(res, 1);
8399a2dd95SBruce Richardson 	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
8499a2dd95SBruce Richardson 		_mm512_castsi256_si512(tmp256));
8599a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops, tmp1);
8699a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops + 8, tmp2);
8799a2dd95SBruce Richardson }
8899a2dd95SBruce Richardson 
8999a2dd95SBruce Richardson static __rte_always_inline void
9099a2dd95SBruce Richardson dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
91*e194f3cdSVladimir Medvedkin 	uint64_t *next_hops, bool be_addr)
9299a2dd95SBruce Richardson {
9399a2dd95SBruce Richardson 	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
9499a2dd95SBruce Richardson 	const __m512i zero = _mm512_set1_epi32(0);
9599a2dd95SBruce Richardson 	const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
9699a2dd95SBruce Richardson 	const __m512i lsb = _mm512_set1_epi64(1);
9799a2dd95SBruce Richardson 	__m512i res, idxes, bytes;
9899a2dd95SBruce Richardson 	__m256i idxes_256, ip_vec;
9999a2dd95SBruce Richardson 	__mmask8 msk_ext;
10099a2dd95SBruce Richardson 
10199a2dd95SBruce Richardson 	ip_vec = _mm256_loadu_si256((const void *)ips);
102*e194f3cdSVladimir Medvedkin 	if (be_addr) {
103*e194f3cdSVladimir Medvedkin 		const __m256i bswap32 = _mm256_set_epi8(
104*e194f3cdSVladimir Medvedkin 			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3,
105*e194f3cdSVladimir Medvedkin 			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
106*e194f3cdSVladimir Medvedkin 		);
107*e194f3cdSVladimir Medvedkin 		ip_vec = _mm256_shuffle_epi8(ip_vec, bswap32);
108*e194f3cdSVladimir Medvedkin 	}
10999a2dd95SBruce Richardson 	/* mask 24 most significant bits */
11099a2dd95SBruce Richardson 	idxes_256 = _mm256_srli_epi32(ip_vec, 8);
11199a2dd95SBruce Richardson 
11299a2dd95SBruce Richardson 	/* lookup in tbl24 */
11399a2dd95SBruce Richardson 	res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
11499a2dd95SBruce Richardson 
11599a2dd95SBruce Richardson 	/* get extended entries indexes */
11699a2dd95SBruce Richardson 	msk_ext = _mm512_test_epi64_mask(res, lsb);
11799a2dd95SBruce Richardson 
11899a2dd95SBruce Richardson 	if (msk_ext != 0) {
11999a2dd95SBruce Richardson 		bytes = _mm512_cvtepi32_epi64(ip_vec);
12099a2dd95SBruce Richardson 		idxes = _mm512_srli_epi64(res, 1);
12199a2dd95SBruce Richardson 		idxes = _mm512_slli_epi64(idxes, 8);
12299a2dd95SBruce Richardson 		bytes = _mm512_and_epi64(bytes, lsbyte_msk);
12399a2dd95SBruce Richardson 		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
12499a2dd95SBruce Richardson 		idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
12599a2dd95SBruce Richardson 			(const void *)dp->tbl8, 8);
12699a2dd95SBruce Richardson 
12799a2dd95SBruce Richardson 		res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
12899a2dd95SBruce Richardson 	}
12999a2dd95SBruce Richardson 
13099a2dd95SBruce Richardson 	res = _mm512_srli_epi64(res, 1);
13199a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops, res);
13299a2dd95SBruce Richardson }
13399a2dd95SBruce Richardson 
/*
 * Generate rte_dir24_8_vec_lookup_bulk_<suffix>().
 *
 * The generated function hands every complete group of 16 addresses to
 * dir24_8_vec_lookup_x16() with sizeof(nh_type) as the entry width and
 * be_addr as the byte-reversal flag, then always passes the remaining
 * n % 16 addresses (possibly none) to dir24_8_lookup_bulk_<suffix>().
 */
#define DECLARE_VECTOR_FN(suffix, nh_type, be_addr) \
void \
rte_dir24_8_vec_lookup_bulk_##suffix(void *p, const uint32_t *ips, \
	uint64_t *next_hops, const unsigned int n) \
{ \
	const uint32_t vec_end = n - (n % 16); \
	uint32_t off; \
	for (off = 0; off < vec_end; off += 16) \
		dir24_8_vec_lookup_x16(p, ips + off, next_hops + off, \
			sizeof(nh_type), be_addr); \
	dir24_8_lookup_bulk_##suffix(p, ips + vec_end, next_hops + vec_end, \
		n - vec_end); \
}

DECLARE_VECTOR_FN(1b, uint8_t, false)
DECLARE_VECTOR_FN(1b_be, uint8_t, true)
DECLARE_VECTOR_FN(2b, uint16_t, false)
DECLARE_VECTOR_FN(2b_be, uint16_t, true)
DECLARE_VECTOR_FN(4b, uint32_t, false)
DECLARE_VECTOR_FN(4b_be, uint32_t, true)
15299a2dd95SBruce Richardson 
/**
 * Bulk lookup for tables with 8-byte entries, addresses used as given.
 *
 * Every complete group of 8 addresses goes through
 * dir24_8_vec_lookup_x8_8b(); the remaining n % 8 addresses (possibly none)
 * are always handed to dir24_8_lookup_bulk_8b().
 *
 * @param p
 *   Table handle passed through to the lookup helpers.
 * @param ips
 *   Array of n addresses.
 * @param next_hops
 *   Array of n 64-bit results.
 * @param n
 *   Number of addresses to look up.
 */
void
rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	const uint32_t vec_end = n - (n % 8);
	uint32_t off;

	for (off = 0; off < vec_end; off += 8)
		dir24_8_vec_lookup_x8_8b(p, ips + off, next_hops + off, false);

	dir24_8_lookup_bulk_8b(p, ips + vec_end, next_hops + vec_end,
		n - vec_end);
}
162*e194f3cdSVladimir Medvedkin 
/**
 * Bulk lookup for tables with 8-byte entries, with the bytes of each
 * address reversed before the vector lookup.
 *
 * Every complete group of 8 addresses goes through
 * dir24_8_vec_lookup_x8_8b() with be_addr set; the remaining n % 8
 * addresses (possibly none) are always handed to
 * dir24_8_lookup_bulk_8b_be().
 *
 * @param p
 *   Table handle passed through to the lookup helpers.
 * @param ips
 *   Array of n addresses.
 * @param next_hops
 *   Array of n 64-bit results.
 * @param n
 *   Number of addresses to look up.
 */
void
rte_dir24_8_vec_lookup_bulk_8b_be(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	const uint32_t vec_end = n - (n % 8);
	uint32_t off;

	for (off = 0; off < vec_end; off += 8)
		dir24_8_vec_lookup_x8_8b(p, ips + off, next_hops + off, true);

	dir24_8_lookup_bulk_8b_be(p, ips + vec_end, next_hops + vec_end,
		n - vec_end);
}
172