xref: /dpdk/lib/fib/trie_avx512.c (revision 6cb10a9bdb6d2d0253e4d022f230371d703d8ac2)
199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
299a2dd95SBruce Richardson  * Copyright(c) 2020 Intel Corporation
399a2dd95SBruce Richardson  */
499a2dd95SBruce Richardson 
599a2dd95SBruce Richardson #include <rte_vect.h>
699a2dd95SBruce Richardson #include <rte_fib6.h>
799a2dd95SBruce Richardson 
899a2dd95SBruce Richardson #include "trie.h"
999a2dd95SBruce Richardson #include "trie_avx512.h"
1099a2dd95SBruce Richardson 
1199a2dd95SBruce Richardson static __rte_always_inline void
12*6cb10a9bSRobin Jarry transpose_x16(const struct rte_ipv6_addr *ips,
1399a2dd95SBruce Richardson 	__m512i *first, __m512i *second, __m512i *third, __m512i *fourth)
1499a2dd95SBruce Richardson {
1599a2dd95SBruce Richardson 	__m512i tmp1, tmp2, tmp3, tmp4;
1699a2dd95SBruce Richardson 	__m512i tmp5, tmp6, tmp7, tmp8;
1799a2dd95SBruce Richardson 	const __rte_x86_zmm_t perm_idxes = {
1899a2dd95SBruce Richardson 		.u32 = { 0, 4, 8, 12, 2, 6, 10, 14,
1999a2dd95SBruce Richardson 			1, 5, 9, 13, 3, 7, 11, 15
2099a2dd95SBruce Richardson 		},
2199a2dd95SBruce Richardson 	};
2299a2dd95SBruce Richardson 
2399a2dd95SBruce Richardson 	/* load all ip addresses */
24*6cb10a9bSRobin Jarry 	tmp1 = _mm512_loadu_si512(&ips[0]);
25*6cb10a9bSRobin Jarry 	tmp2 = _mm512_loadu_si512(&ips[4]);
26*6cb10a9bSRobin Jarry 	tmp3 = _mm512_loadu_si512(&ips[8]);
27*6cb10a9bSRobin Jarry 	tmp4 = _mm512_loadu_si512(&ips[12]);
2899a2dd95SBruce Richardson 
2999a2dd95SBruce Richardson 	/* transpose 4 byte chunks of 16 ips */
3099a2dd95SBruce Richardson 	tmp5 = _mm512_unpacklo_epi32(tmp1, tmp2);
3199a2dd95SBruce Richardson 	tmp7 = _mm512_unpackhi_epi32(tmp1, tmp2);
3299a2dd95SBruce Richardson 	tmp6 = _mm512_unpacklo_epi32(tmp3, tmp4);
3399a2dd95SBruce Richardson 	tmp8 = _mm512_unpackhi_epi32(tmp3, tmp4);
3499a2dd95SBruce Richardson 
3599a2dd95SBruce Richardson 	tmp1 = _mm512_unpacklo_epi32(tmp5, tmp6);
3699a2dd95SBruce Richardson 	tmp3 = _mm512_unpackhi_epi32(tmp5, tmp6);
3799a2dd95SBruce Richardson 	tmp2 = _mm512_unpacklo_epi32(tmp7, tmp8);
3899a2dd95SBruce Richardson 	tmp4 = _mm512_unpackhi_epi32(tmp7, tmp8);
3999a2dd95SBruce Richardson 
4099a2dd95SBruce Richardson 	/* first 4-byte chunks of ips[] */
4199a2dd95SBruce Richardson 	*first = _mm512_permutexvar_epi32(perm_idxes.z, tmp1);
4299a2dd95SBruce Richardson 	/* second 4-byte chunks of ips[] */
4399a2dd95SBruce Richardson 	*second = _mm512_permutexvar_epi32(perm_idxes.z, tmp3);
4499a2dd95SBruce Richardson 	/* third 4-byte chunks of ips[] */
4599a2dd95SBruce Richardson 	*third = _mm512_permutexvar_epi32(perm_idxes.z, tmp2);
4699a2dd95SBruce Richardson 	/* fourth 4-byte chunks of ips[] */
4799a2dd95SBruce Richardson 	*fourth = _mm512_permutexvar_epi32(perm_idxes.z, tmp4);
4899a2dd95SBruce Richardson }
4999a2dd95SBruce Richardson 
5099a2dd95SBruce Richardson static __rte_always_inline void
51*6cb10a9bSRobin Jarry transpose_x8(const struct rte_ipv6_addr *ips,
5299a2dd95SBruce Richardson 	__m512i *first, __m512i *second)
5399a2dd95SBruce Richardson {
5499a2dd95SBruce Richardson 	__m512i tmp1, tmp2, tmp3, tmp4;
5599a2dd95SBruce Richardson 	const __rte_x86_zmm_t perm_idxes = {
5699a2dd95SBruce Richardson 		.u64 = { 0, 2, 4, 6, 1, 3, 5, 7
5799a2dd95SBruce Richardson 		},
5899a2dd95SBruce Richardson 	};
5999a2dd95SBruce Richardson 
60*6cb10a9bSRobin Jarry 	tmp1 = _mm512_loadu_si512(&ips[0]);
61*6cb10a9bSRobin Jarry 	tmp2 = _mm512_loadu_si512(&ips[4]);
6299a2dd95SBruce Richardson 
6399a2dd95SBruce Richardson 	tmp3 = _mm512_unpacklo_epi64(tmp1, tmp2);
6499a2dd95SBruce Richardson 	*first = _mm512_permutexvar_epi64(perm_idxes.z, tmp3);
6599a2dd95SBruce Richardson 	tmp4 = _mm512_unpackhi_epi64(tmp1, tmp2);
6699a2dd95SBruce Richardson 	*second = _mm512_permutexvar_epi64(perm_idxes.z, tmp4);
6799a2dd95SBruce Richardson }
6899a2dd95SBruce Richardson 
6999a2dd95SBruce Richardson static __rte_always_inline void
70*6cb10a9bSRobin Jarry trie_vec_lookup_x16x2(void *p, const struct rte_ipv6_addr *ips,
7199a2dd95SBruce Richardson 	uint64_t *next_hops, int size)
7299a2dd95SBruce Richardson {
7399a2dd95SBruce Richardson 	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
7499a2dd95SBruce Richardson 	const __m512i zero = _mm512_set1_epi32(0);
7599a2dd95SBruce Richardson 	const __m512i lsb = _mm512_set1_epi32(1);
7699a2dd95SBruce Richardson 	const __m512i two_lsb = _mm512_set1_epi32(3);
7799a2dd95SBruce Richardson 	/* IPv6 four byte chunks */
7899a2dd95SBruce Richardson 	__m512i first_1, second_1, third_1, fourth_1;
7999a2dd95SBruce Richardson 	__m512i first_2, second_2, third_2, fourth_2;
8099a2dd95SBruce Richardson 	__m512i idxes_1, res_1;
8199a2dd95SBruce Richardson 	__m512i idxes_2, res_2;
8299a2dd95SBruce Richardson 	__m512i shuf_idxes;
8399a2dd95SBruce Richardson 	__m512i tmp_1, tmp2_1, bytes_1, byte_chunk_1;
8499a2dd95SBruce Richardson 	__m512i tmp_2, tmp2_2, bytes_2, byte_chunk_2;
8599a2dd95SBruce Richardson 	__m512i base_idxes;
8699a2dd95SBruce Richardson 	/* used to mask gather values if size is 2 (16 bit next hops) */
8799a2dd95SBruce Richardson 	const __m512i res_msk = _mm512_set1_epi32(UINT16_MAX);
8899a2dd95SBruce Richardson 	const __rte_x86_zmm_t bswap = {
8999a2dd95SBruce Richardson 		.u8 = { 2, 1, 0, 255, 6, 5, 4, 255,
9099a2dd95SBruce Richardson 			10, 9, 8, 255, 14, 13, 12, 255,
9199a2dd95SBruce Richardson 			2, 1, 0, 255, 6, 5, 4, 255,
9299a2dd95SBruce Richardson 			10, 9, 8, 255, 14, 13, 12, 255,
9399a2dd95SBruce Richardson 			2, 1, 0, 255, 6, 5, 4, 255,
9499a2dd95SBruce Richardson 			10, 9, 8, 255, 14, 13, 12, 255,
9599a2dd95SBruce Richardson 			2, 1, 0, 255, 6, 5, 4, 255,
9699a2dd95SBruce Richardson 			10, 9, 8, 255, 14, 13, 12, 255
9799a2dd95SBruce Richardson 			},
9899a2dd95SBruce Richardson 	};
9999a2dd95SBruce Richardson 	const __mmask64 k = 0x1111111111111111;
10099a2dd95SBruce Richardson 	int i = 3;
10199a2dd95SBruce Richardson 	__mmask16 msk_ext_1, new_msk_1;
10299a2dd95SBruce Richardson 	__mmask16 msk_ext_2, new_msk_2;
10399a2dd95SBruce Richardson 	__mmask16 exp_msk = 0x5555;
10499a2dd95SBruce Richardson 
10599a2dd95SBruce Richardson 	transpose_x16(ips, &first_1, &second_1, &third_1, &fourth_1);
10699a2dd95SBruce Richardson 	transpose_x16(ips + 16, &first_2, &second_2, &third_2, &fourth_2);
10799a2dd95SBruce Richardson 
10899a2dd95SBruce Richardson 	/* get_tbl24_idx() for every 4 byte chunk */
10999a2dd95SBruce Richardson 	idxes_1 = _mm512_shuffle_epi8(first_1, bswap.z);
11099a2dd95SBruce Richardson 	idxes_2 = _mm512_shuffle_epi8(first_2, bswap.z);
11199a2dd95SBruce Richardson 
11299a2dd95SBruce Richardson 	/**
11399a2dd95SBruce Richardson 	 * lookup in tbl24
11499a2dd95SBruce Richardson 	 * Put it inside branch to make compiller happy with -O0
11599a2dd95SBruce Richardson 	 */
11699a2dd95SBruce Richardson 	if (size == sizeof(uint16_t)) {
11799a2dd95SBruce Richardson 		res_1 = _mm512_i32gather_epi32(idxes_1,
11899a2dd95SBruce Richardson 				(const int *)dp->tbl24, 2);
11999a2dd95SBruce Richardson 		res_2 = _mm512_i32gather_epi32(idxes_2,
12099a2dd95SBruce Richardson 				(const int *)dp->tbl24, 2);
12199a2dd95SBruce Richardson 		res_1 = _mm512_and_epi32(res_1, res_msk);
12299a2dd95SBruce Richardson 		res_2 = _mm512_and_epi32(res_2, res_msk);
12399a2dd95SBruce Richardson 	} else {
12499a2dd95SBruce Richardson 		res_1 = _mm512_i32gather_epi32(idxes_1,
12599a2dd95SBruce Richardson 				(const int *)dp->tbl24, 4);
12699a2dd95SBruce Richardson 		res_2 = _mm512_i32gather_epi32(idxes_2,
12799a2dd95SBruce Richardson 				(const int *)dp->tbl24, 4);
12899a2dd95SBruce Richardson 	}
12999a2dd95SBruce Richardson 
13099a2dd95SBruce Richardson 	/* get extended entries indexes */
13199a2dd95SBruce Richardson 	msk_ext_1 = _mm512_test_epi32_mask(res_1, lsb);
13299a2dd95SBruce Richardson 	msk_ext_2 = _mm512_test_epi32_mask(res_2, lsb);
13399a2dd95SBruce Richardson 
13499a2dd95SBruce Richardson 	tmp_1 = _mm512_srli_epi32(res_1, 1);
13599a2dd95SBruce Richardson 	tmp_2 = _mm512_srli_epi32(res_2, 1);
13699a2dd95SBruce Richardson 
13799a2dd95SBruce Richardson 	/* idxes to retrieve bytes */
13899a2dd95SBruce Richardson 	shuf_idxes = _mm512_setr_epi32(3, 7, 11, 15,
13999a2dd95SBruce Richardson 				19, 23, 27, 31,
14099a2dd95SBruce Richardson 				35, 39, 43, 47,
14199a2dd95SBruce Richardson 				51, 55, 59, 63);
14299a2dd95SBruce Richardson 
14399a2dd95SBruce Richardson 	base_idxes = _mm512_setr_epi32(0, 4, 8, 12,
14499a2dd95SBruce Richardson 				16, 20, 24, 28,
14599a2dd95SBruce Richardson 				32, 36, 40, 44,
14699a2dd95SBruce Richardson 				48, 52, 56, 60);
14799a2dd95SBruce Richardson 
14899a2dd95SBruce Richardson 	/* traverse down the trie */
14999a2dd95SBruce Richardson 	while (msk_ext_1 || msk_ext_2) {
15099a2dd95SBruce Richardson 		idxes_1 = _mm512_maskz_slli_epi32(msk_ext_1, tmp_1, 8);
15199a2dd95SBruce Richardson 		idxes_2 = _mm512_maskz_slli_epi32(msk_ext_2, tmp_2, 8);
15299a2dd95SBruce Richardson 		byte_chunk_1 = (i < 8) ?
15399a2dd95SBruce Richardson 			((i >= 4) ? second_1 : first_1) :
15499a2dd95SBruce Richardson 			((i >= 12) ? fourth_1 : third_1);
15599a2dd95SBruce Richardson 		byte_chunk_2 = (i < 8) ?
15699a2dd95SBruce Richardson 			((i >= 4) ? second_2 : first_2) :
15799a2dd95SBruce Richardson 			((i >= 12) ? fourth_2 : third_2);
15899a2dd95SBruce Richardson 		bytes_1 = _mm512_maskz_shuffle_epi8(k, byte_chunk_1,
15999a2dd95SBruce Richardson 				shuf_idxes);
16099a2dd95SBruce Richardson 		bytes_2 = _mm512_maskz_shuffle_epi8(k, byte_chunk_2,
16199a2dd95SBruce Richardson 				shuf_idxes);
16299a2dd95SBruce Richardson 		idxes_1 = _mm512_maskz_add_epi32(msk_ext_1, idxes_1, bytes_1);
16399a2dd95SBruce Richardson 		idxes_2 = _mm512_maskz_add_epi32(msk_ext_2, idxes_2, bytes_2);
16499a2dd95SBruce Richardson 		if (size == sizeof(uint16_t)) {
16599a2dd95SBruce Richardson 			tmp_1 = _mm512_mask_i32gather_epi32(zero, msk_ext_1,
16699a2dd95SBruce Richardson 				idxes_1, (const int *)dp->tbl8, 2);
16799a2dd95SBruce Richardson 			tmp_2 = _mm512_mask_i32gather_epi32(zero, msk_ext_2,
16899a2dd95SBruce Richardson 				idxes_2, (const int *)dp->tbl8, 2);
16999a2dd95SBruce Richardson 			tmp_1 = _mm512_and_epi32(tmp_1, res_msk);
17099a2dd95SBruce Richardson 			tmp_2 = _mm512_and_epi32(tmp_2, res_msk);
17199a2dd95SBruce Richardson 		} else {
17299a2dd95SBruce Richardson 			tmp_1 = _mm512_mask_i32gather_epi32(zero, msk_ext_1,
17399a2dd95SBruce Richardson 				idxes_1, (const int *)dp->tbl8, 4);
17499a2dd95SBruce Richardson 			tmp_2 = _mm512_mask_i32gather_epi32(zero, msk_ext_2,
17599a2dd95SBruce Richardson 				idxes_2, (const int *)dp->tbl8, 4);
17699a2dd95SBruce Richardson 		}
17799a2dd95SBruce Richardson 		new_msk_1 = _mm512_test_epi32_mask(tmp_1, lsb);
17899a2dd95SBruce Richardson 		new_msk_2 = _mm512_test_epi32_mask(tmp_2, lsb);
17999a2dd95SBruce Richardson 		res_1 = _mm512_mask_blend_epi32(msk_ext_1 ^ new_msk_1, res_1,
18099a2dd95SBruce Richardson 				tmp_1);
18199a2dd95SBruce Richardson 		res_2 = _mm512_mask_blend_epi32(msk_ext_2 ^ new_msk_2, res_2,
18299a2dd95SBruce Richardson 				tmp_2);
18399a2dd95SBruce Richardson 		tmp_1 = _mm512_srli_epi32(tmp_1, 1);
18499a2dd95SBruce Richardson 		tmp_2 = _mm512_srli_epi32(tmp_2, 1);
18599a2dd95SBruce Richardson 		msk_ext_1 = new_msk_1;
18699a2dd95SBruce Richardson 		msk_ext_2 = new_msk_2;
18799a2dd95SBruce Richardson 
18899a2dd95SBruce Richardson 		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
18999a2dd95SBruce Richardson 		shuf_idxes = _mm512_and_epi32(shuf_idxes, two_lsb);
19099a2dd95SBruce Richardson 		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
19199a2dd95SBruce Richardson 		i++;
19299a2dd95SBruce Richardson 	}
19399a2dd95SBruce Richardson 
19499a2dd95SBruce Richardson 	/* get rid of 1 LSB, now we have HN in every epi32 */
19599a2dd95SBruce Richardson 	res_1 = _mm512_srli_epi32(res_1, 1);
19699a2dd95SBruce Richardson 	res_2 = _mm512_srli_epi32(res_2, 1);
19799a2dd95SBruce Richardson 	/* extract first half of NH's each in epi64 chunk */
19899a2dd95SBruce Richardson 	tmp_1 = _mm512_maskz_expand_epi32(exp_msk, res_1);
19999a2dd95SBruce Richardson 	tmp_2 = _mm512_maskz_expand_epi32(exp_msk, res_2);
20099a2dd95SBruce Richardson 	/* extract second half of NH's */
20199a2dd95SBruce Richardson 	__m256i tmp256_1, tmp256_2;
20299a2dd95SBruce Richardson 	tmp256_1 = _mm512_extracti32x8_epi32(res_1, 1);
20399a2dd95SBruce Richardson 	tmp256_2 = _mm512_extracti32x8_epi32(res_2, 1);
20499a2dd95SBruce Richardson 	tmp2_1 = _mm512_maskz_expand_epi32(exp_msk,
20599a2dd95SBruce Richardson 		_mm512_castsi256_si512(tmp256_1));
20699a2dd95SBruce Richardson 	tmp2_2 = _mm512_maskz_expand_epi32(exp_msk,
20799a2dd95SBruce Richardson 		_mm512_castsi256_si512(tmp256_2));
20899a2dd95SBruce Richardson 	/* return NH's from two sets of registers */
20999a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops, tmp_1);
21099a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops + 8, tmp2_1);
21199a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops + 16, tmp_2);
21299a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops + 24, tmp2_2);
21399a2dd95SBruce Richardson }
21499a2dd95SBruce Richardson 
21599a2dd95SBruce Richardson static void
216*6cb10a9bSRobin Jarry trie_vec_lookup_x8x2_8b(void *p, const struct rte_ipv6_addr *ips,
21799a2dd95SBruce Richardson 	uint64_t *next_hops)
21899a2dd95SBruce Richardson {
21999a2dd95SBruce Richardson 	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
22099a2dd95SBruce Richardson 	const __m512i zero = _mm512_set1_epi32(0);
22199a2dd95SBruce Richardson 	const __m512i lsb = _mm512_set1_epi32(1);
22299a2dd95SBruce Richardson 	const __m512i three_lsb = _mm512_set1_epi32(7);
22399a2dd95SBruce Richardson 	/* IPv6 eight byte chunks */
22499a2dd95SBruce Richardson 	__m512i first_1, second_1;
22599a2dd95SBruce Richardson 	__m512i first_2, second_2;
22699a2dd95SBruce Richardson 	__m512i idxes_1, res_1;
22799a2dd95SBruce Richardson 	__m512i idxes_2, res_2;
22899a2dd95SBruce Richardson 	__m512i shuf_idxes, base_idxes;
22999a2dd95SBruce Richardson 	__m512i tmp_1, bytes_1, byte_chunk_1;
23099a2dd95SBruce Richardson 	__m512i tmp_2, bytes_2, byte_chunk_2;
23199a2dd95SBruce Richardson 	const __rte_x86_zmm_t bswap = {
23299a2dd95SBruce Richardson 		.u8 = { 2, 1, 0, 255, 255, 255, 255, 255,
23399a2dd95SBruce Richardson 			10, 9, 8, 255, 255, 255, 255, 255,
23499a2dd95SBruce Richardson 			2, 1, 0, 255, 255, 255, 255, 255,
23599a2dd95SBruce Richardson 			10, 9, 8, 255, 255, 255, 255, 255,
23699a2dd95SBruce Richardson 			2, 1, 0, 255, 255, 255, 255, 255,
23799a2dd95SBruce Richardson 			10, 9, 8, 255, 255, 255, 255, 255,
23899a2dd95SBruce Richardson 			2, 1, 0, 255, 255, 255, 255, 255,
23999a2dd95SBruce Richardson 			10, 9, 8, 255, 255, 255, 255, 255
24099a2dd95SBruce Richardson 			},
24199a2dd95SBruce Richardson 	};
24299a2dd95SBruce Richardson 	const __mmask64 k = 0x101010101010101;
24399a2dd95SBruce Richardson 	int i = 3;
24499a2dd95SBruce Richardson 	__mmask8 msk_ext_1, new_msk_1;
24599a2dd95SBruce Richardson 	__mmask8 msk_ext_2, new_msk_2;
24699a2dd95SBruce Richardson 
24799a2dd95SBruce Richardson 	transpose_x8(ips, &first_1, &second_1);
24899a2dd95SBruce Richardson 	transpose_x8(ips + 8, &first_2, &second_2);
24999a2dd95SBruce Richardson 
25099a2dd95SBruce Richardson 	/* get_tbl24_idx() for every 4 byte chunk */
25199a2dd95SBruce Richardson 	idxes_1 = _mm512_shuffle_epi8(first_1, bswap.z);
25299a2dd95SBruce Richardson 	idxes_2 = _mm512_shuffle_epi8(first_2, bswap.z);
25399a2dd95SBruce Richardson 
25499a2dd95SBruce Richardson 	/* lookup in tbl24 */
25599a2dd95SBruce Richardson 	res_1 = _mm512_i64gather_epi64(idxes_1, (const void *)dp->tbl24, 8);
25699a2dd95SBruce Richardson 	res_2 = _mm512_i64gather_epi64(idxes_2, (const void *)dp->tbl24, 8);
25799a2dd95SBruce Richardson 	/* get extended entries indexes */
25899a2dd95SBruce Richardson 	msk_ext_1 = _mm512_test_epi64_mask(res_1, lsb);
25999a2dd95SBruce Richardson 	msk_ext_2 = _mm512_test_epi64_mask(res_2, lsb);
26099a2dd95SBruce Richardson 
26199a2dd95SBruce Richardson 	tmp_1 = _mm512_srli_epi64(res_1, 1);
26299a2dd95SBruce Richardson 	tmp_2 = _mm512_srli_epi64(res_2, 1);
26399a2dd95SBruce Richardson 
26499a2dd95SBruce Richardson 	/* idxes to retrieve bytes */
26599a2dd95SBruce Richardson 	shuf_idxes = _mm512_setr_epi64(3, 11, 19, 27, 35, 43, 51, 59);
26699a2dd95SBruce Richardson 
26799a2dd95SBruce Richardson 	base_idxes = _mm512_setr_epi64(0, 8, 16, 24, 32, 40, 48, 56);
26899a2dd95SBruce Richardson 
26999a2dd95SBruce Richardson 	/* traverse down the trie */
27099a2dd95SBruce Richardson 	while (msk_ext_1 || msk_ext_2) {
27199a2dd95SBruce Richardson 		idxes_1 = _mm512_maskz_slli_epi64(msk_ext_1, tmp_1, 8);
27299a2dd95SBruce Richardson 		idxes_2 = _mm512_maskz_slli_epi64(msk_ext_2, tmp_2, 8);
27399a2dd95SBruce Richardson 		byte_chunk_1 = (i < 8) ? first_1 : second_1;
27499a2dd95SBruce Richardson 		byte_chunk_2 = (i < 8) ? first_2 : second_2;
27599a2dd95SBruce Richardson 		bytes_1 = _mm512_maskz_shuffle_epi8(k, byte_chunk_1,
27699a2dd95SBruce Richardson 				shuf_idxes);
27799a2dd95SBruce Richardson 		bytes_2 = _mm512_maskz_shuffle_epi8(k, byte_chunk_2,
27899a2dd95SBruce Richardson 				shuf_idxes);
27999a2dd95SBruce Richardson 		idxes_1 = _mm512_maskz_add_epi64(msk_ext_1, idxes_1, bytes_1);
28099a2dd95SBruce Richardson 		idxes_2 = _mm512_maskz_add_epi64(msk_ext_2, idxes_2, bytes_2);
28199a2dd95SBruce Richardson 		tmp_1 = _mm512_mask_i64gather_epi64(zero, msk_ext_1,
28299a2dd95SBruce Richardson 				idxes_1, (const void *)dp->tbl8, 8);
28399a2dd95SBruce Richardson 		tmp_2 = _mm512_mask_i64gather_epi64(zero, msk_ext_2,
28499a2dd95SBruce Richardson 				idxes_2, (const void *)dp->tbl8, 8);
28599a2dd95SBruce Richardson 		new_msk_1 = _mm512_test_epi64_mask(tmp_1, lsb);
28699a2dd95SBruce Richardson 		new_msk_2 = _mm512_test_epi64_mask(tmp_2, lsb);
28799a2dd95SBruce Richardson 		res_1 = _mm512_mask_blend_epi64(msk_ext_1 ^ new_msk_1, res_1,
28899a2dd95SBruce Richardson 				tmp_1);
28999a2dd95SBruce Richardson 		res_2 = _mm512_mask_blend_epi64(msk_ext_2 ^ new_msk_2, res_2,
29099a2dd95SBruce Richardson 				tmp_2);
29199a2dd95SBruce Richardson 		tmp_1 = _mm512_srli_epi64(tmp_1, 1);
29299a2dd95SBruce Richardson 		tmp_2 = _mm512_srli_epi64(tmp_2, 1);
29399a2dd95SBruce Richardson 		msk_ext_1 = new_msk_1;
29499a2dd95SBruce Richardson 		msk_ext_2 = new_msk_2;
29599a2dd95SBruce Richardson 
29699a2dd95SBruce Richardson 		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
29799a2dd95SBruce Richardson 		shuf_idxes = _mm512_and_epi64(shuf_idxes, three_lsb);
29899a2dd95SBruce Richardson 		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
29999a2dd95SBruce Richardson 		i++;
30099a2dd95SBruce Richardson 	}
30199a2dd95SBruce Richardson 
30299a2dd95SBruce Richardson 	res_1 = _mm512_srli_epi64(res_1, 1);
30399a2dd95SBruce Richardson 	res_2 = _mm512_srli_epi64(res_2, 1);
30499a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops, res_1);
30599a2dd95SBruce Richardson 	_mm512_storeu_si512(next_hops + 8, res_2);
30699a2dd95SBruce Richardson }
30799a2dd95SBruce Richardson 
30899a2dd95SBruce Richardson void
309*6cb10a9bSRobin Jarry rte_trie_vec_lookup_bulk_2b(void *p, const struct rte_ipv6_addr *ips,
31099a2dd95SBruce Richardson 	uint64_t *next_hops, const unsigned int n)
31199a2dd95SBruce Richardson {
31299a2dd95SBruce Richardson 	uint32_t i;
31399a2dd95SBruce Richardson 	for (i = 0; i < (n / 32); i++) {
314*6cb10a9bSRobin Jarry 		trie_vec_lookup_x16x2(p, &ips[i * 32],
31599a2dd95SBruce Richardson 				next_hops + i * 32, sizeof(uint16_t));
31699a2dd95SBruce Richardson 	}
317*6cb10a9bSRobin Jarry 	rte_trie_lookup_bulk_2b(p, &ips[i * 32],
31899a2dd95SBruce Richardson 			next_hops + i * 32, n - i * 32);
31999a2dd95SBruce Richardson }
32099a2dd95SBruce Richardson 
32199a2dd95SBruce Richardson void
322*6cb10a9bSRobin Jarry rte_trie_vec_lookup_bulk_4b(void *p, const struct rte_ipv6_addr *ips,
32399a2dd95SBruce Richardson 	uint64_t *next_hops, const unsigned int n)
32499a2dd95SBruce Richardson {
32599a2dd95SBruce Richardson 	uint32_t i;
32699a2dd95SBruce Richardson 	for (i = 0; i < (n / 32); i++) {
327*6cb10a9bSRobin Jarry 		trie_vec_lookup_x16x2(p, &ips[i * 32],
32899a2dd95SBruce Richardson 				next_hops + i * 32, sizeof(uint32_t));
32999a2dd95SBruce Richardson 	}
330*6cb10a9bSRobin Jarry 	rte_trie_lookup_bulk_4b(p, &ips[i * 32],
33199a2dd95SBruce Richardson 			next_hops + i * 32, n - i * 32);
33299a2dd95SBruce Richardson }
33399a2dd95SBruce Richardson 
33499a2dd95SBruce Richardson void
335*6cb10a9bSRobin Jarry rte_trie_vec_lookup_bulk_8b(void *p, const struct rte_ipv6_addr *ips,
33699a2dd95SBruce Richardson 	uint64_t *next_hops, const unsigned int n)
33799a2dd95SBruce Richardson {
33899a2dd95SBruce Richardson 	uint32_t i;
33999a2dd95SBruce Richardson 	for (i = 0; i < (n / 16); i++) {
340*6cb10a9bSRobin Jarry 		trie_vec_lookup_x8x2_8b(p, &ips[i * 16],
34199a2dd95SBruce Richardson 				next_hops + i * 16);
34299a2dd95SBruce Richardson 	}
343*6cb10a9bSRobin Jarry 	rte_trie_lookup_bulk_8b(p, &ips[i * 16],
34499a2dd95SBruce Richardson 			next_hops + i * 16, n - i * 16);
34599a2dd95SBruce Richardson }
346