/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 512-bit registers (zmm) and intrinsics.
 * So our main SIMD type is 512 bits wide, and each such variable can
 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
 */
#define _T_simd		__m512i
#define _T_mask		__mmask16

/* Naming convention for static const variables. */
#define _SC_(x)		zmm_##x
#define _SV_(x)		(zmm_##x.z)

/* Naming convention for internal functions. */
#define _F_(x)		x##_avx512x16

/*
 * The same intrinsics have different names (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x)	_mm512_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x)	_mm512_##x##_si512

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x)	_mm512_##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale)	_mm512_##name(idx, base, scale)
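
/*
 * For example, _M_SI_(xor) expands here to _mm512_xor_si512(), while the
 * narrower (256-bit) flavour of this header is expected to map the same
 * macro to its _mm256_ counterpart.
 */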

/* num/mask of transitions per SIMD register */
#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD register */
#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
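
/*
 * For 512-bit registers the values above work out to 16 transitions
 * (mask bits) and 8 pointers per register, i.e. up to 2 * 16 == 32
 * flows handled per search run.
 */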

static const __rte_x86_zmm_t _SC_(match_mask) = {
	.u32 = {
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH,
	},
};

static const __rte_x86_zmm_t _SC_(index_mask) = {
	.u32 = {
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX,
	},
};

static const __rte_x86_zmm_t _SC_(trlo_idle) = {
	.u32 = {
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
		RTE_ACL_IDLE_NODE,
	},
};

static const __rte_x86_zmm_t _SC_(trhi_idle) = {
	.u32 = {
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
		0, 0, 0, 0,
	},
};

static const __rte_x86_zmm_t _SC_(shuffle_input) = {
	.u32 = {
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

static const __rte_x86_zmm_t _SC_(four_32) = {
	.u32 = {
		4, 4, 4, 4,
		4, 4, 4, 4,
		4, 4, 4, 4,
		4, 4, 4, 4,
	},
};

static const __rte_x86_zmm_t _SC_(idx_add) = {
	.u32 = {
		0, 1, 2, 3,
		4, 5, 6, 7,
		8, 9, 10, 11,
		12, 13, 14, 15,
	},
};

static const __rte_x86_zmm_t _SC_(range_base) = {
	.u32 = {
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

static const __rte_x86_zmm_t _SC_(pminp) = {
	.u32 = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	},
};

static const _T_mask _SC_(pmidx_msk) = 0x5555;

static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
	[0] = {
		.u32 = {
			0, 0, 1, 0, 2, 0, 3, 0,
			4, 0, 5, 0, 6, 0, 7, 0,
		},
	},
	[1] = {
		.u32 = {
			8, 0, 9, 0, 10, 0, 11, 0,
			12, 0, 13, 0, 14, 0, 15, 0,
		},
	},
};
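
/*
 * Note: interpreted as 64-bit lanes, pmidx[0]/pmidx[1] hold the values
 * 0..7 and 8..15 respectively (index in the low dword, zero in the high
 * dword), and pmidx_msk (0x5555) selects exactly those low dwords.
 * They are presumably used by the common code to expand per-flow 32-bit
 * indexes into 64-bit lanes.
 */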

/*
 * Unfortunately the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities, so we have to mimic it in SW by doing
 * 8x1B scalar loads.
 */
static inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
	rte_ymm_t v;
	__rte_x86_zmm_t p;

	static const uint32_t zero;

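	/*
	 * Replace the pointers of lanes not selected by the mask with the
	 * address of a dummy zero value, so the scalar loads below are
	 * always safe to perform.
	 */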
	p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
		(uintptr_t)&zero);

	v.u32[0] = *(uint8_t *)p.u64[0];
	v.u32[1] = *(uint8_t *)p.u64[1];
	v.u32[2] = *(uint8_t *)p.u64[2];
	v.u32[3] = *(uint8_t *)p.u64[3];
	v.u32[4] = *(uint8_t *)p.u64[4];
	v.u32[5] = *(uint8_t *)p.u64[5];
	v.u32[6] = *(uint8_t *)p.u64[6];
	v.u32[7] = *(uint8_t *)p.u64[7];

	return v.y;
}

/*
 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
 */
static __rte_always_inline __m512i
_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
	uint32_t bnum)
{
	__m256i inp[2];

	if (bnum == sizeof(uint8_t)) {
		inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
		inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
	} else {
		inp[0] = _mm512_mask_i64gather_epi32(
				_mm512_castsi512_si256(zero),
				m[0], p[0], NULL, sizeof(uint8_t));
		inp[1] = _mm512_mask_i64gather_epi32(
				_mm512_castsi512_si256(zero),
				m[1], p[1], NULL, sizeof(uint8_t));
	}

	/* squeeze input into one 512-bit register */
	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
			_SV_(pminp), _mm512_castsi256_si512(inp[1]));
}

/*
 * Resolve matches for multiple categories
 * (more than 8, use 512-bit instructions/regs).
 */
static inline void
resolve_mcgt8_avx512x1(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
{
	const int32_t *pri;
	const uint32_t *pm, *res;
	uint32_t i, k, mi;
	__mmask16 cm, sm;
	__m512i cp, cr, np, nr;

	res = pr->results;
	pri = pr->priority;

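	/* mask of active categories, used for the masked loads/stores below */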
	cm = (1 << nb_cat) - 1;

	for (k = 0; k != nb_pkt; k++, result += nb_cat) {

		mi = match[k] << ACL_MATCH_LOG;

		cr = _mm512_maskz_loadu_epi32(cm, res + mi);
		cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

		for (i = 1, pm = match + nb_pkt; i != nb_trie;
				i++, pm += nb_pkt) {

			mi = pm[k] << ACL_MATCH_LOG;

			nr = _mm512_maskz_loadu_epi32(cm, res + mi);
			np = _mm512_maskz_loadu_epi32(cm, pri + mi);

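			/* per category, keep the result with the higher priority */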
			sm = _mm512_cmpgt_epi32_mask(cp, np);
			cr = _mm512_mask_mov_epi32(nr, sm, cr);
			cp = _mm512_mask_mov_epi32(np, sm, cp);
		}

		_mm512_mask_storeu_epi32(result, cm, cr);
	}
}

#include "acl_run_avx512_common.h"

/*
 * Perform search for up to (2 * 16) flows in parallel.
 * Use two sets of metadata, each serves 16 flows max.
 */
static inline int
search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t i, *pm;
	const struct rte_acl_match_results *pr;
	struct acl_flow_avx512 flow;
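	/* one match index per {trie, packet} pair */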
	uint32_t match[ctx->num_tries * total_packets];

	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

		/* setup for next trie */
		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

		/* process the trie */
		_F_(search_trie)(&flow);
	}

	/* resolve matches */
	pr = (const struct rte_acl_match_results *)
		(ctx->trans_table + ctx->match_index);

	if (categories == 1)
		_F_(resolve_single_cat)(results, pr, match, total_packets,
			ctx->num_tries);
	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
		resolve_mcle8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);
	else
		resolve_mcgt8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);

	return 0;
}

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd