xref: /dpdk/lib/distributor/rte_distributor_match_sse.c (revision 30a1de105a5f40d77b344a891c4a68f79e815c43)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */
4*99a2dd95SBruce Richardson 
5*99a2dd95SBruce Richardson #include <rte_mbuf.h>
6*99a2dd95SBruce Richardson #include "distributor_private.h"
7*99a2dd95SBruce Richardson #include "smmintrin.h"
8*99a2dd95SBruce Richardson 
9*99a2dd95SBruce Richardson 
10*99a2dd95SBruce Richardson void
find_match_vec(struct rte_distributor * d,uint16_t * data_ptr,uint16_t * output_ptr)11*99a2dd95SBruce Richardson find_match_vec(struct rte_distributor *d,
12*99a2dd95SBruce Richardson 			uint16_t *data_ptr,
13*99a2dd95SBruce Richardson 			uint16_t *output_ptr)
14*99a2dd95SBruce Richardson {
15*99a2dd95SBruce Richardson 	/* Setup */
16*99a2dd95SBruce Richardson 	__m128i incoming_fids;
17*99a2dd95SBruce Richardson 	__m128i inflight_fids;
18*99a2dd95SBruce Richardson 	__m128i preflight_fids;
19*99a2dd95SBruce Richardson 	__m128i wkr;
20*99a2dd95SBruce Richardson 	__m128i mask1;
21*99a2dd95SBruce Richardson 	__m128i mask2;
22*99a2dd95SBruce Richardson 	__m128i output;
23*99a2dd95SBruce Richardson 	struct rte_distributor_backlog *bl;
24*99a2dd95SBruce Richardson 	uint16_t i;
25*99a2dd95SBruce Richardson 
26*99a2dd95SBruce Richardson 	/*
27*99a2dd95SBruce Richardson 	 * Function overview:
28*99a2dd95SBruce Richardson 	 * 2. Loop through all worker ID's
29*99a2dd95SBruce Richardson 	 *  2a. Load the current inflights for that worker into an xmm reg
30*99a2dd95SBruce Richardson 	 *  2b. Load the current backlog for that worker into an xmm reg
31*99a2dd95SBruce Richardson 	 *  2c. use cmpestrm to intersect flow_ids with backlog and inflights
32*99a2dd95SBruce Richardson 	 *  2d. Add any matches to the output
33*99a2dd95SBruce Richardson 	 * 3. Write the output xmm (matching worker ids).
34*99a2dd95SBruce Richardson 	 */
35*99a2dd95SBruce Richardson 
36*99a2dd95SBruce Richardson 
37*99a2dd95SBruce Richardson 	output = _mm_set1_epi16(0);
38*99a2dd95SBruce Richardson 	incoming_fids = _mm_load_si128((__m128i *)data_ptr);
39*99a2dd95SBruce Richardson 
40*99a2dd95SBruce Richardson 	for (i = 0; i < d->num_workers; i++) {
41*99a2dd95SBruce Richardson 		bl = &d->backlog[i];
42*99a2dd95SBruce Richardson 
43*99a2dd95SBruce Richardson 		inflight_fids =
44*99a2dd95SBruce Richardson 			_mm_load_si128((__m128i *)&(d->in_flight_tags[i]));
45*99a2dd95SBruce Richardson 		preflight_fids =
46*99a2dd95SBruce Richardson 			_mm_load_si128((__m128i *)(bl->tags));
47*99a2dd95SBruce Richardson 
48*99a2dd95SBruce Richardson 		/*
49*99a2dd95SBruce Richardson 		 * Any incoming_fid that exists anywhere in inflight_fids will
50*99a2dd95SBruce Richardson 		 * have 0xffff in same position of the mask as the incoming fid
51*99a2dd95SBruce Richardson 		 * Example (shortened to bytes for brevity):
52*99a2dd95SBruce Richardson 		 * incoming_fids   0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08
53*99a2dd95SBruce Richardson 		 * inflight_fids   0x03 0x05 0x07 0x00 0x00 0x00 0x00 0x00
54*99a2dd95SBruce Richardson 		 * mask            0x00 0x00 0xff 0x00 0xff 0x00 0xff 0x00
55*99a2dd95SBruce Richardson 		 */
56*99a2dd95SBruce Richardson 
57*99a2dd95SBruce Richardson 		mask1 = _mm_cmpestrm(inflight_fids, 8, incoming_fids, 8,
58*99a2dd95SBruce Richardson 			_SIDD_UWORD_OPS |
59*99a2dd95SBruce Richardson 			_SIDD_CMP_EQUAL_ANY |
60*99a2dd95SBruce Richardson 			_SIDD_UNIT_MASK);
61*99a2dd95SBruce Richardson 		mask2 = _mm_cmpestrm(preflight_fids, 8, incoming_fids, 8,
62*99a2dd95SBruce Richardson 			_SIDD_UWORD_OPS |
63*99a2dd95SBruce Richardson 			_SIDD_CMP_EQUAL_ANY |
64*99a2dd95SBruce Richardson 			_SIDD_UNIT_MASK);
65*99a2dd95SBruce Richardson 
66*99a2dd95SBruce Richardson 		mask1 = _mm_or_si128(mask1, mask2);
67*99a2dd95SBruce Richardson 		/*
68*99a2dd95SBruce Richardson 		 * Now mask contains 0xffff where there's a match.
69*99a2dd95SBruce Richardson 		 * Next we need to store the worker_id in the relevant position
70*99a2dd95SBruce Richardson 		 * in the output.
71*99a2dd95SBruce Richardson 		 */
72*99a2dd95SBruce Richardson 
73*99a2dd95SBruce Richardson 		wkr = _mm_set1_epi16(i+1);
74*99a2dd95SBruce Richardson 		mask1 = _mm_and_si128(mask1, wkr);
75*99a2dd95SBruce Richardson 		output = _mm_or_si128(mask1, output);
76*99a2dd95SBruce Richardson 	}
77*99a2dd95SBruce Richardson 
78*99a2dd95SBruce Richardson 	/*
79*99a2dd95SBruce Richardson 	 * At this stage, the output 128-bit contains 8 16-bit values, with
80*99a2dd95SBruce Richardson 	 * each non-zero value containing the worker ID on which the
81*99a2dd95SBruce Richardson 	 * corresponding flow is pinned to.
82*99a2dd95SBruce Richardson 	 */
83*99a2dd95SBruce Richardson 	_mm_store_si128((__m128i *)output_ptr, output);
84*99a2dd95SBruce Richardson }
85