1*99a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
2*99a2dd95SBruce Richardson * Copyright(c) 2017 Intel Corporation
3*99a2dd95SBruce Richardson */
4*99a2dd95SBruce Richardson
5*99a2dd95SBruce Richardson #include <rte_mbuf.h>
6*99a2dd95SBruce Richardson #include "distributor_private.h"
7*99a2dd95SBruce Richardson #include "smmintrin.h"
8*99a2dd95SBruce Richardson
9*99a2dd95SBruce Richardson
10*99a2dd95SBruce Richardson void
find_match_vec(struct rte_distributor * d,uint16_t * data_ptr,uint16_t * output_ptr)11*99a2dd95SBruce Richardson find_match_vec(struct rte_distributor *d,
12*99a2dd95SBruce Richardson uint16_t *data_ptr,
13*99a2dd95SBruce Richardson uint16_t *output_ptr)
14*99a2dd95SBruce Richardson {
15*99a2dd95SBruce Richardson /* Setup */
16*99a2dd95SBruce Richardson __m128i incoming_fids;
17*99a2dd95SBruce Richardson __m128i inflight_fids;
18*99a2dd95SBruce Richardson __m128i preflight_fids;
19*99a2dd95SBruce Richardson __m128i wkr;
20*99a2dd95SBruce Richardson __m128i mask1;
21*99a2dd95SBruce Richardson __m128i mask2;
22*99a2dd95SBruce Richardson __m128i output;
23*99a2dd95SBruce Richardson struct rte_distributor_backlog *bl;
24*99a2dd95SBruce Richardson uint16_t i;
25*99a2dd95SBruce Richardson
26*99a2dd95SBruce Richardson /*
27*99a2dd95SBruce Richardson * Function overview:
28*99a2dd95SBruce Richardson * 2. Loop through all worker ID's
29*99a2dd95SBruce Richardson * 2a. Load the current inflights for that worker into an xmm reg
30*99a2dd95SBruce Richardson * 2b. Load the current backlog for that worker into an xmm reg
31*99a2dd95SBruce Richardson * 2c. use cmpestrm to intersect flow_ids with backlog and inflights
32*99a2dd95SBruce Richardson * 2d. Add any matches to the output
33*99a2dd95SBruce Richardson * 3. Write the output xmm (matching worker ids).
34*99a2dd95SBruce Richardson */
35*99a2dd95SBruce Richardson
36*99a2dd95SBruce Richardson
37*99a2dd95SBruce Richardson output = _mm_set1_epi16(0);
38*99a2dd95SBruce Richardson incoming_fids = _mm_load_si128((__m128i *)data_ptr);
39*99a2dd95SBruce Richardson
40*99a2dd95SBruce Richardson for (i = 0; i < d->num_workers; i++) {
41*99a2dd95SBruce Richardson bl = &d->backlog[i];
42*99a2dd95SBruce Richardson
43*99a2dd95SBruce Richardson inflight_fids =
44*99a2dd95SBruce Richardson _mm_load_si128((__m128i *)&(d->in_flight_tags[i]));
45*99a2dd95SBruce Richardson preflight_fids =
46*99a2dd95SBruce Richardson _mm_load_si128((__m128i *)(bl->tags));
47*99a2dd95SBruce Richardson
48*99a2dd95SBruce Richardson /*
49*99a2dd95SBruce Richardson * Any incoming_fid that exists anywhere in inflight_fids will
50*99a2dd95SBruce Richardson * have 0xffff in same position of the mask as the incoming fid
51*99a2dd95SBruce Richardson * Example (shortened to bytes for brevity):
52*99a2dd95SBruce Richardson * incoming_fids 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08
53*99a2dd95SBruce Richardson * inflight_fids 0x03 0x05 0x07 0x00 0x00 0x00 0x00 0x00
54*99a2dd95SBruce Richardson * mask 0x00 0x00 0xff 0x00 0xff 0x00 0xff 0x00
55*99a2dd95SBruce Richardson */
56*99a2dd95SBruce Richardson
57*99a2dd95SBruce Richardson mask1 = _mm_cmpestrm(inflight_fids, 8, incoming_fids, 8,
58*99a2dd95SBruce Richardson _SIDD_UWORD_OPS |
59*99a2dd95SBruce Richardson _SIDD_CMP_EQUAL_ANY |
60*99a2dd95SBruce Richardson _SIDD_UNIT_MASK);
61*99a2dd95SBruce Richardson mask2 = _mm_cmpestrm(preflight_fids, 8, incoming_fids, 8,
62*99a2dd95SBruce Richardson _SIDD_UWORD_OPS |
63*99a2dd95SBruce Richardson _SIDD_CMP_EQUAL_ANY |
64*99a2dd95SBruce Richardson _SIDD_UNIT_MASK);
65*99a2dd95SBruce Richardson
66*99a2dd95SBruce Richardson mask1 = _mm_or_si128(mask1, mask2);
67*99a2dd95SBruce Richardson /*
68*99a2dd95SBruce Richardson * Now mask contains 0xffff where there's a match.
69*99a2dd95SBruce Richardson * Next we need to store the worker_id in the relevant position
70*99a2dd95SBruce Richardson * in the output.
71*99a2dd95SBruce Richardson */
72*99a2dd95SBruce Richardson
73*99a2dd95SBruce Richardson wkr = _mm_set1_epi16(i+1);
74*99a2dd95SBruce Richardson mask1 = _mm_and_si128(mask1, wkr);
75*99a2dd95SBruce Richardson output = _mm_or_si128(mask1, output);
76*99a2dd95SBruce Richardson }
77*99a2dd95SBruce Richardson
78*99a2dd95SBruce Richardson /*
79*99a2dd95SBruce Richardson * At this stage, the output 128-bit contains 8 16-bit values, with
80*99a2dd95SBruce Richardson * each non-zero value containing the worker ID on which the
81*99a2dd95SBruce Richardson * corresponding flow is pinned to.
82*99a2dd95SBruce Richardson */
83*99a2dd95SBruce Richardson _mm_store_si128((__m128i *)output_ptr, output);
84*99a2dd95SBruce Richardson }
85