xref: /dpdk/lib/distributor/distributor_private.h (revision c6552d9a8deffa448de2d5e2e726f50508c1efd2)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2017 Intel Corporation
3  */
4 
5 #ifndef _DIST_PRIV_H_
6 #define _DIST_PRIV_H_
7 
8 #include <stdalign.h>
9 
10 /**
11  * @file
12  * RTE distributor
13  *
14  * The distributor is a component which is designed to pass packets
15  * one-at-a-time to workers, with dynamic load balancing.
16  */
17 
18 #define NO_FLAGS 0
19 #define RTE_DISTRIB_PREFIX "DT_"
20 
21 /*
22  * We will use the bottom four bits of pointer for flags, shifting out
23  * the top four bits to make room (since a 64-bit pointer actually only uses
24  * 48 bits). An arithmetic-right-shift will then appropriately restore the
25  * original pointer value with proper sign extension into the top bits.
26  */
27 #define RTE_DISTRIB_FLAG_BITS 4
28 #define RTE_DISTRIB_FLAGS_MASK (0x0F)
29 #define RTE_DISTRIB_NO_BUF 0       /**< empty flags: no buffer requested */
30 #define RTE_DISTRIB_GET_BUF (1)    /**< worker requests a buffer, returns old */
31 #define RTE_DISTRIB_RETURN_BUF (2) /**< worker returns a buffer, no request */
32 #define RTE_DISTRIB_VALID_BUF (4)  /**< set if bufptr contains ptr */
33 
34 #define RTE_DISTRIB_BACKLOG_SIZE 8
35 #define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1)
36 
37 #define RTE_DISTRIB_MAX_RETURNS 128
38 #define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1)
39 
/**
 * Maximum number of workers allowed.
 * Take care when increasing this limit: it is bounded by how in-flight
 * tags are tracked. See in_flight_bitmask and rte_distributor_process.
 */
45 #define RTE_DISTRIB_MAX_WORKERS 64
46 
47 #define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */
48 
/**
 * Buffer structure used to pass the pointer data between cores. This is cache
 * line aligned, but to improve performance and prevent adjacent cache-line
 * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
 * the next cache line to worker 0, we pad this out to three cache lines.
 * Only 64-bits of the memory is actually used though.
 */
union __rte_cache_aligned rte_distributor_buffer_single {
	/* Single request/response word exchanged with one worker: the mbuf
	 * pointer travels in the upper bits (shifted left by
	 * RTE_DISTRIB_FLAG_BITS) while the RTE_DISTRIB_*_BUF flags occupy
	 * the low bits — see the flag definitions above.
	 */
	volatile RTE_ATOMIC(int64_t) bufptr64;
	char pad[RTE_CACHE_LINE_SIZE*3]; /**< pad out to 3 cache lines */
};
60 
61 /*
62  * Transfer up to 8 mbufs at a time to/from workers, and
63  * flow matching algorithm optimized for 8 flow IDs at a time
64  */
65 #define RTE_DIST_BURST_SIZE 8
66 
/**
 * Per-worker backlog of packets waiting to be handed to that worker.
 * Treated as a small ring: entries are taken from 'start' and 'count'
 * tracks how many are queued (presumably wrapped with
 * RTE_DISTRIB_BACKLOG_MASK — confirm against the .c implementation).
 */
struct __rte_cache_aligned rte_distributor_backlog {
	unsigned int start; /**< ring index of the oldest queued entry */
	unsigned int count; /**< number of entries currently queued */
	/* Queued packets, stored as the same shifted-pointer+flags words
	 * used in the worker buffers.
	 */
	alignas(RTE_CACHE_LINE_SIZE) int64_t pkts[RTE_DIST_BURST_SIZE];
	uint16_t *tags; /* will point to second cacheline of inflights */
};
73 
74 
/**
 * FIFO of mbufs handed back by workers, held until the application
 * collects them. Indices presumably wrap with RTE_DISTRIB_RETURNS_MASK
 * — confirm against the .c implementation.
 */
struct rte_distributor_returned_pkts {
	unsigned int start; /**< ring index of the oldest stored mbuf */
	unsigned int count; /**< number of mbufs currently stored */
	struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS]; /**< ring storage */
};
80 
/**
 * Internal state of a distributor instance using the original
 * single-packet (non-burst) API.
 */
struct rte_distributor_single {
	TAILQ_ENTRY(rte_distributor_single) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */

	uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS];
		/**< Tracks the tag being processed per core */
	uint64_t in_flight_bitmask;
		/**< on/off bits for in-flight tags.
		 * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then
		 * the bitmask has to expand.
		 */

	/** Per-worker queue of packets awaiting dispatch. */
	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS];

	/** One cache-line-padded exchange word per worker. */
	union rte_distributor_buffer_single bufs[RTE_DISTRIB_MAX_WORKERS];

	/** Mbufs returned by workers, pending collection. */
	struct rte_distributor_returned_pkts returns;
};
101 
/* All different signature compare functions */
enum rte_distributor_match_function {
	RTE_DIST_MATCH_SCALAR = 0, /**< plain-C match (find_match_scalar) */
	RTE_DIST_MATCH_VECTOR,     /**< vectorized match (find_match_vec) */
	RTE_DIST_NUM_MATCH_FNS     /**< count sentinel, not a valid choice */
};
108 
/**
 * Buffer structure used to pass the pointer data between cores. This is cache
 * line aligned, but to improve performance and prevent adjacent cache-line
 * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
 * the next cache line to worker 0, we pad this out to two cache lines.
 * We can pass up to 8 mbufs at a time in one cacheline.
 * There is a separate cacheline for returns in the burst API.
 */
struct rte_distributor_buffer {
	/* Burst of up to RTE_DIST_BURST_SIZE shifted-pointer+flags words. */
	volatile alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int64_t) bufptr64[RTE_DIST_BURST_SIZE];
		/* <= outgoing to worker */

	alignas(RTE_CACHE_LINE_SIZE) int64_t pad1;    /* <= one cache line  */

	volatile alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int64_t) retptr64[RTE_DIST_BURST_SIZE];
		/* <= incoming from worker */

	alignas(RTE_CACHE_LINE_SIZE) int64_t pad2;    /* <= one cache line  */

	alignas(RTE_CACHE_LINE_SIZE) int count;       /* <= number of current mbufs */
};
130 
/**
 * Internal state of a distributor instance using the burst API.
 */
struct rte_distributor {
	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */
	unsigned int alg_type;                /**< Number of alg types */

	/**
	 * First cache line in this array are the tags inflight
	 * on the worker core. Second cache line are the backlog
	 * that are going to go to the worker core.
	 */
	alignas(RTE_CACHE_LINE_SIZE) uint16_t
		in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2];

	/** Per-worker queue of packets awaiting dispatch. */
	alignas(RTE_CACHE_LINE_SIZE) struct rte_distributor_backlog
		backlog[RTE_DISTRIB_MAX_WORKERS];

	/** Per-worker burst exchange buffers (out/return cache lines). */
	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];

	/** Mbufs returned by workers, pending collection. */
	struct rte_distributor_returned_pkts returns;

	/** Which tag-match implementation to use (scalar or vector). */
	enum rte_distributor_match_function dist_match_fn;

	/** Fallback single-packet distributor instance. */
	struct rte_distributor_single *d_single;

	uint8_t active[RTE_DISTRIB_MAX_WORKERS]; /**< per-worker active flag */
	uint8_t activesum;                       /**< count of active workers */
};
160 
/**
 * Scalar (plain C) implementation of the burst flow-tag match, selected
 * via RTE_DIST_MATCH_SCALAR.
 *
 * @param d
 *   Distributor instance holding per-worker in-flight and backlog tags.
 * @param data_ptr
 *   Tags of the incoming burst to match (presumably RTE_DIST_BURST_SIZE
 *   16-bit flow IDs — confirm against the caller in the .c file).
 * @param output_ptr
 *   Receives the per-packet match results.
 */
void
find_match_scalar(struct rte_distributor *d,
			uint16_t *data_ptr,
			uint16_t *output_ptr);
165 
/**
 * Vectorized implementation of the burst flow-tag match, selected via
 * RTE_DIST_MATCH_VECTOR. Must produce the same results as
 * find_match_scalar.
 *
 * @param d
 *   Distributor instance holding per-worker in-flight and backlog tags.
 * @param data_ptr
 *   Tags of the incoming burst to match (presumably RTE_DIST_BURST_SIZE
 *   16-bit flow IDs — confirm against the caller in the .c file).
 * @param output_ptr
 *   Receives the per-packet match results.
 */
void
find_match_vec(struct rte_distributor *d,
			uint16_t *data_ptr,
			uint16_t *output_ptr);
170 
171 #endif /* _DIST_PRIV_H_ */
172