/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#ifndef _DIST_PRIV_H_
#define _DIST_PRIV_H_

#include <stdalign.h>

/**
 * @file
 * RTE distributor
 *
 * The distributor is a component which is designed to pass packets
 * one-at-a-time to workers, with dynamic load balancing.
 */

#define NO_FLAGS 0
#define RTE_DISTRIB_PREFIX "DT_"

/*
 * We will use the bottom four bits of pointer for flags, shifting out
 * the top four bits to make room (since a 64-bit pointer actually only uses
 * 48 bits). An arithmetic-right-shift will then appropriately restore the
 * original pointer value with proper sign extension into the top bits.
 */
#define RTE_DISTRIB_FLAG_BITS 4
#define RTE_DISTRIB_FLAGS_MASK (0x0F)
#define RTE_DISTRIB_NO_BUF 0       /**< empty flags: no buffer requested */
#define RTE_DISTRIB_GET_BUF (1)    /**< worker requests a buffer, returns old */
#define RTE_DISTRIB_RETURN_BUF (2) /**< worker returns a buffer, no request */
#define RTE_DISTRIB_VALID_BUF (4)  /**< set if bufptr contains ptr */

/* Backlog ring size must stay a power of two so the mask below is valid. */
#define RTE_DISTRIB_BACKLOG_SIZE 8
#define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1)

/* Returns ring size must stay a power of two so the mask below is valid. */
#define RTE_DISTRIB_MAX_RETURNS 128
#define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1)

/**
 * Maximum number of workers allowed.
 * Be aware of increasing the limit, because it is limited by how we track
 * in-flight tags. See in_flight_bitmask and rte_distributor_process
 */
#define RTE_DISTRIB_MAX_WORKERS 64

#define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */

/**
 * Buffer structure used to pass the pointer data between cores. This is cache
 * line aligned, but to improve performance and prevent adjacent cache-line
 * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
 * the next cache line to worker 0, we pad this out to three cache lines.
 * Only 64-bits of the memory is actually used though.
 */
union __rte_cache_aligned rte_distributor_buffer_single {
	volatile RTE_ATOMIC(int64_t) bufptr64;  /**< 64-bit pointer + flag bits */
	char pad[RTE_CACHE_LINE_SIZE*3];        /**< pad to 3 cache lines */
};

/*
 * Transfer up to 8 mbufs at a time to/from workers, and
 * flow matching algorithm optimized for 8 flow IDs at a time
 */
#define RTE_DIST_BURST_SIZE 8

/** Per-worker backlog ring of packets queued for one worker core. */
struct __rte_cache_aligned rte_distributor_backlog {
	unsigned int start;  /**< read index into pkts[] */
	unsigned int count;  /**< number of valid entries in pkts[] */
	alignas(RTE_CACHE_LINE_SIZE) int64_t pkts[RTE_DIST_BURST_SIZE];
	uint16_t *tags; /* will point to second cacheline of inflights */
};

/** FIFO of mbufs handed back by workers, drained via the returns API. */
struct rte_distributor_returned_pkts {
	unsigned int start;  /**< read index into mbufs[] */
	unsigned int count;  /**< number of valid entries in mbufs[] */
	struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS];
};

/** Internal state of a single-packet-API distributor instance. */
struct rte_distributor_single {
	TAILQ_ENTRY(rte_distributor_single) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */

	uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS];
		/**< Tracks the tag being processed per core */
	uint64_t in_flight_bitmask;
		/**< on/off bits for in-flight tags.
		 * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then
		 * the bitmask has to expand.
		 */

	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS];
		/**< per-worker queued packets */

	union rte_distributor_buffer_single bufs[RTE_DISTRIB_MAX_WORKERS];
		/**< per-worker exchange slots */

	struct rte_distributor_returned_pkts returns;
		/**< packets handed back by workers */
};

/* All different signature compare functions */
enum rte_distributor_match_function {
	RTE_DIST_MATCH_SCALAR = 0,  /**< portable scalar tag matching */
	RTE_DIST_MATCH_VECTOR,      /**< vectorized tag matching */
	RTE_DIST_NUM_MATCH_FNS      /**< number of match functions, keep last */
};

/**
 * Buffer structure used to pass the pointer data between cores. This is cache
 * line aligned, but to improve performance and prevent adjacent cache-line
 * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
 * the next cache line to worker 0, we pad this out to two cache lines.
 * We can pass up to 8 mbufs at a time in one cacheline.
 * There is a separate cacheline for returns in the burst API.
 */
struct rte_distributor_buffer {
	volatile alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int64_t) bufptr64[RTE_DIST_BURST_SIZE];
		/* <= outgoing to worker */

	alignas(RTE_CACHE_LINE_SIZE) int64_t pad1;    /* <= one cache line */

	volatile alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int64_t) retptr64[RTE_DIST_BURST_SIZE];
		/* <= incoming from worker */

	alignas(RTE_CACHE_LINE_SIZE) int64_t pad2;    /* <= one cache line */

	alignas(RTE_CACHE_LINE_SIZE) int count;       /* <= number of current mbufs */
};

/** Internal state of a burst-API distributor instance. */
struct rte_distributor {
	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */
	unsigned int alg_type;                /**< Number of alg types */

	/**
	 * First cache line in this array are the tags inflight
	 * on the worker core. Second cache line are the backlog
	 * that are going to go to the worker core.
	 */
	alignas(RTE_CACHE_LINE_SIZE) uint16_t
		in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2];

	alignas(RTE_CACHE_LINE_SIZE) struct rte_distributor_backlog
		backlog[RTE_DISTRIB_MAX_WORKERS];
		/**< per-worker queued packets; backlog[i].tags points into
		 * the second half of in_flight_tags[i] (see tags comment above)
		 * -- NOTE(review): verify against the init code in distributor.c
		 */

	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];
		/**< per-worker burst exchange buffers */

	struct rte_distributor_returned_pkts returns;
		/**< packets handed back by workers */

	enum rte_distributor_match_function dist_match_fn;
		/**< which find_match_* implementation to use */

	struct rte_distributor_single *d_single;
		/**< backing single-packet distributor when alg_type selects the
		 * legacy single mode -- NOTE(review): confirm in create path
		 */

	uint8_t active[RTE_DISTRIB_MAX_WORKERS];
		/**< presumably nonzero per worker currently participating --
		 * verify against rte_distributor_process
		 */
	uint8_t activesum;  /**< presumably count of active workers -- verify */
};

/**
 * Scalar variant of the flow-signature compare step
 * (selected when dist_match_fn == RTE_DIST_MATCH_SCALAR).
 * Exact in/out contract of data_ptr/output_ptr is defined by the
 * implementation in the corresponding .c file.
 */
void
find_match_scalar(struct rte_distributor *d,
		uint16_t *data_ptr,
		uint16_t *output_ptr);

/**
 * Vectorized variant of the flow-signature compare step
 * (selected when dist_match_fn == RTE_DIST_MATCH_VECTOR).
 * Must produce results identical to find_match_scalar.
 */
void
find_match_vec(struct rte_distributor *d,
		uint16_t *data_ptr,
		uint16_t *output_ptr);

#endif /* _DIST_PRIV_H_ */