/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Fill in buffer descriptors in a multi-packet send descriptor.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param dseg
 *   Pointer to buffer descriptor to be written.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param n
 *   Number of packets to be filled.
 */
static inline void
txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
	      struct rte_mbuf **pkts, unsigned int n)
{
	unsigned int pos;
	uintptr_t addr;
	const __m128i shuf_mask_dseg =
		_mm_set_epi8(8,  9, 10, 11, /* addr, bswap64 */
			    12, 13, 14, 15,
			     7,  6,  5,  4, /* lkey */
			     0,  1,  2,  3  /* length, bswap32 */);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t tx_byte = 0;
#endif

	for (pos = 0; pos < n; ++pos, ++dseg) {
		__m128i desc;
		struct rte_mbuf *pkt = pkts[pos];

		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
		desc = _mm_set_epi32(addr >> 32,
				     addr,
				     mlx5_tx_mb2mr(txq, pkt),
				     DATA_LEN(pkt));
		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
		_mm_store_si128(dseg, desc);
#ifdef MLX5_PMD_SOFT_COUNTERS
		tx_byte += DATA_LEN(pkt);
#endif
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.obytes += tx_byte;
#endif
}
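/*
 * Note: each 16B data segment written by txq_wr_dseg_v() is laid out as
 * byte_count (32-bit, big-endian), lkey (32-bit, already big-endian as
 * returned by mlx5_tx_mb2mr()) and addr (64-bit, big-endian). The single
 * shuffle above performs the 32-bit and 64-bit byte swaps in one pass
 * instead of issuing separate bswap instructions per field.
 */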

/**
 * Count the number of continuous single segment packets.
 *
 * @param pkts
 *   Pointer to array of packets.
 * @param pkts_n
 *   Number of packets.
 *
 * @return
 *   Number of continuous single segment packets.
 */
static inline unsigned int
txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n)
{
	unsigned int pos;

	if (!pkts_n)
		return 0;
	/* Count the number of continuous single segment packets. */
	for (pos = 0; pos < pkts_n; ++pos)
		if (NB_SEGS(pkts[pos]) > 1)
			break;
	return pos;
}

/**
 * Count the number of packets having the same ol_flags and calculate cs_flags.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets.
 * @param pkts_n
 *   Number of packets.
 * @param cs_flags
 *   Pointer to flags to be returned.
 *
 * @return
 *   Number of packets having the same ol_flags.
 */
static inline unsigned int
txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint8_t *cs_flags)
{
	unsigned int pos;
	const uint64_t ol_mask =
		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;

	if (!pkts_n)
		return 0;
	/* Count the number of packets having the same ol_flags. */
	for (pos = 1; pos < pkts_n; ++pos)
		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
			break;
	/* Should open another MPW session for the rest. */
	if (pkts[0]->ol_flags &
	    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
		const uint64_t is_tunneled =
			pkts[0]->ol_flags &
			(PKT_TX_TUNNEL_GRE |
			 PKT_TX_TUNNEL_VXLAN);

		if (is_tunneled && txq->tunnel_en) {
			*cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
				    MLX5_ETH_WQE_L4_INNER_CSUM;
			if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM)
				*cs_flags |= MLX5_ETH_WQE_L3_CSUM;
		} else {
			*cs_flags = MLX5_ETH_WQE_L3_CSUM |
				    MLX5_ETH_WQE_L4_CSUM;
		}
	}
	return pos;
}
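/*
 * Note: the cs_flags computed above select which checksums the NIC fills in.
 * For tunneled packets (and only when HW tunnel offload is enabled) the inner
 * L3/L4 checksum bits are set and the outer L3 bit is added only when
 * PKT_TX_OUTER_IP_CKSUM is requested; otherwise the plain L3/L4 bits are used.
 * The same mapping is repeated inline in txq_scatter_v() below.
 */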

/**
 * Send multi-segmented packets until a single segment packet is encountered
 * in the pkts list.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static uint16_t
txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
	      uint16_t pkts_n)
{
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n;
	volatile struct mlx5_wqe *wqe = NULL;

	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	if (unlikely(!pkts_n))
		return 0;
	for (n = 0; n < pkts_n; ++n) {
		struct rte_mbuf *buf = pkts[n];
		unsigned int segs_n = buf->nb_segs;
		unsigned int ds = nb_dword_in_hdr;
		unsigned int len = PKT_LEN(buf);
		uint16_t wqe_ci = txq->wqe_ci;
		const __m128i shuf_mask_ctrl =
			_mm_set_epi8(15, 14, 13, 12,
				      8,  9, 10, 11, /* bswap32 */
				      4,  5,  6,  7, /* bswap32 */
				      0,  1,  2,  3  /* bswap32 */);
		uint8_t cs_flags = 0;
		uint16_t max_elts;
		uint16_t max_wqe;
		__m128i *t_wqe, *dseg;
		__m128i ctrl;

		assert(segs_n);
		max_elts = elts_n - (elts_head - txq->elts_tail);
		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
		/*
		 * An MPW session consumes 2 WQEs at most to
		 * include MLX5_MPW_DSEG_MAX pointers.
		 */
		if (segs_n == 1 ||
		    max_elts < segs_n || max_wqe < 2)
			break;
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		wqe = &((volatile struct mlx5_wqe64 *)
			 txq->wqes)[wqe_ci & wq_mask].hdr;
		if (buf->ol_flags &
		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			const uint64_t is_tunneled = buf->ol_flags &
						     (PKT_TX_TUNNEL_GRE |
						      PKT_TX_TUNNEL_VXLAN);

			if (is_tunneled && txq->tunnel_en) {
				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
					   MLX5_ETH_WQE_L4_INNER_CSUM;
				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
			} else {
				cs_flags = MLX5_ETH_WQE_L3_CSUM |
					   MLX5_ETH_WQE_L4_CSUM;
			}
		}
		/* Title WQEBB pointer. */
		t_wqe = (__m128i *)wqe;
		dseg = (__m128i *)(wqe + 1);
		do {
			if (!(ds++ % nb_dword_per_wqebb)) {
				dseg = (__m128i *)
					&((volatile struct mlx5_wqe64 *)
					   txq->wqes)[++wqe_ci & wq_mask];
			}
			txq_wr_dseg_v(txq, dseg++, &buf, 1);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			buf = buf->next;
		} while (--segs_n);
		++wqe_ci;
		/* Fill CTRL in the header. */
		ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
				     MLX5_OPC_MOD_MPW << 24 |
				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
		_mm_store_si128(t_wqe, ctrl);
		/* Fill ESEG in the header. */
		_mm_store_si128(t_wqe + 1,
				_mm_set_epi16(0, 0, 0, 0,
					      rte_cpu_to_be_16(len), cs_flags,
					      0, 0));
		txq->wqe_ci = wqe_ci;
	}
	if (!n)
		return 0;
	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
	txq->elts_head = elts_head;
	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
		wqe->ctrl[2] = rte_cpu_to_be_32(8);
		wqe->ctrl[3] = txq->elts_head;
		txq->elts_comp = 0;
		++txq->cq_pi;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += n;
#endif
	mlx5_tx_dbrec(txq, wqe);
	return n;
}
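/*
 * Note on the completion logic above: a completion is requested at most once
 * per MLX5_TX_COMP_THRESH packets. Writing 8 into ctrl[2] sets the CQ_UPDATE
 * bit of the control segment (fm_ce_se) on the last WQE of the batch, and
 * ctrl[3] records elts_head so that the completion handler can later recover
 * how far the SW ring tail may be advanced.
 */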

/**
 * Send a burst of packets with Enhanced MPW. If a multi-segment packet is
 * encountered, it stops and returns so the packet can be processed by
 * txq_scatter_v(). All the packets in the pkts list must be single segment
 * packets having the same offload flags. This must be checked by
 * txq_check_multiseg() and txq_calc_offload().
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
 * @param cs_flags
 *   Checksum offload flags to be written in the descriptor.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static inline uint16_t
txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint8_t cs_flags)
{
	struct rte_mbuf **elts;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n = 0;
	unsigned int pos;
	uint16_t max_elts;
	uint16_t max_wqe;
	uint32_t comp_req = 0;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	uint16_t wq_idx = txq->wqe_ci & wq_mask;
	volatile struct mlx5_wqe64 *wq =
		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
	const __m128i shuf_mask_ctrl =
		_mm_set_epi8(15, 14, 13, 12,
			      8,  9, 10, 11, /* bswap32 */
			      4,  5,  6,  7, /* bswap32 */
			      0,  1,  2,  3  /* bswap32 */);
	__m128i *t_wqe, *dseg;
	__m128i ctrl;

	/* Make sure all packets can fit into a single WQE. */
	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
	if (unlikely(!pkts_n))
		return 0;
	elts = &(*txq->elts)[elts_head & elts_m];
	/* Loop for available tailroom first. */
	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
	for (pos = 0; pos < (n & -2); pos += 2)
		_mm_storeu_si128((__m128i *)&elts[pos],
				 _mm_loadu_si128((__m128i *)&pkts[pos]));
	if (n & 1)
		elts[pos] = pkts[pos];
	/* Check if it crosses the end of the queue. */
	if (unlikely(n < pkts_n)) {
		elts = &(*txq->elts)[0];
		for (pos = 0; pos < pkts_n - n; ++pos)
			elts[pos] = pkts[n + pos];
	}
	txq->elts_head += pkts_n;
	/* Save title WQEBB pointer. */
	t_wqe = (__m128i *)wqe;
	dseg = (__m128i *)(wqe + 1);
	/* Calculate the number of entries to the end. */
	n = RTE_MIN(
		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
		pkts_n);
	/* Fill DSEGs. */
	txq_wr_dseg_v(txq, dseg, pkts, n);
	/* Check if it crosses the end of the queue. */
	if (n < pkts_n) {
		dseg = (__m128i *)txq->wqes;
		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
	}
	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
		txq->elts_comp += pkts_n;
	} else {
		/* Request a completion. */
		txq->elts_comp = 0;
		++txq->cq_pi;
		comp_req = 8;
	}
	/* Fill CTRL in the header. */
	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
			     txq->qp_num_8s | (pkts_n + 2),
			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
			     txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
	_mm_store_si128(t_wqe, ctrl);
	/* Fill ESEG in the header. */
	_mm_store_si128(t_wqe + 1,
			_mm_set_epi8(0, 0, 0, 0,
				     0, 0, 0, 0,
				     0, 0, 0, cs_flags,
				     0, 0, 0, 0));
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += pkts_n;
#endif
	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
		       nb_dword_per_wqebb;
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, wqe);
	return pkts_n;
}
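/*
 * Note: in the Enhanced MPW WQE built above, each single-segment packet
 * consumes one 16B data segment while the control and Ethernet segments
 * take the first nb_dword_in_hdr slots, which is why the DS count in the
 * control segment is written as pkts_n + 2. The wqe_ci advance is a ceiling
 * division of the total number of 16B slots by the slots per 64B WQEBB
 * (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE).
 */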

/**
 * DPDK callback for vectorized TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
		      uint16_t pkts_n)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	uint16_t nb_tx = 0;

	while (pkts_n > nb_tx) {
		uint16_t n;
		uint16_t ret;

		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
		nb_tx += ret;
		if (!ret)
			break;
	}
	return nb_tx;
}

/**
 * DPDK callback for vectorized TX with multi-seg packets and offload.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	uint16_t nb_tx = 0;

	while (pkts_n > nb_tx) {
		uint8_t cs_flags = 0;
		uint16_t n;
		uint16_t ret;

		/* Transmit multi-seg packets in the head of pkts list. */
		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
		    NB_SEGS(pkts[nb_tx]) > 1)
			nb_tx += txq_scatter_v(txq,
					       &pkts[nb_tx],
					       pkts_n - nb_tx);
		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
			n = txq_check_multiseg(&pkts[nb_tx], n);
		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
			n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
		nb_tx += ret;
		if (!ret)
			break;
	}
	return nb_tx;
}
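/*
 * Note: the two callbacks above are what rte_eth_tx_burst() ends up calling
 * once the PMD selects the vectorized TX path. mlx5_tx_burst_raw_vec() skips
 * the multi-segment and offload checks entirely, while mlx5_tx_burst_vec()
 * splits the burst into homogeneous sub-bursts (single segment, same offload
 * flags) before handing each one to txq_burst_v().
 */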

/**
 * Copy mbufs from the RX SW ring to the array of packets to be returned.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param pkts
 *   Pointer to array of packets to be stored.
 * @param n
 *   Number of packets to be stored.
 */
static inline void
rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
	unsigned int pos;
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Replenish buffers for RX in bulk.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param n
 *   Number of buffers to be replenished.
 */
static inline void
rxq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
{
	const uint16_t q_n = 1 << rxq->elts_n;
	const uint16_t q_mask = q_n - 1;
	const uint16_t elts_idx = rxq->rq_ci & q_mask;
	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
	unsigned int i;

	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
	assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
	assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
		rxq->stats.rx_nombuf += n;
		return;
	}
	for (i = 0; i < n; ++i)
		wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr +
					      RTE_PKTMBUF_HEADROOM);
	rxq->rq_ci += n;
	rte_io_wmb();
	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
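/*
 * Note: in rxq_replenish_bulk_mbuf() the rte_io_wmb() guarantees the NIC
 * never observes the updated doorbell record (*rq_db) before the rewritten
 * buffer addresses in the RX WQEs. Only the address field is rewritten here,
 * presumably because byte_count and lkey were programmed at queue setup and
 * stay constant for mbufs coming from the same mempool.
 */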

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 */
static inline void
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq,
		    volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			     6,  7,         /* data_len, bswap16 */
			    -1, -1,  6,  7, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			    -1, -1,         /* skip vlan_tci */
			    14, 15,         /* data_len, bswap16 */
			    -1, -1, 14, 15, /* pkt_len, bswap16 */
			    -1, -1, -1, -1  /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * ETHER_CRC_LEN);
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0, 0);
	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif

	/*
	 * Not to overflow elts array. Decompress next time after mbuf
	 * replenishment.
	 */
	if (unlikely(mcqe_n + MLX5_VPMD_DESCS_PER_LOOP >
		     (uint16_t)(rxq->rq_ci - rxq->cq_ci)))
		return;
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!(pos & 0x7) && pos + 8 < mcqe_n)
			rte_prefetch0((void *)(cq + pos + 8));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		mcqe1 = _mm_srli_si128(mcqe1, 4);
		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			/* E.1 store flow tag (rte_flow mark). */
			elts[pos]->hash.fdir.hi = flow_tag;
			elts[pos + 1]->hash.fdir.hi = flow_tag;
			elts[pos + 2]->hash.fdir.hi = flow_tag;
			elts[pos + 3]->hash.fdir.hi = flow_tag;
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	rxq->cq_ci += mcqe_n;
}
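/*
 * Note on the decompression above: for a compressed session the packet count
 * is carried in the byte count field of the title CQE (restored into mcqe_n
 * through the pre-built title packet's data_len), and eight 8-byte mini CQEs
 * are packed into each following 64B CQE slot. This is why the mini CQE
 * pointer is rebased (mcq = cq + pos) and the consumed slots are invalidated
 * every eight entries.
 */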

/**
 * Calculate packet type and offload flags for mbuf and store them.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask =
		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
	const __m128i ptype_ol_mask =
		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
	const __m128i pinfo_mask =
		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask =
			_mm_set_epi32(0xffffff00, 0xffffff00,
				      0xffffff00, 0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		const __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero, then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		flow_tag = _mm_andnot_si128(invalid_mask, flow_tag);
		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Make pinfo hold the merged fields for ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)];
	pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)];
	pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)];
	pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)];
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Skip error packets.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
static uint16_t
rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	uint16_t n = 0;
	unsigned int i;
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t err_bytes = 0;
#endif

	for (i = 0; i < pkts_n; ++i) {
		struct rte_mbuf *pkt = pkts[i];

		if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
#ifdef MLX5_PMD_SOFT_COUNTERS
			err_bytes += PKT_LEN(pkt);
#endif
			rte_pktmbuf_free_seg(pkt);
		} else {
			pkts[n++] = pkt;
		}
	}
	rxq->stats.idropped += (pkts_n - n);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Correct counters of errored completions. */
	rxq->stats.ipackets -= (pkts_n - n);
	rxq->stats.ibytes -= err_bytes;
#endif
	rxq->pending_err = 0;
	return n;
}
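/*
 * Note: rxq_handle_pending_error() compacts the returned array in place.
 * Packets whose packet_type was set to RTE_PTYPE_ALL_MASK by
 * rxq_cq_to_ptype_oflags_v() (responder errors) are freed and dropped, and
 * the soft counters that were incremented optimistically in the main RX loop
 * are corrected here.
 */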

/**
 * Receive a burst of packets. An errored completion also consumes an mbuf,
 * but the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be
 * freed before returning to the application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets received including errors (<= pkts_n).
 */
static inline uint16_t
rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	volatile struct mlx5_cqe *cq;
	struct rte_mbuf **elts;
	unsigned int pos;
	uint64_t n;
	uint16_t repl_n;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	uint16_t rcvd_pkt = 0;
	unsigned int cq_idx = rxq->cq_ci & q_mask;
	unsigned int elts_idx;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check =
		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
	const __m128i opcode_check =
		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
	const __m128i format_check =
		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
	const __m128i resp_err_check =
		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);

	assert(rxq->sges_n == 0);
	assert(rxq->cqe_n == rxq->elts_n);
	cq = &(*rxq->cqes)[cq_idx];
	rte_prefetch0(cq);
	rte_prefetch0(cq + 1);
	rte_prefetch0(cq + 2);
	rte_prefetch0(cq + 3);
	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
	/*
	 * Order of indexes:
	 *   rq_ci >= cq_ci >= rq_pi
	 * Definition of indexes:
	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
	 */
	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
		rxq_replenish_bulk_mbuf(rxq, repl_n);
	/* See if there are unreturned mbufs from compressed CQE. */
	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
	if (rcvd_pkt > 0) {
		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
		rxq->rq_pi += rcvd_pkt;
		pkts += rcvd_pkt;
	}
	elts_idx = rxq->rq_pi & q_mask;
	elts = &(*rxq->elts)[elts_idx];
	pkts_n = RTE_MIN(pkts_n - rcvd_pkt,
			 (uint16_t)(rxq->rq_ci - rxq->cq_ci));
	/* Not to overflow pkts/elts array. */
	pkts_n = RTE_ALIGN_FLOOR(pkts_n, MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
	if (!pkts_n)
		return rcvd_pkt;
	/* At this point, there shouldn't be any remaining packets. */
	assert(rxq->rq_pi == rxq->cq_ci);
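	/*
	 * Note: from here on CQEs are processed MLX5_VPMD_DESCS_PER_LOOP at a
	 * time (4 in this implementation), which is why pkts_n was rounded
	 * down to a multiple of that constant above; partially valid
	 * iterations are handled by the per-loop invalid/ownership masks.
	 */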
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *                uint8_t  pkt_info;
	 *                uint8_t  flow_tag[3];
	 *                uint16_t byte_cnt;
	 *                uint8_t  rsvd4;
	 *                uint8_t  op_own;
	 *                uint16_t hdr_type_etc;
	 *                uint16_t vlan_info;
	 *                uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					  &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_compiler_barrier();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
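		/*
		 * Steps C.1-D.1 above assembled the 16-byte mini-CQEs for
		 * cqes[3] and cqes[2], shuffled them into the
		 * rx_descriptor_fields1 layout, subtracted the CRC from the
		 * length fields, applied the flow-mark adjustment and stored
		 * the result to the mbufs; E.1 started collecting their
		 * op_own fields for the validity checks further down. The
		 * same sequence now repeats for cqes[1] and cqes[0].
		 */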
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
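		/*
		 * After _mm_packs_epi32(), owner_mask, invalid_mask and
		 * comp_mask each keep one 16-bit lane per CQE in their low
		 * 64 bits, all bits set when the condition holds. Moving the
		 * low 64 bits to a scalar and dividing the number of
		 * trailing zero bits by 16 therefore gives the index of the
		 * first CQE matching the condition, as done for comp_idx
		 * and n below.
		 */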
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
			   (sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set. */
		rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
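		/*
		 * Errors are only flagged at this point; the offending CQEs
		 * are handled later by rxq_handle_pending_error() from the
		 * burst callback, once the vectorized pass has finished.
		 */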
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
	/* If no new CQE seen, return without updating cq_db. */
	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
		return rcvd_pkt;
	/* Update the consumer indexes for non-compressed CQEs. */
	assert(nocmp_n <= pkts_n);
	rxq->cq_ci += nocmp_n;
	rxq->rq_pi += nocmp_n;
	rcvd_pkt += nocmp_n;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	/* Decompress the last CQE if compressed. */
	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
		/* Return more packets if needed. */
		if (nocmp_n < pkts_n) {
			uint16_t n = rxq->cq_ci - rxq->rq_pi;

			n = RTE_MIN(n, pkts_n - nocmp_n);
			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
			rxq->rq_pi += n;
			rcvd_pkt += n;
		}
	}
	rte_compiler_barrier();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	return rcvd_pkt;
}

/**
 * DPDK callback for vectorized RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct mlx5_rxq_data *rxq = dpdk_rxq;
	uint16_t nb_rx;

	nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
	if (unlikely(rxq->pending_err))
		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
	return nb_rx;
}

/**
 * Check whether Tx queue flags are set for raw vectorized Tx.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_raw_vec_tx_support(struct priv *priv)
{
	uint16_t i;

	/* All the configured queues should support it. */
	for (i = 0; i < priv->txqs_n; ++i) {
		struct mlx5_txq_data *txq = (*priv->txqs)[i];

		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
		    !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
			break;
	}
	if (i != priv->txqs_n)
		return -ENOTSUP;
	return 1;
}
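
/*
 * Note: priv_check_raw_vec_tx_support() above only validates the per-queue
 * Tx flags, whereas priv_check_vec_tx_support() below checks device-level
 * conditions only (vector Tx enabled, number of Tx queues, enhanced MPW,
 * no TSO).
 */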

/**
 * Check whether a device can support vectorized Tx.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_vec_tx_support(struct priv *priv)
{
	if (!priv->tx_vec_en ||
	    priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
	    priv->mps != MLX5_MPW_ENHANCED ||
	    priv->tso)
		return -ENOTSUP;
	return 1;
}

/**
 * Check whether an Rx queue can support vectorized Rx.
 *
 * @param rxq
 *   Pointer to RX queue.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
rxq_check_vec_support(struct mlx5_rxq_data *rxq)
{
	struct mlx5_rxq_ctrl *ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);

	if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
		return -ENOTSUP;
	return 1;
}

/**
 * Check whether a device can support vectorized Rx.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   1 if supported, negative errno value if not.
 */
int __attribute__((cold))
priv_check_vec_rx_support(struct priv *priv)
{
	uint16_t i;

	if (!priv->rx_vec_en)
		return -ENOTSUP;
	/* All the configured queues should support it. */
	for (i = 0; i < priv->rxqs_n; ++i) {
		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];

		if (!rxq)
			continue;
		if (rxq_check_vec_support(rxq) < 0)
			break;
	}
	if (i != priv->rxqs_n)
		return -ENOTSUP;
	return 1;
}
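
/*
 * Illustration only: these support checks are meant to be consulted when the
 * burst callbacks are selected elsewhere in the PMD, roughly along the lines
 * of:
 *
 *	if (priv_check_vec_rx_support(priv) > 0)
 *		dev->rx_pkt_burst = mlx5_rx_burst_vec;
 *
 * with a fallback to the scalar mlx5_rx_burst() otherwise.
 */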