/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#ifndef _ICE_RXTX_COMMON_AVX_H_
#define _ICE_RXTX_COMMON_AVX_H_

#include "ice_rxtx.h"

#ifdef __AVX2__
static __rte_always_inline void
ice_rxq_rearm_common(struct ice_rx_queue *rxq, __rte_unused bool avx512)
{
	int i;
	uint16_t rx_id;
	volatile union ice_rx_flex_desc *rxdp;
	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxep,
				 ICE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i].read),
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			ICE_RXQ_REARM_THRESH;
		return;
	}

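	/*
	 * With the default 32-byte descriptors, only the 16-byte "read" half of
	 * each descriptor is rearmed, one 128-bit store per descriptor (two
	 * mbufs per loop iteration). With RTE_LIBRTE_ICE_16BYTE_RX_DESC,
	 * descriptors are contiguous 16-byte chunks, so the AVX-512 path below
	 * fills four descriptors per 512-bit store and the AVX2 path fills two
	 * per 256-bit store.
	 */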
#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
	struct rte_mbuf *mb0, *mb1;
	__m128i dma_addr0, dma_addr1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
			RTE_PKTMBUF_HEADROOM);
	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

#if RTE_IOVA_IN_MBUF
		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

#if RTE_IOVA_IN_MBUF
		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
#else
		/* convert va to dma_addr hdr/data */
		dma_addr0 = _mm_unpacklo_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpacklo_epi64(vaddr1, vaddr1);
#endif

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr0);
		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp++->read), dma_addr1);
	}
#else
#ifdef __AVX512VL__
	if (avx512) {
		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
		struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
		__m512i dma_addr0_3, dma_addr4_7;
		__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
		/* Initialize the mbufs in vector, process 8 mbufs in one loop */
		for (i = 0; i < ICE_RXQ_REARM_THRESH;
				i += 8, rxep += 8, rxdp += 8) {
			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
			__m128i vaddr4, vaddr5, vaddr6, vaddr7;
			__m256i vaddr0_1, vaddr2_3;
			__m256i vaddr4_5, vaddr6_7;
			__m512i vaddr0_3, vaddr4_7;

			mb0 = rxep[0].mbuf;
			mb1 = rxep[1].mbuf;
			mb2 = rxep[2].mbuf;
			mb3 = rxep[3].mbuf;
			mb4 = rxep[4].mbuf;
			mb5 = rxep[5].mbuf;
			mb6 = rxep[6].mbuf;
			mb7 = rxep[7].mbuf;

#if RTE_IOVA_IN_MBUF
			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
					offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
			vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
			vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
			vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
			vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);

			/**
			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
			 * into the high lanes. Similarly for 2 & 3, and so on.
			 */
			vaddr0_1 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
							vaddr1, 1);
			vaddr2_3 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
							vaddr3, 1);
			vaddr4_5 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
							vaddr5, 1);
			vaddr6_7 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
							vaddr7, 1);
			vaddr0_3 =
				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
						   vaddr2_3, 1);
			vaddr4_7 =
				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
						   vaddr6_7, 1);

#if RTE_IOVA_IN_MBUF
			/* convert pa to dma_addr hdr/data */
			dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
			dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
#else
			/* convert va to dma_addr hdr/data */
			dma_addr0_3 = _mm512_unpacklo_epi64(vaddr0_3, vaddr0_3);
			dma_addr4_7 = _mm512_unpacklo_epi64(vaddr4_7, vaddr4_7);
#endif

			/* add headroom to pa values */
			dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
			dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);

			/* flush desc with pa dma_addr */
			_mm512_store_si512(RTE_CAST_PTR(__m512i *, &rxdp->read), dma_addr0_3);
			_mm512_store_si512(RTE_CAST_PTR(__m512i *, &(rxdp + 4)->read), dma_addr4_7);
		}
	} else
#endif /* __AVX512VL__ */
	{
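		/* Non-AVX-512 fallback: 256-bit registers, four mbufs per
		 * iteration, each store covering two 16-byte descriptors.
		 */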
		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
		__m256i dma_addr0_1, dma_addr2_3;
		__m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
		/* Initialize the mbufs in vector, process 4 mbufs in one loop */
		for (i = 0; i < ICE_RXQ_REARM_THRESH;
				i += 4, rxep += 4, rxdp += 4) {
			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
			__m256i vaddr0_1, vaddr2_3;

			mb0 = rxep[0].mbuf;
			mb1 = rxep[1].mbuf;
			mb2 = rxep[2].mbuf;
			mb3 = rxep[3].mbuf;

#if RTE_IOVA_IN_MBUF
			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
					offsetof(struct rte_mbuf, buf_addr) + 8);
#endif
			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);

			/**
			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
			 * into the high lanes. Similarly for 2 & 3
			 */
			vaddr0_1 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
							vaddr1, 1);
			vaddr2_3 =
				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
							vaddr3, 1);

#if RTE_IOVA_IN_MBUF
			/* convert pa to dma_addr hdr/data */
			dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
			dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
#else
			/* convert va to dma_addr hdr/data */
			dma_addr0_1 = _mm256_unpacklo_epi64(vaddr0_1, vaddr0_1);
			dma_addr2_3 = _mm256_unpacklo_epi64(vaddr2_3, vaddr2_3);
#endif

			/* add headroom to pa values */
			dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
			dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);

			/* flush desc with pa dma_addr */
			_mm256_store_si256(RTE_CAST_PTR(__m256i *, &rxdp->read), dma_addr0_1);
			_mm256_store_si256(RTE_CAST_PTR(__m256i *, &(rxdp + 2)->read), dma_addr2_3);
		}
	}

#endif

	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			(rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}
#endif /* __AVX2__ */

#endif /* _ICE_RXTX_COMMON_AVX_H_ */