/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2014-2021 Netronome Systems, Inc.
 * All rights reserved.
 *
 * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
 */

/*
 * vim:shiftwidth=8:noexpandtab
 *
 * @file dpdk/pmd/nfp_rxtx.c
 *
 * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
 */

#include <ethdev_driver.h>
#include <ethdev_pci.h>

#include "nfp_common.h"
#include "nfp_rxtx.h"
#include "nfp_logs.h"
#include "nfp_ctrl.h"

/* Prototypes */
static int nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq);
static inline void nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq);
static inline void nfp_net_set_hash(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxd,
		struct rte_mbuf *mbuf);
static inline void nfp_net_rx_cksum(struct nfp_net_rxq *rxq,
		struct nfp_net_rx_desc *rxd,
		struct rte_mbuf *mb);
static void nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq);
static int nfp_net_tx_free_bufs(struct nfp_net_txq *txq);
static void nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq);
static inline uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq);
static inline uint32_t nfp_net_txq_full(struct nfp_net_txq *txq);
static inline void nfp_net_tx_tso(struct nfp_net_txq *txq,
		struct nfp_net_tx_desc *txd,
		struct rte_mbuf *mb);
static inline void nfp_net_tx_cksum(struct nfp_net_txq *txq,
		struct nfp_net_tx_desc *txd,
		struct rte_mbuf *mb);

static int
nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
{
	struct nfp_net_rx_buff *rxe = rxq->rxbufs;
	uint64_t dma_addr;
	unsigned int i;

	PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
		   rxq->rx_count);

	for (i = 0; i < rxq->rx_count; i++) {
		struct nfp_net_rx_desc *rxd;
		struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);

		if (mbuf == NULL) {
			PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
				(unsigned int)rxq->qidx);
			return -ENOMEM;
		}

		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));

		rxd = &rxq->rxds[i];
		rxd->fld.dd = 0;
		rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
		rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
		rxe[i].mbuf = mbuf;
		PMD_RX_LOG(DEBUG, "[%d]: %" PRIx64, i, dma_addr);
	}

	/* Make sure all writes are flushed before telling the hardware */
	rte_wmb();

	/* Not advertising the whole ring as the firmware gets confused if so */
	PMD_RX_LOG(DEBUG, "Increment FL write pointer in %u",
		   rxq->rx_count - 1);

	nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);

	return 0;
}

int
nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
{
	int i;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
			return -1;
	}
	return 0;
}
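/*
 * nfp_net_rx_queue_count - Report how many Rx descriptors are ready
 *
 * Walks the ring from the host read pointer and counts descriptors whose DD
 * bit is set, i.e. descriptors already written back by the hardware.
 */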
uint32_t
nfp_net_rx_queue_count(void *rx_queue)
{
	struct nfp_net_rxq *rxq;
	struct nfp_net_rx_desc *rxds;
	uint32_t idx;
	uint32_t count;

	rxq = rx_queue;

	idx = rxq->rd_p;

	count = 0;

	/*
	 * Other PMDs are just checking the DD bit in intervals of 4
	 * descriptors and counting all four if the first has the DD
	 * bit on. Of course, this is not accurate but can be good for
	 * performance. But ideally that should be done in chunks of
	 * descriptors belonging to the same cache line
	 */

	while (count < rxq->rx_count) {
		rxds = &rxq->rxds[idx];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		count++;
		idx++;

		/* Wrapping? */
		if ((idx) == rxq->rx_count)
			idx = 0;
	}

	return count;
}

static inline void
nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq)
{
	rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
}

/*
 * nfp_net_set_hash - Set mbuf hash data
 *
 * The RSS hash and hash-type are pre-pended to the packet data.
 * Extract and decode it and set the mbuf fields.
 */
static inline void
nfp_net_set_hash(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
		 struct rte_mbuf *mbuf)
{
	struct nfp_net_hw *hw = rxq->hw;
	uint8_t *meta_offset;
	uint32_t meta_info;
	uint32_t hash = 0;
	uint32_t hash_type = 0;

	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RSS))
		return;

	/* this is true for new firmwares */
	if (likely(((hw->cap & NFP_NET_CFG_CTRL_RSS2) ||
	    (NFD_CFG_MAJOR_VERSION_of(hw->ver) == 4)) &&
	     NFP_DESC_META_LEN(rxd))) {
		/*
		 * new metadata api:
		 * <----  32 bit  ----->
		 * m    field type word
		 * e     data field #2
		 * t     data field #1
		 * a     data field #0
		 * ====================
		 *    packet data
		 *
		 * Field type word contains up to 8 4bit field types
		 * A 4bit field type refers to a data field word
		 * A data field word can have several 4bit field types
		 */
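		/*
		 * Example: when only the hash is prepended, the metadata is
		 * two 32-bit big-endian words: field #0 of the field type
		 * word is NFP_NET_META_HASH, the following 4-bit field
		 * carries the hash type and the single data word holds the
		 * hash value itself.
		 */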
		meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
		meta_offset -= NFP_DESC_META_LEN(rxd);
		meta_info = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
		meta_offset += 4;
		/* NFP PMD just supports metadata for hashing */
		switch (meta_info & NFP_NET_META_FIELD_MASK) {
		case NFP_NET_META_HASH:
			/* next field type is about the hash type */
			meta_info >>= NFP_NET_META_FIELD_SIZE;
			/* hash value is in the data field */
			hash = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
			hash_type = meta_info & NFP_NET_META_FIELD_MASK;
			break;
		default:
			/* Unsupported metadata can be a performance issue */
			return;
		}
	} else {
		if (!(rxd->rxd.flags & PCIE_DESC_RX_RSS))
			return;

		hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
		hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
	}

	mbuf->hash.rss = hash;
	mbuf->ol_flags |= PKT_RX_RSS_HASH;

	switch (hash_type) {
	case NFP_NET_RSS_IPV4:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
		break;
	case NFP_NET_RSS_IPV6:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
		break;
	case NFP_NET_RSS_IPV6_EX:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	case NFP_NET_RSS_IPV4_TCP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	case NFP_NET_RSS_IPV6_TCP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	case NFP_NET_RSS_IPV4_UDP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	case NFP_NET_RSS_IPV6_UDP:
		mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
		break;
	default:
		mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
	}
}

/* nfp_net_rx_cksum - set mbuf checksum flags based on RX descriptor flags */
static inline void
nfp_net_rx_cksum(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
		 struct rte_mbuf *mb)
{
	struct nfp_net_hw *hw = rxq->hw;

	if (!(hw->ctrl & NFP_NET_CFG_CTRL_RXCSUM))
		return;

	/* If IPv4 and IP checksum error, fail */
	if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) &&
	    !(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK)))
		mb->ol_flags |= PKT_RX_IP_CKSUM_BAD;
	else
		mb->ol_flags |= PKT_RX_IP_CKSUM_GOOD;

	/* If neither UDP nor TCP return */
	if (!(rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) &&
	    !(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM))
		return;

	if (likely(rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK))
		mb->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
	else
		mb->ol_flags |= PKT_RX_L4_CKSUM_BAD;
}

/*
 * RX path design:
 *
 * There are two decisions to take:
 * 1) How to check the RX descriptor DD bit
 * 2) How and when to allocate new mbufs
 *
 * The current implementation checks a single DD bit per loop iteration. As
 * each descriptor is 8 bytes, it is likely a better idea to check all the
 * descriptors in a single cache line instead. Tests with this change have not
 * shown any performance improvement, but it deserves further investigation.
 * For example, depending on which descriptor comes next, fewer than 8
 * descriptors may share the same cache line, which implies extra work that
 * could be counterproductive by itself. Indeed, recent firmware changes do
 * exactly this: several descriptors are written back with the DD bit set to
 * save PCIe bandwidth and DMA operations from the NFP.
 *
 * Mbuf allocation is done when a new packet is received. The descriptor is
 * then linked with the new mbuf and the old one is handed to the application.
 * The main drawback of this design is that a per-packet allocation is heavier
 * than the bulk allocation DPDK offers with rte_mempool_get_bulk. From the
 * cache point of view, allocating the mbuf early, as we do now, does not seem
 * to bring any benefit at all; again, tests with this change have not shown
 * any improvement. Also, rte_mempool_get_bulk is all-or-nothing, so the
 * implications of this type of allocation need to be studied more deeply.
 */
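/*
 * nfp_net_recv_pkts - Receive up to nb_pkts packets on one Rx queue
 *
 * For each descriptor with the DD bit set, a replacement mbuf is allocated,
 * the filled mbuf is handed to the application and the descriptor is refilled
 * with the new buffer. The freelist queue write pointer is only advanced once
 * more than rx_free_thresh buffers are being held back.
 */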
uint16_t
nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	struct nfp_net_rxq *rxq;
	struct nfp_net_rx_desc *rxds;
	struct nfp_net_rx_buff *rxb;
	struct nfp_net_hw *hw;
	struct rte_mbuf *mb;
	struct rte_mbuf *new_mb;
	uint16_t nb_hold;
	uint64_t dma_addr;
	int avail;

	rxq = rx_queue;
	if (unlikely(rxq == NULL)) {
		/*
		 * DPDK just checks that the queue is within the range of
		 * enabled queues, but the queue still needs to be configured
		 */
		RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
		return -EINVAL;
	}

	hw = rxq->hw;
	avail = 0;
	nb_hold = 0;

	while (avail < nb_pkts) {
		rxb = &rxq->rxbufs[rxq->rd_p];
		if (unlikely(rxb == NULL)) {
			RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
			break;
		}

		rxds = &rxq->rxds[rxq->rd_p];
		if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
			break;

		/*
		 * Memory barrier to ensure that we won't do other
		 * reads before the DD bit.
		 */
		rte_rmb();

		/*
		 * We got a packet. Let's alloc a new mbuf for refilling the
		 * free descriptor ring as soon as possible
		 */
		new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
		if (unlikely(new_mb == NULL)) {
			RTE_LOG_DP(DEBUG, PMD,
			"RX mbuf alloc failed port_id=%u queue_id=%u\n",
				rxq->port_id, (unsigned int)rxq->qidx);
			nfp_net_mbuf_alloc_failed(rxq);
			break;
		}

		nb_hold++;

		/*
		 * Grab the mbuf and refill the descriptor with the
		 * previously allocated mbuf
		 */
		mb = rxb->mbuf;
		rxb->mbuf = new_mb;

		PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
			   rxds->rxd.data_len, rxq->mbuf_size);

		/* Size of this segment */
		mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
		/* Size of the whole packet. We just support 1 segment */
		mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);

		if (unlikely((mb->data_len + hw->rx_offset) >
			     rxq->mbuf_size)) {
			/*
			 * This should not happen and the user has the
			 * responsibility of avoiding it. But we have
			 * to give some info about the error
			 */
			RTE_LOG_DP(ERR, PMD,
				"mbuf overflow likely due to the RX offset.\n"
				"\t\tYour mbuf size should have extra space for"
				" RX offset=%u bytes.\n"
				"\t\tCurrently you just have %u bytes available"
				" but the received packet is %u bytes long",
				hw->rx_offset,
				rxq->mbuf_size - hw->rx_offset,
				mb->data_len);
			return -EINVAL;
		}

		/* Filling the received mbuf with packet info */
		if (hw->rx_offset)
			mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
		else
			mb->data_off = RTE_PKTMBUF_HEADROOM +
				       NFP_DESC_META_LEN(rxds);

		/* No scatter mode supported */
		mb->nb_segs = 1;
		mb->next = NULL;

		mb->port = rxq->port_id;

		/* Checking the RSS flag */
		nfp_net_set_hash(rxq, rxds, mb);

		/* Checking the checksum flag */
		nfp_net_rx_cksum(rxq, rxds, mb);

		if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
		    (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
			mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
			mb->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
		}

		/* Adding the mbuf to the mbuf array passed by the app */
		rx_pkts[avail++] = mb;

		/* Now resetting and updating the descriptor */
		rxds->vals[0] = 0;
		rxds->vals[1] = 0;
		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
		rxds->fld.dd = 0;
		rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
		rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;

		rxq->rd_p++;
		if (unlikely(rxq->rd_p == rxq->rx_count)) /* wrapping? */
			rxq->rd_p = 0;
	}

	if (nb_hold == 0)
		return nb_hold;

	PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
		   rxq->port_id, (unsigned int)rxq->qidx, nb_hold);

	nb_hold += rxq->nb_rx_hold;

	/*
	 * FL descriptors need to be written before incrementing the
	 * FL queue WR pointer
	 */
	rte_wmb();
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
			   rxq->port_id, (unsigned int)rxq->qidx,
			   (unsigned int)nb_hold, (unsigned int)avail);
		nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;

	return avail;
}
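/*
 * nfp_net_rx_queue_release_mbufs - Free any mbufs still attached to the Rx
 * ring; used when the queue is released or reset.
 */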
static void
nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
{
	unsigned int i;

	if (rxq->rxbufs == NULL)
		return;

	for (i = 0; i < rxq->rx_count; i++) {
		if (rxq->rxbufs[i].mbuf) {
			rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
			rxq->rxbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
{
	struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];

	if (rxq) {
		nfp_net_rx_queue_release_mbufs(rxq);
		rte_free(rxq->rxbufs);
		rte_free(rxq);
	}
}

void
nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
{
	nfp_net_rx_queue_release_mbufs(rxq);
	rxq->rd_p = 0;
	rxq->nb_rx_hold = 0;
}
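/*
 * nfp_net_rx_queue_setup - Allocate and initialize one Rx queue
 *
 * Reserves the descriptor ring memzone and the software mbuf array, maps the
 * queue onto its freelist/Rx QCP queue pointers and programs the ring address
 * and size (in log2) into the vNIC configuration BAR.
 */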
int
nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
		       uint16_t queue_idx, uint16_t nb_desc,
		       unsigned int socket_id,
		       const struct rte_eth_rxconf *rx_conf,
		       struct rte_mempool *mp)
{
	const struct rte_memzone *tz;
	struct nfp_net_rxq *rxq;
	struct nfp_net_hw *hw;
	uint32_t rx_desc_sz;

	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	PMD_INIT_FUNC_TRACE();

	/* Validating number of descriptors */
	rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
	if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
	    nb_desc > NFP_NET_MAX_RX_DESC ||
	    nb_desc < NFP_NET_MIN_RX_DESC) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
		return -EINVAL;
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling nfp_net_stop
	 */
	if (dev->data->rx_queues[queue_idx]) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
	}

	/* Allocating rx queue data structure */
	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
				 RTE_CACHE_LINE_SIZE, socket_id);
	if (rxq == NULL)
		return -ENOMEM;

	dev->data->rx_queues[queue_idx] = rxq;

	/* Hw queues mapping based on firmware configuration */
	rxq->qidx = queue_idx;
	rxq->fl_qcidx = queue_idx * hw->stride_rx;
	rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
	rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
	rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);

	/*
	 * Tracking mbuf size for detecting a potential mbuf overflow due to
	 * RX offset
	 */
	rxq->mem_pool = mp;
	rxq->mbuf_size = rxq->mem_pool->elt_size;
	rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
	hw->flbufsz = rxq->mbuf_size;

	rxq->rx_count = nb_desc;
	rxq->port_id = dev->data->port_id;
	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
	rxq->drop_en = rx_conf->rx_drop_en;

	/*
	 * Allocate RX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
				      sizeof(struct nfp_net_rx_desc) *
				      NFP_NET_MAX_RX_DESC, NFP_MEMZONE_ALIGN,
				      socket_id);

	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating rx dma");
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	/* Saving physical and virtual addresses for the RX ring */
	rxq->dma = (uint64_t)tz->iova;
	rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;

	/* mbuf pointers array for referencing mbufs linked to RX descriptors */
	rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
					 sizeof(*rxq->rxbufs) * nb_desc,
					 RTE_CACHE_LINE_SIZE, socket_id);
	if (rxq->rxbufs == NULL) {
		nfp_net_rx_queue_release(dev, queue_idx);
		dev->data->rx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
		   rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);

	nfp_net_reset_rx_queue(rxq);

	rxq->hw = hw;

	/*
	 * Telling the HW about the physical address of the RX ring and number
	 * of descriptors in log2 format
	 */
	nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
	nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));

	return 0;
}

/*
 * nfp_net_tx_free_bufs - Check for descriptors with a complete
 * status
 * @txq: TX queue to work with
 * Returns number of descriptors freed
 */
static int
nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
{
	uint32_t qcp_rd_p;
	int todo;

	PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
		   " status", txq->qidx);

	/* Work out how many packets have been sent */
	qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);

	if (qcp_rd_p == txq->rd_p) {
		PMD_TX_LOG(DEBUG, "queue %u: It seems harrier is not sending "
			   "packets (%u, %u)", txq->qidx,
			   qcp_rd_p, txq->rd_p);
		return 0;
	}

	if (qcp_rd_p > txq->rd_p)
		todo = qcp_rd_p - txq->rd_p;
	else
		todo = qcp_rd_p + txq->tx_count - txq->rd_p;

	PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u, qcp->rd_p: %u",
		   qcp_rd_p, txq->rd_p, txq->rd_p);

	if (todo == 0)
		return todo;

	txq->rd_p += todo;
	if (unlikely(txq->rd_p >= txq->tx_count))
		txq->rd_p -= txq->tx_count;

	return todo;
}

static void
nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
{
	unsigned int i;

	if (txq->txbufs == NULL)
		return;

	for (i = 0; i < txq->tx_count; i++) {
		if (txq->txbufs[i].mbuf) {
			rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
			txq->txbufs[i].mbuf = NULL;
		}
	}
}

void
nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
{
	struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];

	if (txq) {
		nfp_net_tx_queue_release_mbufs(txq);
		rte_free(txq->txbufs);
		rte_free(txq);
	}
}

void
nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
{
	nfp_net_tx_queue_release_mbufs(txq);
	txq->wr_p = 0;
	txq->rd_p = 0;
}
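/*
 * nfp_net_tx_queue_setup - Allocate and initialize one Tx queue
 *
 * Mirrors the Rx queue setup: reserves the descriptor ring memzone and the
 * software mbuf array, maps the queue onto its QCP queue pointer and programs
 * the ring address and size (in log2) into the vNIC configuration BAR.
 */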
int
nfp_net_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
		       uint16_t nb_desc, unsigned int socket_id,
		       const struct rte_eth_txconf *tx_conf)
{
	const struct rte_memzone *tz;
	struct nfp_net_txq *txq;
	uint16_t tx_free_thresh;
	struct nfp_net_hw *hw;
	uint32_t tx_desc_sz;

	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	PMD_INIT_FUNC_TRACE();

	/* Validating number of descriptors */
	tx_desc_sz = nb_desc * sizeof(struct nfp_net_tx_desc);
	if (tx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
	    nb_desc > NFP_NET_MAX_TX_DESC ||
	    nb_desc < NFP_NET_MIN_TX_DESC) {
		PMD_DRV_LOG(ERR, "Wrong nb_desc value");
		return -EINVAL;
	}

	tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
				    tx_conf->tx_free_thresh :
				    DEFAULT_TX_FREE_THRESH);

	if (tx_free_thresh > (nb_desc)) {
		PMD_DRV_LOG(ERR,
			"tx_free_thresh must be less than the number of TX "
			"descriptors. (tx_free_thresh=%u port=%d "
			"queue=%d)", (unsigned int)tx_free_thresh,
			dev->data->port_id, (int)queue_idx);
		return -(EINVAL);
	}

	/*
	 * Free memory prior to re-allocation if needed. This is the case after
	 * calling nfp_net_stop
	 */
	if (dev->data->tx_queues[queue_idx]) {
		PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
			   queue_idx);
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
	}

	/* Allocating tx queue data structure */
	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
				 RTE_CACHE_LINE_SIZE, socket_id);
	if (txq == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		return -ENOMEM;
	}

	dev->data->tx_queues[queue_idx] = txq;

	/*
	 * Allocate TX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
				      sizeof(struct nfp_net_tx_desc) *
				      NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
				      socket_id);
	if (tz == NULL) {
		PMD_DRV_LOG(ERR, "Error allocating tx dma");
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}

	txq->tx_count = nb_desc;
	txq->tx_free_thresh = tx_free_thresh;
	txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
	txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
	txq->tx_wthresh = tx_conf->tx_thresh.wthresh;

	/* queue mapping based on firmware configuration */
	txq->qidx = queue_idx;
	txq->tx_qcidx = queue_idx * hw->stride_tx;
	txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);

	txq->port_id = dev->data->port_id;

	/* Saving physical and virtual addresses for the TX ring */
	txq->dma = (uint64_t)tz->iova;
	txq->txds = (struct nfp_net_tx_desc *)tz->addr;

	/* mbuf pointers array for referencing mbufs linked to TX descriptors */
	txq->txbufs = rte_zmalloc_socket("txq->txbufs",
					 sizeof(*txq->txbufs) * nb_desc,
					 RTE_CACHE_LINE_SIZE, socket_id);
	if (txq->txbufs == NULL) {
		nfp_net_tx_queue_release(dev, queue_idx);
		dev->data->tx_queues[queue_idx] = NULL;
		return -ENOMEM;
	}
	PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
		   txq->txbufs, txq->txds, (unsigned long)txq->dma);

	nfp_net_reset_tx_queue(txq);

	txq->hw = hw;

	/*
	 * Telling the HW about the physical address of the TX ring and number
	 * of descriptors in log2 format
	 */
	nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
	nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(nb_desc));

	return 0;
}

/* Always leave some descriptors unused to avoid wrap-around confusion */
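/*
 * Example: with tx_count = 4096, rd_p = 4000 and wr_p = 10 the write pointer
 * has already wrapped, so rd_p - wr_p - 8 = 3982 descriptors are still usable.
 */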
static inline
uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq)
{
	if (txq->wr_p >= txq->rd_p)
		return txq->tx_count - (txq->wr_p - txq->rd_p) - 8;
	else
		return txq->rd_p - txq->wr_p - 8;
}

/*
 * nfp_net_txq_full - Check if the number of free TX descriptors
 * is below tx_free_thresh
 *
 * @txq: TX queue to check
 *
 * This function uses the host copy of the read/write pointers
 */
static inline
uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
{
	return (nfp_free_tx_desc(txq) < txq->tx_free_thresh);
}

/* nfp_net_tx_tso - Set TX descriptor for TSO */
static inline void
nfp_net_tx_tso(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
	       struct rte_mbuf *mb)
{
	uint64_t ol_flags;
	struct nfp_net_hw *hw = txq->hw;

	if (!(hw->cap & NFP_NET_CFG_CTRL_LSO_ANY))
		goto clean_txd;

	ol_flags = mb->ol_flags;

	if (!(ol_flags & PKT_TX_TCP_SEG))
		goto clean_txd;

	txd->l3_offset = mb->l2_len;
	txd->l4_offset = mb->l2_len + mb->l3_len;
	txd->lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
	txd->mss = rte_cpu_to_le_16(mb->tso_segsz);
	txd->flags = PCIE_DESC_TX_LSO;
	return;

clean_txd:
	txd->flags = 0;
	txd->l3_offset = 0;
	txd->l4_offset = 0;
	txd->lso_hdrlen = 0;
	txd->mss = 0;
}

/* nfp_net_tx_cksum - Set TX CSUM offload flags in TX descriptor */
static inline void
nfp_net_tx_cksum(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
		 struct rte_mbuf *mb)
{
	uint64_t ol_flags;
	struct nfp_net_hw *hw = txq->hw;

	if (!(hw->cap & NFP_NET_CFG_CTRL_TXCSUM))
		return;

	ol_flags = mb->ol_flags;

	/* IPv6 does not need checksum */
	if (ol_flags & PKT_TX_IP_CKSUM)
		txd->flags |= PCIE_DESC_TX_IP4_CSUM;

	switch (ol_flags & PKT_TX_L4_MASK) {
	case PKT_TX_UDP_CKSUM:
		txd->flags |= PCIE_DESC_TX_UDP_CSUM;
		break;
	case PKT_TX_TCP_CKSUM:
		txd->flags |= PCIE_DESC_TX_TCP_CSUM;
		break;
	}

	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
		txd->flags |= PCIE_DESC_TX_CSUM;
}
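/*
 * nfp_net_xmit_pkts - Transmit a burst of packets on one Tx queue
 *
 * Completed descriptors are reclaimed first when the queue is getting full.
 * Each packet segment takes one Tx descriptor: the offload fields are prepared
 * once per packet in a template descriptor and copied into every segment's
 * descriptor, with EOP set only on the last one. The QCP write pointer is
 * bumped once for the whole burst.
 */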
uint16_t
nfp_net_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct nfp_net_txq *txq;
	struct nfp_net_hw *hw;
	struct nfp_net_tx_desc *txds, txd;
	struct rte_mbuf *pkt;
	uint64_t dma_addr;
	int pkt_size, dma_size;
	uint16_t free_descs, issued_descs;
	struct rte_mbuf **lmbuf;
	int i;

	txq = tx_queue;
	hw = txq->hw;
	txds = &txq->txds[txq->wr_p];

	PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
		   txq->qidx, txq->wr_p, nb_pkts);

	if ((nfp_free_tx_desc(txq) < nb_pkts) || (nfp_net_txq_full(txq)))
		nfp_net_tx_free_bufs(txq);

	free_descs = (uint16_t)nfp_free_tx_desc(txq);
	if (unlikely(free_descs == 0))
		return 0;

	pkt = *tx_pkts;

	i = 0;
	issued_descs = 0;
	PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
		   txq->qidx, nb_pkts);
	/* Sending packets */
	while ((i < nb_pkts) && free_descs) {
		/* Grabbing the mbuf linked to the current descriptor */
		lmbuf = &txq->txbufs[txq->wr_p].mbuf;
		/* Warming the cache for releasing the mbuf later on */
		RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);

		pkt = *(tx_pkts + i);

		if (unlikely(pkt->nb_segs > 1 &&
			     !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
			PMD_INIT_LOG(INFO, "NFP_NET_CFG_CTRL_GATHER not set");
			rte_panic("Multisegment packet unsupported\n");
		}

		/* Checking if we have enough descriptors */
		if (unlikely(pkt->nb_segs > free_descs))
			goto xmit_end;

		/*
		 * Checksum and VLAN flags just in the first descriptor for a
		 * multisegment packet, but TSO info needs to be in all of them.
		 */
		txd.data_len = pkt->pkt_len;
		nfp_net_tx_tso(txq, &txd, pkt);
		nfp_net_tx_cksum(txq, &txd, pkt);

		if ((pkt->ol_flags & PKT_TX_VLAN_PKT) &&
		    (hw->cap & NFP_NET_CFG_CTRL_TXVLAN)) {
			txd.flags |= PCIE_DESC_TX_VLAN;
			txd.vlan = pkt->vlan_tci;
		}

		/*
		 * mbuf data_len is the data in one segment and pkt_len data
		 * in the whole packet. When the packet is just one segment,
		 * then data_len = pkt_len
		 */
		pkt_size = pkt->pkt_len;

		while (pkt) {
			/* Copying TSO, VLAN and cksum info */
			*txds = txd;

			/* Releasing mbuf used by this descriptor previously */
			if (*lmbuf)
				rte_pktmbuf_free_seg(*lmbuf);

			/*
			 * Linking mbuf with descriptor for being released
			 * next time descriptor is used
			 */
			*lmbuf = pkt;

			dma_size = pkt->data_len;
			dma_addr = rte_mbuf_data_iova(pkt);
			PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
				   "%" PRIx64 "", dma_addr);

			/* Filling descriptors fields */
			txds->dma_len = dma_size;
			txds->data_len = txd.data_len;
			txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
			txds->dma_addr_lo = (dma_addr & 0xffffffff);
			ASSERT(free_descs > 0);
			free_descs--;

			txq->wr_p++;
			if (unlikely(txq->wr_p == txq->tx_count)) /* wrapping? */
				txq->wr_p = 0;

			pkt_size -= dma_size;

			/*
			 * Set EOP on the last segment; single-segment packets
			 * (the common case) hit this immediately
			 */
			if (likely(!pkt_size))
				txds->offset_eop = PCIE_DESC_TX_EOP;
			else
				txds->offset_eop = 0;

			pkt = pkt->next;
			/* Referencing next free TX descriptor */
			txds = &txq->txds[txq->wr_p];
			lmbuf = &txq->txbufs[txq->wr_p].mbuf;
			issued_descs++;
		}
		i++;
	}

xmit_end:
	/* Increment write pointers. Force memory write before we let HW know */
	rte_wmb();
	nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);

	return i;
}