/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Arm Corporation
 */
45971ce5eSJoyce Kong
55971ce5eSJoyce Kong #include <stdlib.h>
65971ce5eSJoyce Kong #include <stdint.h>
75971ce5eSJoyce Kong #include <stdio.h>
85971ce5eSJoyce Kong #include <string.h>
95971ce5eSJoyce Kong #include <errno.h>
105971ce5eSJoyce Kong
115971ce5eSJoyce Kong #include <rte_net.h>
125971ce5eSJoyce Kong #include <rte_vect.h>
135971ce5eSJoyce Kong
145971ce5eSJoyce Kong #include "virtio_ethdev.h"
15b5ba7ee4SMaxime Coquelin #include "virtio.h"
165971ce5eSJoyce Kong #include "virtio_rxtx_packed.h"
175971ce5eSJoyce Kong #include "virtqueue.h"
185971ce5eSJoyce Kong
/**
 * Vectorized (NEON) transmit of one batch of PACKED_BATCH_SIZE mbufs on a
 * packed virtqueue. Each packet consumes exactly one descriptor, with the
 * virtio-net header prepended inside the mbuf headroom.
 *
 * @param txvq
 *   Tx queue to transmit on.
 * @param tx_pkts
 *   Array of PACKED_BATCH_SIZE mbuf pointers to send.
 * @return
 *   0 on success; -1 when the batch cannot go through this fast path
 *   (misaligned start index, batch would wrap the ring, an mbuf with
 *   refcnt != 1 or nb_segs != 1, or insufficient headroom for the header).
 */
static inline int
virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
				   struct rte_mbuf **tx_pkts)
{
	struct virtqueue *vq = virtnet_txq_to_vq(txvq);
	uint16_t head_size = vq->hw->vtnet_hdr_size;
	uint16_t idx = vq->vq_avail_idx;
	struct virtio_net_hdr *hdr;
	struct vq_desc_extra *dxp;
	struct vring_packed_desc *p_desc;
	uint16_t i;

	/* Fast path only handles batches starting on an aligned index. */
	if (idx & PACKED_BATCH_MASK)
		return -1;

	/* The whole batch must fit before the end of the ring (no wrap). */
	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
		return -1;

	/* Map four refcnt and nb_segs from mbufs to one NEON register. */
	uint8x16_t ref_seg_msk = {
		2, 3, 4, 5,
		10, 11, 12, 13,
		18, 19, 20, 21,
		26, 27, 28, 29
	};

	/* Map four data_off from mbufs to one NEON register. */
	uint8x8_t data_msk = {
		0, 1,
		8, 9,
		16, 17,
		24, 25
	};

	/*
	 * AND-mask used below when clearing the virtio-net header: the first
	 * two 16-bit lanes are preserved, the remaining lanes of the 16-byte
	 * store are zeroed.
	 */
	uint16x8_t net_hdr_msk = {
		0xFFFF, 0xFFFF,
		0, 0, 0, 0
	};

	uint16x4_t pkts[PACKED_BATCH_SIZE];
	uint8x16x2_t mbuf;
	/* Load four mbufs rearm data. */
	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
	pkts[0] = vld1_u16((uint16_t *)&tx_pkts[0]->rearm_data);
	pkts[1] = vld1_u16((uint16_t *)&tx_pkts[1]->rearm_data);
	pkts[2] = vld1_u16((uint16_t *)&tx_pkts[2]->rearm_data);
	pkts[3] = vld1_u16((uint16_t *)&tx_pkts[3]->rearm_data);

	mbuf.val[0] = vreinterpretq_u8_u16(vcombine_u16(pkts[0], pkts[1]));
	mbuf.val[1] = vreinterpretq_u8_u16(vcombine_u16(pkts[2], pkts[3]));

	/* refcnt = 1 and nb_segs = 1 */
	uint32x4_t def_ref_seg = vdupq_n_u32(0x10001);
	/* Check refcnt and nb_segs. */
	uint32x4_t ref_seg = vreinterpretq_u32_u8(vqtbl2q_u8(mbuf, ref_seg_msk));
	uint64x2_t cmp1 = vreinterpretq_u64_u32(~vceqq_u32(ref_seg, def_ref_seg));
	/* Bail out unless every mbuf is unshared and single-segment. */
	if (unlikely(vgetq_lane_u64(cmp1, 0) || vgetq_lane_u64(cmp1, 1)))
		return -1;

	/* Check headroom is enough. */
	uint16x4_t head_rooms = vdup_n_u16(head_size);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
		offsetof(struct rte_mbuf, rearm_data));
	uint16x4_t data_offset = vreinterpret_u16_u8(vqtbl2_u8(mbuf, data_msk));
	uint64x1_t cmp2 = vreinterpret_u64_u16(vclt_u16(data_offset, head_rooms));
	if (unlikely(vget_lane_u64(cmp2, 0)))
		return -1;

	/* Record one mbuf per descriptor slot, for release at completion. */
	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
		dxp = &vq->vq_descx[idx + i];
		dxp->ndescs = 1;
		dxp->cookie = tx_pkts[i];
	}

	/*
	 * Prepend the virtio-net header: pull data_off back so the mbuf data
	 * area now starts at the header, and grow data_len to match.
	 */
	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
		tx_pkts[i]->data_off -= head_size;
		tx_pkts[i]->data_len += head_size;
	}

	uint64x2x2_t desc[PACKED_BATCH_SIZE / 2];
	/* First descriptor qword: buffer address (header + packet data). */
	uint64x2_t base_addr0 = {
		VIRTIO_MBUF_ADDR(tx_pkts[0], vq) + tx_pkts[0]->data_off,
		VIRTIO_MBUF_ADDR(tx_pkts[1], vq) + tx_pkts[1]->data_off
	};
	uint64x2_t base_addr1 = {
		VIRTIO_MBUF_ADDR(tx_pkts[2], vq) + tx_pkts[2]->data_off,
		VIRTIO_MBUF_ADDR(tx_pkts[3], vq) + tx_pkts[3]->data_off
	};

	desc[0].val[0] = base_addr0;
	desc[1].val[0] = base_addr1;

	/* Second descriptor qword: flags | id | len packed into 64 bits. */
	uint64_t flags = (uint64_t)vq->vq_packed.cached_flags << FLAGS_LEN_BITS_OFFSET;
	uint64x2_t tx_desc0 = {
		flags | (uint64_t)idx << ID_BITS_OFFSET | tx_pkts[0]->data_len,
		flags | (uint64_t)(idx + 1) << ID_BITS_OFFSET | tx_pkts[1]->data_len
	};

	uint64x2_t tx_desc1 = {
		flags | (uint64_t)(idx + 2) << ID_BITS_OFFSET | tx_pkts[2]->data_len,
		flags | (uint64_t)(idx + 3) << ID_BITS_OFFSET | tx_pkts[3]->data_len
	};

	desc[0].val[1] = tx_desc0;
	desc[1].val[1] = tx_desc1;

	if (!vq->hw->has_tx_offload) {
		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
			/*
			 * NOTE(review): data_off was already pulled back by
			 * head_size above, so mtod() by itself points at the
			 * header slot; the additional -head_size here lands
			 * head_size bytes earlier still. Confirm against the
			 * scalar can_push path (which does not move data_off).
			 */
			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
					struct virtio_net_hdr *, -head_size);
			/* Clear net hdr. */
			uint16x8_t v_hdr = vld1q_u16((void *)hdr);
			vst1q_u16((void *)hdr, vandq_u16(v_hdr, net_hdr_msk));
		}
	} else {
		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
					struct virtio_net_hdr *, -head_size);
			/* Offload enabled: helper populates the net header. */
			virtqueue_xmit_offload(hdr, tx_pkts[i]);
		}
	}

	/*
	 * Enqueue packet buffers: each vst2q_u64 interleaves addr (val[0])
	 * and flags|id|len (val[1]) back into two consecutive descriptors.
	 */
	p_desc = &vq->vq_packed.ring.desc[idx];
	vst2q_u64((uint64_t *)p_desc, desc[0]);
	vst2q_u64((uint64_t *)(p_desc + 2), desc[1]);

	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
			tx_pkts[3]->pkt_len);

	vq->vq_avail_idx += PACKED_BATCH_SIZE;
	vq->vq_free_cnt -= PACKED_BATCH_SIZE;

	/* Ring wrapped: flip the cached avail/used flag bits for next lap. */
	if (vq->vq_avail_idx >= vq->vq_nentries) {
		vq->vq_avail_idx -= vq->vq_nentries;
		vq->vq_packed.cached_flags ^=
			VRING_PACKED_DESC_F_AVAIL_USED;
	}

	return 0;
}
16153088746SJoyce Kong
/**
 * Vectorized (NEON) receive of one batch of PACKED_BATCH_SIZE used
 * descriptors from a packed virtqueue, filling rx_pkts with the
 * corresponding mbufs and updating their length fields from the
 * descriptors.
 *
 * @param rxvq
 *   Rx queue to receive from.
 * @param rx_pkts
 *   Output array for PACKED_BATCH_SIZE mbuf pointers.
 * @return
 *   0 on success; -1 when the batch cannot go through this fast path
 *   (misaligned start index, batch would wrap the ring, or not all four
 *   descriptors have been marked used by the device yet).
 */
static inline int
virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
				   struct rte_mbuf **rx_pkts)
{
	struct virtqueue *vq = virtnet_rxq_to_vq(rxvq);
	struct virtio_hw *hw = vq->hw;
	uint16_t head_size = hw->vtnet_hdr_size;
	uint16_t id = vq->vq_used_cons_idx;
	struct vring_packed_desc *p_desc;
	uint16_t i;

	/* Fast path only handles batches starting on an aligned index. */
	if (id & PACKED_BATCH_MASK)
		return -1;

	/* The whole batch must fit before the end of the ring (no wrap). */
	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
		return -1;

	/* Map packed descriptor to mbuf fields. */
	uint8x16_t shuf_msk1 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
		0, 1, /* octet 1~0, low 16 bits pkt_len */
		0xFF, 0xFF, /* skip high 16 bits of pkt_len, zero out */
		0, 1, /* octet 1~0, 16 bits data_len */
		0xFF, 0xFF, /* vlan tci set as unknown */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	/* Same mapping, but sourcing the second descriptor of each pair. */
	uint8x16_t shuf_msk2 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
		8, 9, /* octet 9~8, low 16 bits pkt_len */
		0xFF, 0xFF, /* skip high 16 bits of pkt_len, zero out */
		8, 9, /* octet 9~8, 16 bits data_len */
		0xFF, 0xFF, /* vlan tci set as unknown */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	/* Subtract the header length. */
	uint16x8_t len_adjust = {
		0, 0, /* ignore pkt_type field */
		head_size, /* sub head_size on pkt_len */
		0, /* ignore high 16 bits of pkt_len */
		head_size, /* sub head_size on data_len */
		0, 0, 0 /* ignore non-length fields */
	};

	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];

	p_desc = &vq->vq_packed.ring.desc[id];
	/* Load high 64 bits of packed descriptor 0,1. */
	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
	/* Load high 64 bits of packed descriptor 2,3. */
	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];

	/* Only care avail/used bits. */
	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
	/* Extract high 32 bits of packed descriptor (id, flags). */
	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
				vreinterpretq_u32_u64(desc[1]));
	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);

	/*
	 * Expected flag pattern for a used descriptor depends on the
	 * current used wrap counter: both bits set when the counter is 1,
	 * both clear when it is 0.
	 */
	uint32x4_t v_used_flag = vdupq_n_u32(0);
	if (vq->vq_packed.used_wrap_counter)
		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);

	uint64x2_t desc_stats = vreinterpretq_u64_u32(~vceqq_u32(v_flag, v_used_flag));

	/* Check all descs are used. */
	if (unlikely(vgetq_lane_u64(desc_stats, 0) || vgetq_lane_u64(desc_stats, 1)))
		return -1;

	/*
	 * Load 2 mbuf pointers per time: vld2q de-interleaves two
	 * vq_desc_extra entries, so val[0] holds the two cookie fields
	 * (the mbuf pointers). Assumes cookie is the first 8 bytes of a
	 * 16-byte vq_desc_extra — tied to that struct's layout.
	 */
	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);

	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);

	/**
	 * Update data length and packet length for descriptor.
	 * structure of pkt_mb:
	 * --------------------------------------------------------------------
	 * |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
	 * --------------------------------------------------------------------
	 */
	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[1]), shuf_msk1));
	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[1]), shuf_msk2));

	/* Lengths reported by the device include the virtio-net header. */
	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));

	/* One 16-byte store fills pkt_type/pkt_len/data_len/vlan_tci. */
	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);

	if (hw->has_rx_offload) {
		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
			/* Net header sits just before the packet data. */
			char *addr = (char *)rx_pkts[i]->buf_addr +
				RTE_PKTMBUF_HEADROOM - head_size;
			virtio_vec_rx_offload(rx_pkts[i],
					(struct virtio_net_hdr *)addr);
		}
	}

	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
			rx_pkts[3]->pkt_len);

	vq->vq_free_cnt += PACKED_BATCH_SIZE;

	/* Advance the consumer index, flipping the wrap counter on wrap. */
	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
		vq->vq_used_cons_idx -= vq->vq_nentries;
		vq->vq_packed.used_wrap_counter ^= 1;
	}

	return 0;
}
294