/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_vhost.h>

#include "main.h"

/*
 * A very simple vhost-user net driver implementation, without
 * any extra features being enabled, such as TSO and mrg-Rx.
 */

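/*
 * Cache everything the data path needs for this device: the negotiated
 * features, the resulting virtio-net header size, the guest memory table
 * and the per-vring state, so the fast path does not have to query the
 * vhost library on every burst.
 */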
void
vs_vhost_net_setup(struct vhost_dev *dev)
{
	uint16_t i;
	int vid = dev->vid;
	struct vhost_queue *queue;
	int ret;

	RTE_LOG(INFO, VHOST_CONFIG,
		"setting builtin vhost-user net driver\n");

	rte_vhost_get_negotiated_features(vid, &dev->features);
	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
		dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		dev->hdr_len = sizeof(struct virtio_net_hdr);

	ret = rte_vhost_get_mem_table(vid, &dev->mem);
	if (ret < 0) {
		RTE_LOG(ERR, VHOST_CONFIG, "Failed to get "
			"VM memory layout for device(%d)\n", vid);
		return;
	}

	dev->nr_vrings = rte_vhost_get_vring_num(vid);
	for (i = 0; i < dev->nr_vrings; i++) {
		queue = &dev->queues[i];

		queue->last_used_idx = 0;
		queue->last_avail_idx = 0;
		rte_vhost_get_vhost_vring(vid, i, &queue->vr);
	}
}

void
vs_vhost_net_remove(struct vhost_dev *dev)
{
	free(dev->mem);
}

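/*
 * Copy one mbuf (possibly a chain) into the guest descriptor chain that
 * starts at desc_idx: write a zeroed virtio-net header first, then the
 * packet data, walking both the mbuf chain and the descriptor chain as
 * either side runs out of space.
 */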
static __rte_always_inline int
enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
	    struct rte_mbuf *m, uint16_t desc_idx)
{
	uint32_t desc_avail, desc_offset;
	uint64_t desc_chunck_len;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	struct vring_desc *desc;
	uint64_t desc_addr, desc_gaddr;
	struct virtio_net_hdr virtio_hdr = {0, 0, 0, 0, 0, 0};
	/* A counter to guard against an endless (looping) desc chain */
	uint16_t nr_desc = 1;

	desc = &vr->desc[desc_idx];
	desc_chunck_len = desc->len;
	desc_gaddr = desc->addr;
	desc_addr = rte_vhost_va_from_guest_pa(
			dev->mem, desc_gaddr, &desc_chunck_len);
	/*
	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
	 * otherwise stores offset on the stack instead of in a register.
	 */
	if (unlikely(desc->len < dev->hdr_len) || !desc_addr)
		return -1;

	rte_prefetch0((void *)(uintptr_t)desc_addr);

	/* write virtio-net header */
	if (likely(desc_chunck_len >= dev->hdr_len)) {
		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = virtio_hdr;
		desc_offset = dev->hdr_len;
	} else {
		uint64_t len;
		uint64_t remain = dev->hdr_len;
		uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr, dst;
		uint64_t guest_addr = desc_gaddr;

		while (remain) {
			len = remain;
			dst = rte_vhost_va_from_guest_pa(dev->mem,
					guest_addr, &len);
			if (unlikely(!dst || !len))
				return -1;

			rte_memcpy((void *)(uintptr_t)dst,
					(void *)(uintptr_t)src,
					len);

			remain -= len;
			guest_addr += len;
			src += len;
		}

		desc_chunck_len = desc->len - dev->hdr_len;
		desc_gaddr += dev->hdr_len;
		desc_addr = rte_vhost_va_from_guest_pa(
				dev->mem, desc_gaddr,
				&desc_chunck_len);
		if (unlikely(!desc_addr))
			return -1;

		desc_offset = 0;
	}

	desc_avail = desc->len - dev->hdr_len;

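	/*
	 * Copy the packet payload: keep copying until both the current
	 * mbuf segment and the current descriptor chunk are drained,
	 * fetching the next mbuf segment or the next descriptor as needed.
	 */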
	mbuf_avail = rte_pktmbuf_data_len(m);
	mbuf_offset = 0;
	while (mbuf_avail != 0 || m->next != NULL) {
		/* done with current mbuf, fetch next */
		if (mbuf_avail == 0) {
			m = m->next;

			mbuf_offset = 0;
			mbuf_avail = rte_pktmbuf_data_len(m);
		}

		/* done with current desc buf, fetch next */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
				/* Not enough room in the vring buffer */
				return -1;
			}
			if (unlikely(desc->next >= vr->size ||
				     ++nr_desc > vr->size))
				return -1;

			desc = &vr->desc[desc->next];
			desc_chunck_len = desc->len;
			desc_gaddr = desc->addr;
			desc_addr = rte_vhost_va_from_guest_pa(
					dev->mem, desc_gaddr, &desc_chunck_len);
			if (unlikely(!desc_addr))
				return -1;

			desc_offset = 0;
			desc_avail = desc->len;
		} else if (unlikely(desc_chunck_len == 0)) {
			desc_chunck_len = desc_avail;
			desc_gaddr += desc_offset;
			desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
					desc_gaddr,
					&desc_chunck_len);
			if (unlikely(!desc_addr))
				return -1;

			desc_offset = 0;
		}

		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
			cpy_len);

		mbuf_avail -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail -= cpy_len;
		desc_offset += cpy_len;
		desc_chunck_len -= cpy_len;
	}

	return 0;
}

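/*
 * Burst enqueue on a split ring (host to guest, Rx from the guest's point
 * of view): read how many avail entries the guest has posted, pre-fill the
 * used ring entries, copy the packets into the guest buffers, then publish
 * used->idx and notify the guest.
 */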
uint16_t
vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_queue *queue;
	struct rte_vhost_vring *vr;
	uint16_t avail_idx, free_entries, start_idx;
	uint16_t desc_indexes[MAX_PKT_BURST];
	uint16_t used_idx;
	uint32_t i;

	queue = &dev->queues[queue_id];
	vr = &queue->vr;

	avail_idx = rte_atomic_load_explicit((uint16_t __rte_atomic *)&vr->avail->idx,
		rte_memory_order_acquire);
	start_idx = queue->last_used_idx;
	free_entries = avail_idx - start_idx;
	count = RTE_MIN(count, free_entries);
	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
	if (count == 0)
		return 0;

	/* Retrieve all of the desc indexes first to avoid caching issues. */
	rte_prefetch0(&vr->avail->ring[start_idx & (vr->size - 1)]);
	for (i = 0; i < count; i++) {
		used_idx = (start_idx + i) & (vr->size - 1);
		desc_indexes[i] = vr->avail->ring[used_idx];
		vr->used->ring[used_idx].id = desc_indexes[i];
		vr->used->ring[used_idx].len = pkts[i]->pkt_len +
			dev->hdr_len;
	}

	rte_prefetch0(&vr->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		uint16_t desc_idx = desc_indexes[i];
		int err;

		err = enqueue_pkt(dev, vr, pkts[i], desc_idx);
		if (unlikely(err)) {
			used_idx = (start_idx + i) & (vr->size - 1);
			vr->used->ring[used_idx].len = dev->hdr_len;
		}

		if (i + 1 < count)
			rte_prefetch0(&vr->desc[desc_indexes[i + 1]]);
	}

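	/*
	 * The release ordering makes the packet copies above visible to the
	 * guest before it observes the updated used->idx.
	 */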
	rte_atomic_fetch_add_explicit((uint16_t __rte_atomic *)&vr->used->idx, count,
		rte_memory_order_release);
	queue->last_used_idx += count;

	rte_vhost_vring_call(dev->vid, queue_id);

	return count;
}

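/* Exported wrapper around vs_enqueue_pkts() for the builtin net driver. */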
uint16_t
builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t count)
{
	return vs_enqueue_pkts(dev, queue_id, pkts, count);
}

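/*
 * Copy one guest descriptor chain (a Tx packet from the guest) into the
 * given mbuf, skipping the virtio-net header in the first descriptor and
 * chaining additional mbufs from mbuf_pool when the packet does not fit
 * into a single segment.
 */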
static __rte_always_inline int
dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
	    struct rte_mbuf *m, uint16_t desc_idx,
	    struct rte_mempool *mbuf_pool)
{
	struct vring_desc *desc;
	uint64_t desc_addr, desc_gaddr;
	uint32_t desc_avail, desc_offset;
	uint64_t desc_chunck_len;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	struct rte_mbuf *cur = m, *prev = m;
	/* A counter to guard against an endless (looping) desc chain */
	uint32_t nr_desc = 1;

	desc = &vr->desc[desc_idx];
	if (unlikely((desc->len < dev->hdr_len)) ||
			(desc->flags & VRING_DESC_F_INDIRECT))
		return -1;

	desc_chunck_len = desc->len;
	desc_gaddr = desc->addr;
	desc_addr = rte_vhost_va_from_guest_pa(
			dev->mem, desc_gaddr, &desc_chunck_len);
	if (unlikely(!desc_addr))
		return -1;

	/*
	 * We support neither ANY_LAYOUT nor VERSION_1, meaning a Tx packet
	 * from the guest must have at least 2 desc buffers: the first for
	 * storing the header and the others for storing the data.
	 *
	 * And since we don't support TSO, we can simply skip the header.
	 */
	desc = &vr->desc[desc->next];
	desc_chunck_len = desc->len;
	desc_gaddr = desc->addr;
	desc_addr = rte_vhost_va_from_guest_pa(
			dev->mem, desc_gaddr, &desc_chunck_len);
	if (unlikely(!desc_addr))
		return -1;
	rte_prefetch0((void *)(uintptr_t)desc_addr);

	desc_offset = 0;
	desc_avail = desc->len;
	nr_desc += 1;

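	/*
	 * Copy the remaining descriptors into the mbuf chain, allocating a
	 * new mbuf from mbuf_pool whenever the current segment fills up.
	 */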
	mbuf_offset = 0;
	mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
	while (1) {
		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
		rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
				mbuf_offset),
			(void *)((uintptr_t)(desc_addr + desc_offset)),
			cpy_len);

		mbuf_avail -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail -= cpy_len;
		desc_offset += cpy_len;
		desc_chunck_len -= cpy_len;

		/* This desc has been fully consumed, get the next one */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
				break;

			if (unlikely(desc->next >= vr->size ||
				     ++nr_desc > vr->size))
				return -1;
			desc = &vr->desc[desc->next];

			desc_chunck_len = desc->len;
			desc_gaddr = desc->addr;
			desc_addr = rte_vhost_va_from_guest_pa(
					dev->mem, desc_gaddr, &desc_chunck_len);
			if (unlikely(!desc_addr))
				return -1;
			rte_prefetch0((void *)(uintptr_t)desc_addr);

			desc_offset = 0;
			desc_avail = desc->len;
		} else if (unlikely(desc_chunck_len == 0)) {
			desc_chunck_len = desc_avail;
			desc_gaddr += desc_offset;
			desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
					desc_gaddr,
					&desc_chunck_len);
			if (unlikely(!desc_addr))
				return -1;

			desc_offset = 0;
		}

		/*
		 * This mbuf is full, get a new one to hold
		 * more data.
		 */
		if (mbuf_avail == 0) {
			cur = rte_pktmbuf_alloc(mbuf_pool);
			if (unlikely(cur == NULL)) {
				RTE_LOG(ERR, VHOST_DATA, "Failed to "
					"allocate memory for mbuf.\n");
				return -1;
			}

			prev->next = cur;
			prev->data_len = mbuf_offset;
			m->nb_segs += 1;
			m->pkt_len += mbuf_offset;
			prev = cur;

			mbuf_offset = 0;
			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
		}
	}

	prev->data_len = mbuf_offset;
	m->pkt_len += mbuf_offset;

	return 0;
}

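/*
 * Burst dequeue on a split ring (guest to host, Tx from the guest's point
 * of view): collect the avail head indexes, pre-fill the used entries,
 * copy each descriptor chain into a freshly allocated mbuf, then publish
 * used->idx and notify the guest.
 */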
static uint16_t
vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct vhost_queue *queue;
	struct rte_vhost_vring *vr;
	uint32_t desc_indexes[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i = 0;
	uint16_t free_entries;
	uint16_t avail_idx;

	queue = &dev->queues[queue_id];
	vr = &queue->vr;

	free_entries = rte_atomic_load_explicit((uint16_t __rte_atomic *)&vr->avail->idx,
			rte_memory_order_acquire) - queue->last_avail_idx;
	if (free_entries == 0)
		return 0;

	/* Prefetch available and used ring */
	avail_idx = queue->last_avail_idx & (vr->size - 1);
	used_idx = queue->last_used_idx & (vr->size - 1);
	rte_prefetch0(&vr->avail->ring[avail_idx]);
	rte_prefetch0(&vr->used->ring[used_idx]);

	count = RTE_MIN(count, MAX_PKT_BURST);
	count = RTE_MIN(count, free_entries);

	if (unlikely(count == 0))
		return 0;

	/*
	 * Retrieve all of the head indexes first and pre-update used entries
	 * to avoid caching issues.
	 */
	for (i = 0; i < count; i++) {
		avail_idx = (queue->last_avail_idx + i) & (vr->size - 1);
		used_idx = (queue->last_used_idx + i) & (vr->size - 1);
		desc_indexes[i] = vr->avail->ring[avail_idx];

		vr->used->ring[used_idx].id = desc_indexes[i];
		vr->used->ring[used_idx].len = 0;
	}

	/* Prefetch descriptor index. */
	rte_prefetch0(&vr->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		int err;

		if (likely(i + 1 < count))
			rte_prefetch0(&vr->desc[desc_indexes[i + 1]]);

		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(pkts[i] == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}

		err = dequeue_pkt(dev, vr, pkts[i], desc_indexes[i], mbuf_pool);
		if (unlikely(err)) {
			rte_pktmbuf_free(pkts[i]);
			break;
		}
	}

	queue->last_avail_idx += i;
	queue->last_used_idx += i;

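	/*
	 * Update used->idx only after the used entries have been filled, so
	 * the guest never sees the new index before the entries themselves.
	 */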
	rte_atomic_fetch_add_explicit((uint16_t __rte_atomic *)&vr->used->idx, i,
		rte_memory_order_acq_rel);

	rte_vhost_vring_call(dev->vid, queue_id);

	return i;
}

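/* Exported wrapper around vs_dequeue_pkts() for the builtin net driver. */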
uint16_t
builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count);
}