/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_vhost.h>

#include "main.h"

/*
 * A very simple vhost-user net driver implementation, without
 * any extra features being enabled, such as TSO and mrg-Rx.
 */

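/*
 * Reading aid (an editorial sketch, not the authoritative definition):
 * struct vhost_dev and struct vhost_queue are defined in main.h.  Only
 * the members this file relies on are listed; their exact types and any
 * additional members are assumptions.
 *
 *   struct vhost_dev:   vid (vhost device ID), features (negotiated
 *                       virtio features), hdr_len (virtio-net header
 *                       size), mem (guest memory table), nr_vrings,
 *                       queues[] (one struct vhost_queue per vring)
 *   struct vhost_queue: vr (a struct rte_vhost_vring copy),
 *                       last_avail_idx, last_used_idx
 */
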
void
vs_vhost_net_setup(struct vhost_dev *dev)
{
	uint16_t i;
	int vid = dev->vid;
	struct vhost_queue *queue;

	RTE_LOG(INFO, VHOST_CONFIG,
		"setting builtin vhost-user net driver\n");

	rte_vhost_get_negotiated_features(vid, &dev->features);
	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
		dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		dev->hdr_len = sizeof(struct virtio_net_hdr);

	rte_vhost_get_mem_table(vid, &dev->mem);

	dev->nr_vrings = rte_vhost_get_vring_num(vid);
	for (i = 0; i < dev->nr_vrings; i++) {
		queue = &dev->queues[i];

		queue->last_used_idx = 0;
		queue->last_avail_idx = 0;
		rte_vhost_get_vhost_vring(vid, i, &queue->vr);
	}
}

void
vs_vhost_net_remove(struct vhost_dev *dev)
{
	free(dev->mem);
}

static __rte_always_inline int
enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
	    struct rte_mbuf *m, uint16_t desc_idx)
{
	uint32_t desc_avail, desc_offset;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	struct vring_desc *desc;
	uint64_t desc_addr;
	struct virtio_net_hdr virtio_hdr = {0, 0, 0, 0, 0, 0};
	/* A counter to avoid a dead loop in the desc chain */
	uint16_t nr_desc = 1;

	desc = &vr->desc[desc_idx];
	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
	/*
	 * The check of 'desc_addr' is placed outside the 'unlikely' macro
	 * to avoid a performance issue with some versions of gcc (4.8.4
	 * and 5.3.0), which otherwise store the offset on the stack
	 * instead of in a register.
	 */
	if (unlikely(desc->len < dev->hdr_len) || !desc_addr)
		return -1;

	rte_prefetch0((void *)(uintptr_t)desc_addr);

	/* write the (all-zero) virtio-net header */
	*(struct virtio_net_hdr *)(uintptr_t)desc_addr = virtio_hdr;

	desc_offset = dev->hdr_len;
	desc_avail = desc->len - dev->hdr_len;

	mbuf_avail = rte_pktmbuf_data_len(m);
	mbuf_offset = 0;
	while (mbuf_avail != 0 || m->next != NULL) {
		/* done with current mbuf, fetch next */
		if (mbuf_avail == 0) {
			m = m->next;

			mbuf_offset = 0;
			mbuf_avail = rte_pktmbuf_data_len(m);
		}

		/* done with current desc buf, fetch next */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
				/* not enough room in the vring buffer */
				return -1;
			}
			if (unlikely(desc->next >= vr->size ||
				     ++nr_desc > vr->size))
				return -1;

			desc = &vr->desc[desc->next];
			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
			if (unlikely(!desc_addr))
				return -1;

			desc_offset = 0;
			desc_avail = desc->len;
		}

		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
			cpy_len);

		mbuf_avail -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail -= cpy_len;
		desc_offset += cpy_len;
	}

	return 0;
}

uint16_t
vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_queue *queue;
	struct rte_vhost_vring *vr;
	uint16_t avail_idx, free_entries, start_idx;
	uint16_t desc_indexes[MAX_PKT_BURST];
	uint16_t used_idx;
	uint32_t i;

	queue = &dev->queues[queue_id];
	vr = &queue->vr;

	avail_idx = *((volatile uint16_t *)&vr->avail->idx);
	start_idx = queue->last_used_idx;
	free_entries = avail_idx - start_idx;
	count = RTE_MIN(count, free_entries);
	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
	if (count == 0)
		return 0;

	/* Retrieve all of the desc indexes first to avoid caching issues. */
	rte_prefetch0(&vr->avail->ring[start_idx & (vr->size - 1)]);
	for (i = 0; i < count; i++) {
		used_idx = (start_idx + i) & (vr->size - 1);
		desc_indexes[i] = vr->avail->ring[used_idx];
		vr->used->ring[used_idx].id = desc_indexes[i];
		vr->used->ring[used_idx].len = pkts[i]->pkt_len +
			dev->hdr_len;
	}

	rte_prefetch0(&vr->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		uint16_t desc_idx = desc_indexes[i];
		int err;

		err = enqueue_pkt(dev, vr, pkts[i], desc_idx);
		if (unlikely(err)) {
			used_idx = (start_idx + i) & (vr->size - 1);
			vr->used->ring[used_idx].len = dev->hdr_len;
		}

		if (i + 1 < count)
			rte_prefetch0(&vr->desc[desc_indexes[i+1]]);
	}

	/* Make the copied packet data visible before updating used->idx. */
	rte_smp_wmb();

	*(volatile uint16_t *)&vr->used->idx += count;
	queue->last_used_idx += count;

	/* flush used->idx update before we read avail->flags. */
	rte_mb();

	/* Kick the guest if necessary. */
	if (!(vr->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
			&& (vr->callfd >= 0))
		eventfd_write(vr->callfd, (eventfd_t)1);

	return count;
}

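/*
 * Illustrative sketch (an editorial addition, not part of the original
 * example): how a caller might hand a received burst to the guest's Rx
 * queue.  enqueue_pkt() copies packet data into guest memory, so the
 * caller keeps ownership of the mbufs and frees them afterwards; packets
 * beyond what the vring can hold are simply dropped here.  The function
 * name and the use of queue 0 as the guest Rx vring (the first receive
 * queue in the virtio-net layout) are assumptions for illustration; the
 * real switching logic lives in main.c.
 */
#if 0
static void
vs_enqueue_burst_example(struct vhost_dev *dev,
			 struct rte_mbuf **pkts, uint16_t nr_pkts)
{
	uint16_t enqueued, i;

	/* Copy as many packets as the guest Rx vring can take right now. */
	enqueued = vs_enqueue_pkts(dev, 0, pkts, nr_pkts);
	(void)enqueued;

	/* The driver copied (or dropped) the data; release all mbufs. */
	for (i = 0; i < nr_pkts; i++)
		rte_pktmbuf_free(pkts[i]);
}
#endif
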
static __rte_always_inline int
dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
	    struct rte_mbuf *m, uint16_t desc_idx,
	    struct rte_mempool *mbuf_pool)
{
	struct vring_desc *desc;
	uint64_t desc_addr;
	uint32_t desc_avail, desc_offset;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	struct rte_mbuf *cur = m, *prev = m;
	/* A counter to avoid a dead loop in the desc chain */
	uint32_t nr_desc = 1;

	desc = &vr->desc[desc_idx];
	if (unlikely((desc->len < dev->hdr_len)) ||
			(desc->flags & VRING_DESC_F_INDIRECT))
		return -1;

	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
	if (unlikely(!desc_addr))
		return -1;

	/*
	 * We support neither ANY_LAYOUT nor VERSION_1, meaning a Tx packet
	 * from the guest must have at least 2 desc buffers: the first for
	 * storing the header and the rest for storing the data.
	 *
	 * And since we don't support TSO, we can simply skip the header.
	 */
	desc = &vr->desc[desc->next];
	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
	if (unlikely(!desc_addr))
		return -1;
	rte_prefetch0((void *)(uintptr_t)desc_addr);

	desc_offset = 0;
	desc_avail = desc->len;
	nr_desc += 1;

	mbuf_offset = 0;
	mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
	while (1) {
		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
						   mbuf_offset),
			(void *)((uintptr_t)(desc_addr + desc_offset)),
			cpy_len);

		mbuf_avail -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail -= cpy_len;
		desc_offset += cpy_len;

		/* This desc has been fully read; fetch the next one */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
				break;

			if (unlikely(desc->next >= vr->size ||
					++nr_desc > vr->size))
				return -1;
			desc = &vr->desc[desc->next];

			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
			if (unlikely(!desc_addr))
				return -1;
			rte_prefetch0((void *)(uintptr_t)desc_addr);

			desc_offset = 0;
			desc_avail = desc->len;
		}

		/*
		 * This mbuf is full; allocate a new one to hold
		 * more data.
		 */
		if (mbuf_avail == 0) {
			cur = rte_pktmbuf_alloc(mbuf_pool);
			if (unlikely(cur == NULL)) {
				RTE_LOG(ERR, VHOST_DATA, "Failed to "
					"allocate memory for mbuf.\n");
				return -1;
			}

			prev->next = cur;
			prev->data_len = mbuf_offset;
			m->nb_segs += 1;
			m->pkt_len += mbuf_offset;
			prev = cur;

			mbuf_offset = 0;
			mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
		}
	}

	prev->data_len = mbuf_offset;
	m->pkt_len += mbuf_offset;

	return 0;
}

uint16_t
vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct vhost_queue *queue;
	struct rte_vhost_vring *vr;
	uint32_t desc_indexes[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i = 0;
	uint16_t free_entries;
	uint16_t avail_idx;

	queue = &dev->queues[queue_id];
	vr = &queue->vr;

	free_entries = *((volatile uint16_t *)&vr->avail->idx) -
			queue->last_avail_idx;
	if (free_entries == 0)
		return 0;

	/* Prefetch the avail and used rings */
	avail_idx = queue->last_avail_idx & (vr->size - 1);
	used_idx = queue->last_used_idx & (vr->size - 1);
	rte_prefetch0(&vr->avail->ring[avail_idx]);
	rte_prefetch0(&vr->used->ring[used_idx]);

	count = RTE_MIN(count, MAX_PKT_BURST);
	count = RTE_MIN(count, free_entries);

	if (unlikely(count == 0))
		return 0;

	/*
	 * Retrieve all of the head indexes first and pre-update used entries
	 * to avoid caching issues.
	 */
	for (i = 0; i < count; i++) {
		avail_idx = (queue->last_avail_idx + i) & (vr->size - 1);
		used_idx = (queue->last_used_idx + i) & (vr->size - 1);
		desc_indexes[i] = vr->avail->ring[avail_idx];

		vr->used->ring[used_idx].id = desc_indexes[i];
		vr->used->ring[used_idx].len = 0;
	}

	/* Prefetch the first descriptor. */
	rte_prefetch0(&vr->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		int err;

		if (likely(i + 1 < count))
			rte_prefetch0(&vr->desc[desc_indexes[i + 1]]);

		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(pkts[i] == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}

		err = dequeue_pkt(dev, vr, pkts[i], desc_indexes[i],
				  mbuf_pool);
		if (unlikely(err)) {
			rte_pktmbuf_free(pkts[i]);
			break;
		}
	}

	queue->last_avail_idx += i;
	queue->last_used_idx += i;
	rte_smp_wmb();
	rte_smp_rmb();

	vr->used->idx += i;

	if (!(vr->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
			&& (vr->callfd >= 0))
		eventfd_write(vr->callfd, (eventfd_t)1);

	return i;
}

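/*
 * Illustrative sketch (an editorial addition, not part of the original
 * example): draining the guest's Tx queue with vs_dequeue_pkts().  The
 * returned mbufs (possibly multi-segment) are owned by the caller; here
 * they are only counted and freed.  The function name and the use of
 * queue 1 as the guest Tx vring (the first transmit queue in the
 * virtio-net layout) are assumptions for illustration.
 */
#if 0
static uint64_t
vs_drain_tx_example(struct vhost_dev *dev, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint64_t total_bytes = 0;
	uint16_t nr, i;

	/* Keep pulling bursts until the guest Tx vring runs dry. */
	do {
		nr = vs_dequeue_pkts(dev, 1, mbuf_pool, pkts, MAX_PKT_BURST);
		for (i = 0; i < nr; i++) {
			total_bytes += rte_pktmbuf_pkt_len(pkts[i]);
			rte_pktmbuf_free(pkts[i]);
		}
	} while (nr == MAX_PKT_BURST);

	return total_bytes;
}
#endif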