1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include <linux/virtio_blk.h>

#include "spdk/env.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/conf.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/vhost.h"

#include "vhost_internal.h"

/* Per-request state for one virtio-blk request pulled from a virtqueue.
 * One task is pre-allocated per descriptor slot (see alloc_task_pool).
 */
struct spdk_vhost_blk_task {
	struct spdk_bdev_io *bdev_io;
	struct spdk_vhost_blk_session *bvsession;
	struct spdk_vhost_virtqueue *vq;

	/* Guest-visible status byte; points into the last (writable) descriptor
	 * of the request chain. NULL until the chain has been parsed. */
	volatile uint8_t *status;

	/* Index of the request's head descriptor in the virtqueue. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};

/* vhost-blk device: one per controller, wraps the backing bdev. */
struct spdk_vhost_blk_dev {
	struct spdk_vhost_dev vdev;
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *bdev_desc;
	bool readonly;
};

struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	struct spdk_vhost_blk_dev *bvdev;
	struct spdk_poller *requestq_poller;
	struct spdk_io_channel *io_channel;
	struct spdk_poller *stop_poller;
};

/* forward declaration */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;

static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq);

/* Mark a task idle again and drop the session's in-flight request count. */
static void
blk_task_finish(struct spdk_vhost_blk_task *task)
{
	assert(task->bvsession->vsession.task_cnt > 0);
	task->bvsession->vsession.task_cnt--;
	task->used = false;
}

/* Fail a malformed request: report `status` to the guest (if a status byte
 * was mapped) and return the descriptor chain to the used ring.
 */
static void
invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
{
	if (task->status) {
		*task->status = status;
	}

	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
				   task->used_len);
	blk_task_finish(task);
	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
}

/*
 * Process task's descriptor chain and setup data related fields.
 * Return
 *	total size of supplied buffers
 *
 *	FIXME: Make this function return to rd_cnt and wr_cnt
 */
static int
blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_dev *vdev = vsession->vdev;
	struct vring_desc *desc, *desc_table;
	uint16_t out_cnt = 0, cnt = 0;
	uint32_t desc_table_size, len = 0;
	uint32_t desc_handled_cnt;
	int rc;

	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
	if (rc != 0) {
		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
		return -1;
	}

	desc_handled_cnt = 0;
	while (1) {
		/*
		 * Maximum cnt reached?
		 * Should not happen if request is well formatted, otherwise this is a BUG.
		 */
		if (spdk_unlikely(cnt == *iovs_cnt)) {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
				      vsession->name, req_idx);
			return -1;
		}

		/* Translate the guest-physical descriptor into host iovecs. */
		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
				      vsession->name, req_idx, cnt);
			return -1;
		}

		len += desc->len;

		out_cnt += vhost_vring_desc_is_wr(desc);

		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
		if (rc != 0) {
			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
				    vsession->name, req_idx);
			return -1;
		} else if (desc == NULL) {
			break;
		}

		desc_handled_cnt++;
		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
			/* Break a cycle and report an error, if any. */
			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
				    vsession->name, desc_table_size, desc_handled_cnt);
			return -1;
		}
	}

	/*
	 * There must be least two descriptors.
	 * First contain request so it must be readable.
	 * Last descriptor contain buffer for response so it must be writable.
	 */
	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
		return -1;
	}

	*length = len;
	*iovs_cnt = cnt;
	return 0;
}

/* Set the guest-visible status byte and return the request to the used ring. */
static void
blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
{
	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
				   task->used_len);
	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
		      task->req_idx, success ?
		      "OK" : "FAIL");
	blk_task_finish(task);
}

/* bdev I/O completion callback: free the bdev_io and complete the request. */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_vhost_blk_task *task = cb_arg;

	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, task);
}

/* Retry entry point for a task that was parked due to bdev ENOMEM. */
static void
blk_request_resubmit(void *arg)
{
	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
	int rc = 0;

	rc = process_blk_request(task, task->bvsession, task->vq);
	if (rc == 0) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
	} else {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
	}
}

/* Park a task on the bdev's io_wait queue until resources free up;
 * fail the request immediately if even queueing is impossible.
 */
static inline void
blk_request_queue_io(struct spdk_vhost_blk_task *task)
{
	int rc;
	struct spdk_vhost_blk_session *bvsession = task->bvsession;
	struct spdk_bdev *bdev = bvsession->bvdev->bdev;

	task->bdev_io_wait.bdev = bdev;
	task->bdev_io_wait.cb_fn = blk_request_resubmit;
	task->bdev_io_wait.cb_arg = task;

	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
	if (rc != 0) {
		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
	}
}

/* Parse one virtio-blk request (outhdr + payload + status byte) and submit
 * the corresponding bdev I/O. Returns 0 on submission (completion is async
 * via blk_request_complete_cb) or -1 after failing the request to the guest.
 */
static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
	const struct virtio_blk_outhdr *req;
	struct virtio_blk_discard_write_zeroes *desc;
	struct iovec *iov;
	uint32_t type;
	uint32_t payload_len;
	uint64_t flush_bytes;
	int rc;

	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
		/* Only READ and WRITE are supported for now. */
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* First descriptor must be the virtio_blk_outhdr request header. */
	iov = &task->iovs[0];
	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
			      iov->iov_len, sizeof(*req), task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	req = iov->iov_base;

	/* Last descriptor must be the 1-byte status buffer. */
	iov = &task->iovs[task->iovcnt - 1];
	if (spdk_unlikely(iov->iov_len != 1)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
			      iov->iov_len, 1, task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* Strip header and status from the payload accounting; the data
	 * buffers are task->iovs[1 .. iovcnt] after this point. */
	task->status = iov->iov_base;
	payload_len -= sizeof(*req) + sizeof(*task->status);
	task->iovcnt -= 2;

	type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
	/* Don't care about barrier for now (as QEMU's virtio-blk does). */
	type &= ~VIRTIO_BLK_T_BARRIER;
#endif

	switch (type) {
	case VIRTIO_BLK_T_IN:
	case VIRTIO_BLK_T_OUT:
		/* Payload must be a non-zero multiple of the 512-byte virtio sector. */
		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
				    type ? "WRITE" : "READ", task->req_idx);
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		if (type == VIRTIO_BLK_T_IN) {
			/* Read: data buffers + status byte are written back to the guest. */
			task->used_len = payload_len + sizeof(*task->status);
			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
					     &task->iovs[1], task->iovcnt, req->sector * 512,
					     payload_len, blk_request_complete_cb, task);
		} else if (!bvdev->readonly) {
			/* Write: only the status byte is written back. */
			task->used_len = sizeof(*task->status);
			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
					      &task->iovs[1], task->iovcnt, req->sector * 512,
					      payload_len, blk_request_complete_cb, task);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
			rc = -1;
		}

		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_DISCARD:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
				     desc->sector * 512, desc->num_sectors * 512,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_WRITE_ZEROES:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		/* Zeroed and Unmap the range, SPDK doesn't support it. */
		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
					    desc->sector * 512, desc->num_sectors * 512,
					    blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_FLUSH:
		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
		if (req->sector != 0) {
			SPDK_NOTICELOG("sector must be zero for flush command\n");
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}
		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
				     0, flush_bytes,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_GET_ID:
		if (!task->iovcnt || !payload_len) {
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}
		/* Return the bdev product name as the device ID, space-padded. */
		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
				task->used_len, ' ');
		blk_request_finish(true, task);
		break;
	default:
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	return 0;
}

/* Drain up to 32 available requests from one virtqueue and submit each. */
static void
process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct
	spdk_vhost_blk_task *task;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	int rc;
	uint16_t reqs[32];
	uint16_t reqs_cnt, i;

	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
	if (!reqs_cnt) {
		return;
	}

	for (i = 0; i < reqs_cnt; i++) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
			      reqs[i]);

		/* Guard against a malicious/buggy guest handing out-of-range indices. */
		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
				    vsession->name, reqs[i], vq->vring.size);
			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
			continue;
		}

		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
		if (spdk_unlikely(task->used)) {
			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
				    vsession->name, reqs[i]);
			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
			continue;
		}

		vsession->task_cnt++;

		/* Reset per-request fields before parsing the descriptor chain. */
		task->used = true;
		task->iovcnt = SPDK_COUNTOF(task->iovs);
		task->status = NULL;
		task->used_len = 0;

		rc = process_blk_request(task, bvsession, vq);
		if (rc == 0) {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
				      reqs[i]);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
		}
	}
}

/* Request-queue poller for sessions with a live bdev. */
static int
vdev_worker(void *arg)
{
	struct spdk_vhost_blk_session *bvsession = arg;
	struct spdk_vhost_session *vsession = &bvsession->vsession;

	uint16_t q_idx;

	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
		process_vq(bvsession, &vsession->virtqueue[q_idx]);
	}

	vhost_session_used_signal(vsession);

	return -1;
}

/* After bdev hot-remove: fail every incoming request with VIRTIO_BLK_S_IOERR
 * without touching the (now gone) bdev.
 */
static void
no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
	uint32_t length;
	uint16_t iovcnt, req_idx;

	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
		return;
	}

	iovcnt = SPDK_COUNTOF(iovs);
	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
	}

	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
}

/* Poller used after hot-remove: aborts requests and releases the io_channel
 * once all in-flight tasks have drained.
 */
static int
no_bdev_vdev_worker(void *arg)
{
	struct spdk_vhost_blk_session *bvsession = arg;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	uint16_t q_idx;

	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
	}

	vhost_session_used_signal(vsession);

	if (vsession->task_cnt == 0 && bvsession->io_channel) {
		spdk_put_io_channel(bvsession->io_channel);
		bvsession->io_channel = NULL;
	}

	return -1;
}

/* Downcast helper; valid only for sessions of a vhost-blk device. */
static struct spdk_vhost_blk_session *
to_blk_session(struct spdk_vhost_session *vsession)
{
	assert(vsession->vdev->backend == &vhost_blk_device_backend);
	return (struct spdk_vhost_blk_session *)vsession;
}

/* Downcast helper; returns NULL for NULL or non-blk devices. */
static struct spdk_vhost_blk_dev *
to_blk_dev(struct spdk_vhost_dev *vdev)
{
	if (vdev == NULL) {
		return NULL;
	}

	if (vdev->backend != &vhost_blk_device_backend) {
		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
		return NULL;
	}

	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
}

struct spdk_bdev *
spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);

	assert(bvdev != NULL);
	return bvdev->bdev;
}

static void
vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
{

	/* All sessions have been notified, time to close the bdev */
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);

	assert(bvdev != NULL);
	spdk_bdev_close(bvdev->bdev_desc);
	bvdev->bdev_desc = NULL;
	bvdev->bdev = NULL;
}

/* Per-session hot-remove handler: swap the request poller for the
 * no-bdev variant so further guest requests are failed gracefully.
 */
static int
vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
			     struct spdk_vhost_session *vsession,
			     void *ctx)
{
	struct spdk_vhost_blk_session *bvsession;

	bvsession = (struct spdk_vhost_blk_session *)vsession;
	if (bvsession->requestq_poller) {
		spdk_poller_unregister(&bvsession->requestq_poller);
		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
	}

	return 0;
}

/* bdev hot-remove callback registered via spdk_bdev_open(). */
static void
bdev_remove_cb(void *remove_ctx)
{
	struct spdk_vhost_blk_dev *bvdev = remove_ctx;

	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
		     bvdev->vdev.name);

	spdk_vhost_lock();
	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
				  vhost_dev_bdev_remove_cpl_cb, NULL);
	spdk_vhost_unlock();
}

/* Free the per-virtqueue task arrays allocated by alloc_task_pool(). */
static void
free_task_pool(struct spdk_vhost_blk_session *bvsession)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_virtqueue *vq;
	uint16_t i;

	for (i = 0; i < vsession->max_queues; i++) {
		vq = &vsession->virtqueue[i];
		if (vq->tasks == NULL) {
			continue;
		}

		spdk_free(vq->tasks);
		vq->tasks = NULL;
	}
}

/* Pre-allocate one task per descriptor slot for every active virtqueue. */
static int
alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_virtqueue *vq;
	struct spdk_vhost_blk_task *task;
	uint32_t task_cnt;
	uint16_t i;
	uint32_t j;

	for (i = 0; i < vsession->max_queues; i++) {
		vq = &vsession->virtqueue[i];
		if (vq->vring.desc == NULL) {
			continue;
		}

		task_cnt = vq->vring.size;
		if (task_cnt >
SPDK_VHOST_MAX_VQ_SIZE) { 638 /* sanity check */ 639 SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", 640 vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); 641 free_task_pool(bvsession); 642 return -1; 643 } 644 vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, 645 SPDK_CACHE_LINE_SIZE, NULL, 646 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 647 if (vq->tasks == NULL) { 648 SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", 649 vsession->name, task_cnt, i); 650 free_task_pool(bvsession); 651 return -1; 652 } 653 654 for (j = 0; j < task_cnt; j++) { 655 task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; 656 task->bvsession = bvsession; 657 task->req_idx = j; 658 task->vq = vq; 659 } 660 } 661 662 return 0; 663 } 664 665 static int 666 vhost_blk_start_cb(struct spdk_vhost_dev *vdev, 667 struct spdk_vhost_session *vsession, void *unused) 668 { 669 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 670 struct spdk_vhost_blk_dev *bvdev; 671 int i, rc = 0; 672 673 bvdev = to_blk_dev(vdev); 674 assert(bvdev != NULL); 675 bvsession->bvdev = bvdev; 676 677 /* validate all I/O queues are in a contiguous index range */ 678 for (i = 0; i < vsession->max_queues; i++) { 679 if (vsession->virtqueue[i].vring.desc == NULL) { 680 SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); 681 rc = -1; 682 goto out; 683 } 684 } 685 686 rc = alloc_task_pool(bvsession); 687 if (rc != 0) { 688 SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); 689 goto out; 690 } 691 692 if (bvdev->bdev) { 693 bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); 694 if (!bvsession->io_channel) { 695 free_task_pool(bvsession); 696 SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); 697 rc = -1; 698 goto out; 699 } 700 } 701 702 bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, 703 bvsession, 0); 704 SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", 705 vsession->name, spdk_env_get_current_core()); 706 out: 707 vhost_session_start_done(vsession, rc); 708 return rc; 709 } 710 711 static int 712 vhost_blk_start(struct spdk_vhost_session *vsession) 713 { 714 struct vhost_poll_group *pg; 715 716 pg = vhost_get_poll_group(vsession->vdev->cpumask); 717 return vhost_session_send_event(pg, vsession, vhost_blk_start_cb, 718 3, "start session"); 719 } 720 721 static int 722 destroy_session_poller_cb(void *arg) 723 { 724 struct spdk_vhost_blk_session *bvsession = arg; 725 struct spdk_vhost_session *vsession = &bvsession->vsession; 726 int i; 727 728 if (vsession->task_cnt > 0) { 729 return -1; 730 } 731 732 if (spdk_vhost_trylock() != 0) { 733 return -1; 734 } 735 736 for (i = 0; i < vsession->max_queues; i++) { 737 vsession->virtqueue[i].next_event_time = 0; 738 vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); 739 } 740 741 SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", 742 vsession->name, spdk_env_get_current_core()); 743 744 if (bvsession->io_channel) { 745 spdk_put_io_channel(bvsession->io_channel); 746 bvsession->io_channel = NULL; 747 } 748 749 free_task_pool(bvsession); 750 spdk_poller_unregister(&bvsession->stop_poller); 751 vhost_session_stop_done(vsession, 0); 752 753 spdk_vhost_unlock(); 754 return -1; 755 } 756 757 static int 758 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, 759 struct spdk_vhost_session *vsession, void *unused) 760 { 761 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 762 763 spdk_poller_unregister(&bvsession->requestq_poller); 764 bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb, 765 bvsession, 1000); 766 return 0; 767 } 768 769 static int 770 vhost_blk_stop(struct spdk_vhost_session *vsession) 771 { 772 return vhost_session_send_event(vsession->poll_group, vsession, 773 vhost_blk_stop_cb, 3, "stop 
session"); 774 } 775 776 static void 777 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 778 { 779 struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev); 780 struct spdk_vhost_blk_dev *bvdev; 781 782 bvdev = to_blk_dev(vdev); 783 assert(bvdev != NULL); 784 spdk_json_write_named_object_begin(w, "block"); 785 786 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 787 788 spdk_json_write_name(w, "bdev"); 789 if (bdev) { 790 spdk_json_write_string(w, spdk_bdev_get_name(bdev)); 791 } else { 792 spdk_json_write_null(w); 793 } 794 795 spdk_json_write_object_end(w); 796 } 797 798 static void 799 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 800 { 801 struct spdk_vhost_blk_dev *bvdev; 802 803 bvdev = to_blk_dev(vdev); 804 assert(bvdev != NULL); 805 if (!bvdev->bdev) { 806 return; 807 } 808 809 spdk_json_write_object_begin(w); 810 spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); 811 812 spdk_json_write_named_object_begin(w, "params"); 813 spdk_json_write_named_string(w, "ctrlr", vdev->name); 814 spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); 815 spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask)); 816 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 817 spdk_json_write_object_end(w); 818 819 spdk_json_write_object_end(w); 820 } 821 822 static int vhost_blk_destroy(struct spdk_vhost_dev *dev); 823 824 static int 825 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, 826 uint32_t len) 827 { 828 struct virtio_blk_config blkcfg; 829 struct spdk_vhost_blk_dev *bvdev; 830 struct spdk_bdev *bdev; 831 uint32_t blk_size; 832 uint64_t blkcnt; 833 834 bvdev = to_blk_dev(vdev); 835 assert(bvdev != NULL); 836 bdev = bvdev->bdev; 837 if (bdev == NULL) { 838 /* We can't just return -1 here as this GET_CONFIG message might 839 * be caused by a QEMU VM reboot. 
Returning -1 will indicate an 840 * error to QEMU, who might then decide to terminate itself. 841 * We don't want that. A simple reboot shouldn't break the system. 842 * 843 * Presenting a block device with block size 0 and block count 0 844 * doesn't cause any problems on QEMU side and the virtio-pci 845 * device is even still available inside the VM, but there will 846 * be no block device created for it - the kernel drivers will 847 * silently reject it. 848 */ 849 blk_size = 0; 850 blkcnt = 0; 851 } else { 852 blk_size = spdk_bdev_get_block_size(bdev); 853 blkcnt = spdk_bdev_get_num_blocks(bdev); 854 if (spdk_bdev_get_buf_align(bdev) > 1) { 855 blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; 856 blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); 857 } else { 858 blkcfg.size_max = 131072; 859 /* -2 for REQ and RESP and -1 for region boundary splitting */ 860 blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; 861 } 862 } 863 864 memset(&blkcfg, 0, sizeof(blkcfg)); 865 blkcfg.blk_size = blk_size; 866 /* minimum I/O size in blocks */ 867 blkcfg.min_io_size = 1; 868 /* expressed in 512 Bytes sectors */ 869 blkcfg.capacity = (blkcnt * blk_size) / 512; 870 /* QEMU can overwrite this value when started */ 871 blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; 872 873 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 874 /* 16MiB, expressed in 512 Bytes */ 875 blkcfg.max_discard_sectors = 32768; 876 blkcfg.max_discard_seg = 1; 877 blkcfg.discard_sector_alignment = blk_size / 512; 878 } 879 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 880 blkcfg.max_write_zeroes_sectors = 32768; 881 blkcfg.max_write_zeroes_seg = 1; 882 } 883 884 memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); 885 886 return 0; 887 } 888 889 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { 890 .virtio_features = SPDK_VHOST_FEATURES | 891 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << 
			   VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
	.start_session = vhost_blk_start,
	.stop_session = vhost_blk_stop,
	.vhost_get_config = vhost_blk_get_config,
	.dump_info_json = vhost_blk_dump_info_json,
	.write_config_json = vhost_blk_write_config_json,
	.remove_device = vhost_blk_destroy,
};

/* Create controllers for every [VhostBlkN] section of the legacy config file. */
int
vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp;
	unsigned ctrlr_num;
	char *bdev_name;
	char *cpumask;
	char *name;
	bool readonly;

	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			continue;
		}

		if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    spdk_conf_section_get_name(sp));
			return -1;
		}

		name = spdk_conf_section_get_val(sp, "Name");
		if (name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
			return -1;
		}

		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);

		/* A section without a Dev entry is silently skipped. */
		bdev_name = spdk_conf_section_get_val(sp, "Dev");
		if (bdev_name == NULL) {
			continue;
		}

		if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
			return -1;
		}
	}

	return 0;
}

/* Create a vhost-blk controller backed by bdev `dev_name`.
 * Returns 0 on success or a negative errno; on any failure the device is
 * unwound (bdev closed, device unregistered, memory freed).
 */
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
{
	struct spdk_vhost_blk_dev *bvdev = NULL;
	struct spdk_bdev *bdev;
	uint64_t features = 0;
	int ret = 0;

	spdk_vhost_lock();
	bdev = spdk_bdev_get_by_name(dev_name);
	if (bdev == NULL) {
		SPDK_ERRLOG("%s: bdev '%s' not found\n",
			    name, dev_name);
		ret = -ENODEV;
		goto out;
	}

	bvdev = calloc(1, sizeof(*bvdev));
	if (bvdev == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
	if (ret != 0) {
		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
			    name, dev_name, ret);
		goto out;
	}

	bvdev->bdev = bdev;
	bvdev->readonly = readonly;
	ret = vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
	if (ret != 0) {
		spdk_bdev_close(bvdev->bdev_desc);
		goto out;
	}

	/* Advertise only the optional features the backing bdev can honor. */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
	}
	if (readonly) {
		features |= (1ULL << VIRTIO_BLK_F_RO);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
		features |= (1ULL << VIRTIO_BLK_F_FLUSH);
	}

	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
		SPDK_ERRLOG("%s: failed to enable features 0x%"PRIx64"\n", name, features);

		if (vhost_dev_unregister(&bvdev->vdev) != 0) {
			SPDK_ERRLOG("%s: failed to remove device\n", name);
		}

		spdk_bdev_close(bvdev->bdev_desc);
		ret = -1;
		goto out;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
out:
	if (ret != 0 && bvdev) {
		free(bvdev);
	}
	spdk_vhost_unlock();
	return ret;
}

/* Backend remove_device callback: unregister the controller and close the
 * bdev descriptor if it is still open (it may already be NULL after hot-remove).
 */
static int
vhost_blk_destroy(struct spdk_vhost_dev *vdev)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
	int rc;

	assert(bvdev != NULL);
	rc = vhost_dev_unregister(&bvdev->vdev);
	if (rc != 0) {
		return rc;
	}

	if (bvdev->bdev_desc) {
		spdk_bdev_close(bvdev->bdev_desc);
		bvdev->bdev_desc = NULL;
	}
	bvdev->bdev = NULL;

	free(bvdev);
	return 0;
}

SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)