1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <linux/virtio_blk.h> 35 36 #include "spdk/env.h" 37 #include "spdk/bdev.h" 38 #include "spdk/bdev_module.h" 39 #include "spdk/thread.h" 40 #include "spdk/likely.h" 41 #include "spdk/string.h" 42 #include "spdk/util.h" 43 #include "spdk/vhost.h" 44 45 #include "vhost_internal.h" 46 #include <rte_version.h> 47 48 /* Minimal set of features supported by every SPDK VHOST-BLK device */ 49 #define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ 50 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ 51 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ 52 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ 53 (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 54 (1ULL << VIRTIO_BLK_F_MQ)) 55 56 /* Not supported features */ 57 #define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ 58 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 59 (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) 60 61 /* Vhost-blk support protocol features */ 62 #define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ 63 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) 64 65 struct spdk_vhost_blk_task { 66 struct spdk_bdev_io *bdev_io; 67 struct spdk_vhost_blk_session *bvsession; 68 struct spdk_vhost_virtqueue *vq; 69 70 volatile uint8_t *status; 71 72 uint16_t req_idx; 73 uint16_t num_descs; 74 uint16_t buffer_id; 75 uint16_t inflight_head; 76 77 /* for io wait */ 78 struct spdk_bdev_io_wait_entry bdev_io_wait; 79 80 /* If set, the task is currently used for I/O processing. */ 81 bool used; 82 83 /** Number of bytes that were written. */ 84 uint32_t used_len; 85 uint16_t iovcnt; 86 struct iovec iovs[SPDK_VHOST_IOVS_MAX]; 87 88 /** Size of whole payload in bytes */ 89 uint32_t payload_size; 90 }; 91 92 struct spdk_vhost_blk_dev { 93 struct spdk_vhost_dev vdev; 94 struct spdk_bdev *bdev; 95 struct spdk_bdev_desc *bdev_desc; 96 /* dummy_io_channel is used to hold a bdev reference */ 97 struct spdk_io_channel *dummy_io_channel; 98 bool readonly; 99 }; 100 101 struct spdk_vhost_blk_session { 102 /* The parent session must be the very first field in this struct */ 103 struct spdk_vhost_session vsession; 104 struct spdk_vhost_blk_dev *bvdev; 105 struct spdk_poller *requestq_poller; 106 struct spdk_io_channel *io_channel; 107 struct spdk_poller *stop_poller; 108 }; 109 110 /* forward declaration */ 111 static const struct spdk_vhost_dev_backend vhost_blk_device_backend; 112 113 static int 114 process_blk_request(struct spdk_vhost_blk_task *task, 115 struct spdk_vhost_blk_session *bvsession); 116 117 static struct spdk_vhost_blk_session * 118 to_blk_session(struct spdk_vhost_session *vsession) 119 { 120 assert(vsession->vdev->backend == &vhost_blk_device_backend); 121 return (struct spdk_vhost_blk_session *)vsession; 122 } 123 124 static void 125 blk_task_finish(struct spdk_vhost_blk_task *task) 126 { 127 assert(task->bvsession->vsession.task_cnt > 0); 128 task->bvsession->vsession.task_cnt--; 129 task->used = false; 130 } 131 132 static void 133 blk_task_init(struct spdk_vhost_blk_task *task) 134 { 135 task->used = true; 136 task->iovcnt = SPDK_COUNTOF(task->iovs); 137 task->status = NULL; 138 task->used_len = 0; 139 task->payload_size = 0; 140 } 141 142 static void 143 blk_task_enqueue(struct spdk_vhost_blk_task *task) 144 { 145 if (task->vq->packed.packed_ring) { 146 vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq, 147 task->num_descs, 148 task->buffer_id, task->used_len, 149 task->inflight_head); 150 } else { 151 vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, 152 task->req_idx, task->used_len); 153 } 154 } 155 156 static void 157 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) 158 { 159 if (task->status) { 160 *task->status = status; 161 } 162 163 blk_task_enqueue(task); 164 blk_task_finish(task); 165 SPDK_DEBUGLOG(vhost_blk_data, "Invalid request (status=%" PRIu8")\n", status); 166 } 167 168 /* 169 * Process task's descriptor chain and setup data related fields. 170 * Return 171 * total size of suplied buffers 172 * 173 * FIXME: Make this function return to rd_cnt and wr_cnt 174 */ 175 static int 176 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession, 177 struct spdk_vhost_virtqueue *vq, 178 uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 179 { 180 struct spdk_vhost_session *vsession = &bvsession->vsession; 181 struct spdk_vhost_dev *vdev = vsession->vdev; 182 struct vring_desc *desc, *desc_table; 183 uint16_t out_cnt = 0, cnt = 0; 184 uint32_t desc_table_size, len = 0; 185 uint32_t desc_handled_cnt; 186 int rc; 187 188 rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); 189 if (rc != 0) { 190 SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 191 return -1; 192 } 193 194 desc_handled_cnt = 0; 195 while (1) { 196 /* 197 * Maximum cnt reached? 198 * Should not happen if request is well formatted, otherwise this is a BUG. 199 */ 200 if (spdk_unlikely(cnt == *iovs_cnt)) { 201 SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 202 vsession->name, req_idx); 203 return -1; 204 } 205 206 if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { 207 SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 208 vsession->name, req_idx, cnt); 209 return -1; 210 } 211 212 len += desc->len; 213 214 out_cnt += vhost_vring_desc_is_wr(desc); 215 216 rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); 217 if (rc != 0) { 218 SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", 219 vsession->name, req_idx); 220 return -1; 221 } else if (desc == NULL) { 222 break; 223 } 224 225 desc_handled_cnt++; 226 if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { 227 /* Break a cycle and report an error, if any. */ 228 SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", 229 vsession->name, desc_table_size, desc_handled_cnt); 230 return -1; 231 } 232 } 233 234 /* 235 * There must be least two descriptors. 236 * First contain request so it must be readable. 237 * Last descriptor contain buffer for response so it must be writable. 238 */ 239 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 240 return -1; 241 } 242 243 *length = len; 244 *iovs_cnt = cnt; 245 return 0; 246 } 247 248 static int 249 blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession, 250 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 251 struct vring_packed_desc *desc_table, uint16_t desc_table_size, 252 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 253 { 254 struct vring_packed_desc *desc; 255 uint16_t cnt = 0, out_cnt = 0; 256 uint32_t len = 0; 257 258 if (desc_table == NULL) { 259 desc = &vq->vring.desc_packed[req_idx]; 260 } else { 261 req_idx = 0; 262 desc = desc_table; 263 } 264 265 while (1) { 266 /* 267 * Maximum cnt reached? 268 * Should not happen if request is well formatted, otherwise this is a BUG. 269 */ 270 if (spdk_unlikely(cnt == *iovs_cnt)) { 271 SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 272 vsession->name, req_idx); 273 return -EINVAL; 274 } 275 276 if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) { 277 SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 278 vsession->name, req_idx, cnt); 279 return -EINVAL; 280 } 281 282 len += desc->len; 283 out_cnt += vhost_vring_packed_desc_is_wr(desc); 284 285 /* desc is NULL means we reach the last desc of this request */ 286 vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size); 287 if (desc == NULL) { 288 break; 289 } 290 } 291 292 /* 293 * There must be least two descriptors. 294 * First contain request so it must be readable. 295 * Last descriptor contain buffer for response so it must be writable. 296 */ 297 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 298 return -EINVAL; 299 } 300 301 *length = len; 302 *iovs_cnt = cnt; 303 304 return 0; 305 } 306 307 static int 308 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession, 309 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 310 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 311 { 312 struct spdk_vhost_session *vsession = &bvsession->vsession; 313 struct spdk_vhost_dev *vdev = vsession->vdev; 314 struct vring_packed_desc *desc = NULL, *desc_table; 315 uint32_t desc_table_size; 316 int rc; 317 318 rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc, 319 &desc_table, &desc_table_size); 320 if (spdk_unlikely(rc != 0)) { 321 SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 322 return rc; 323 } 324 325 return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size, 326 iovs, iovs_cnt, length); 327 } 328 329 static int 330 blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession, 331 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 332 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 333 { 334 struct spdk_vhost_session *vsession = &bvsession->vsession; 335 struct spdk_vhost_dev *vdev = vsession->vdev; 336 spdk_vhost_inflight_desc *inflight_desc; 337 struct vring_packed_desc *desc_table; 338 uint16_t out_cnt = 0, cnt = 0; 339 uint32_t desc_table_size, len = 0; 340 int rc = 0; 341 342 rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc, 343 req_idx, &inflight_desc, &desc_table, &desc_table_size); 344 if (spdk_unlikely(rc != 0)) { 345 SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 346 return rc; 347 } 348 349 if (desc_table != NULL) { 350 return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size, 351 iovs, iovs_cnt, length); 352 } 353 354 while (1) { 355 /* 356 * Maximum cnt reached? 357 * Should not happen if request is well formatted, otherwise this is a BUG. 358 */ 359 if (spdk_unlikely(cnt == *iovs_cnt)) { 360 SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 361 vsession->name, req_idx); 362 return -EINVAL; 363 } 364 365 if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) { 366 SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 367 vsession->name, req_idx, cnt); 368 return -EINVAL; 369 } 370 371 len += inflight_desc->len; 372 out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc); 373 374 /* Without F_NEXT means it's the last desc */ 375 if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) { 376 break; 377 } 378 379 inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next]; 380 } 381 382 /* 383 * There must be least two descriptors. 384 * First contain request so it must be readable. 385 * Last descriptor contain buffer for response so it must be writable. 386 */ 387 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 388 return -EINVAL; 389 } 390 391 *length = len; 392 *iovs_cnt = cnt; 393 394 return 0; 395 } 396 397 static void 398 blk_request_finish(bool success, struct spdk_vhost_blk_task *task) 399 { 400 *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; 401 402 blk_task_enqueue(task); 403 404 SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %s\n", task, 405 task->req_idx, success ? "OK" : "FAIL"); 406 blk_task_finish(task); 407 } 408 409 static void 410 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 411 { 412 struct spdk_vhost_blk_task *task = cb_arg; 413 414 spdk_bdev_free_io(bdev_io); 415 blk_request_finish(success, task); 416 } 417 418 static void 419 blk_request_resubmit(void *arg) 420 { 421 struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; 422 int rc = 0; 423 424 rc = process_blk_request(task, task->bvsession); 425 if (rc == 0) { 426 SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task); 427 } else { 428 SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task); 429 } 430 } 431 432 static inline void 433 blk_request_queue_io(struct spdk_vhost_blk_task *task) 434 { 435 int rc; 436 struct spdk_vhost_blk_session *bvsession = task->bvsession; 437 struct spdk_bdev *bdev = bvsession->bvdev->bdev; 438 439 task->bdev_io_wait.bdev = bdev; 440 task->bdev_io_wait.cb_fn = blk_request_resubmit; 441 task->bdev_io_wait.cb_arg = task; 442 443 rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait); 444 if (rc != 0) { 445 SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc); 446 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 447 } 448 } 449 450 static int 451 process_blk_request(struct spdk_vhost_blk_task *task, 452 struct spdk_vhost_blk_session *bvsession) 453 { 454 struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev; 455 const struct virtio_blk_outhdr *req; 456 struct virtio_blk_discard_write_zeroes *desc; 457 struct iovec *iov; 458 uint32_t type; 459 uint64_t flush_bytes; 460 uint32_t payload_len; 461 int rc; 462 463 iov = &task->iovs[0]; 464 if (spdk_unlikely(iov->iov_len != sizeof(*req))) { 465 SPDK_DEBUGLOG(vhost_blk, 466 "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", 467 iov->iov_len, sizeof(*req), task->req_idx); 468 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 469 return -1; 470 } 471 472 req = iov->iov_base; 473 474 iov = &task->iovs[task->iovcnt - 1]; 475 if (spdk_unlikely(iov->iov_len != 1)) { 476 SPDK_DEBUGLOG(vhost_blk, 477 "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", 478 iov->iov_len, 1, task->req_idx); 479 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 480 return -1; 481 } 482 483 payload_len = task->payload_size; 484 task->status = iov->iov_base; 485 payload_len -= sizeof(*req) + sizeof(*task->status); 486 task->iovcnt -= 2; 487 488 type = req->type; 489 #ifdef VIRTIO_BLK_T_BARRIER 490 /* Don't care about barier for now (as QEMU's virtio-blk do). */ 491 type &= ~VIRTIO_BLK_T_BARRIER; 492 #endif 493 494 switch (type) { 495 case VIRTIO_BLK_T_IN: 496 case VIRTIO_BLK_T_OUT: 497 if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { 498 SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", 499 type ? "WRITE" : "READ", task->req_idx); 500 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 501 return -1; 502 } 503 504 if (type == VIRTIO_BLK_T_IN) { 505 task->used_len = payload_len + sizeof(*task->status); 506 rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel, 507 &task->iovs[1], task->iovcnt, req->sector * 512, 508 payload_len, blk_request_complete_cb, task); 509 } else if (!bvdev->readonly) { 510 task->used_len = sizeof(*task->status); 511 rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel, 512 &task->iovs[1], task->iovcnt, req->sector * 512, 513 payload_len, blk_request_complete_cb, task); 514 } else { 515 SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n"); 516 rc = -1; 517 } 518 519 if (rc) { 520 if (rc == -ENOMEM) { 521 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 522 blk_request_queue_io(task); 523 } else { 524 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 525 return -1; 526 } 527 } 528 break; 529 case VIRTIO_BLK_T_DISCARD: 530 desc = task->iovs[1].iov_base; 531 if (payload_len != sizeof(*desc)) { 532 SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); 533 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 534 return -1; 535 } 536 537 if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 538 SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n"); 539 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 540 return -1; 541 } 542 543 rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel, 544 desc->sector * 512, desc->num_sectors * 512, 545 blk_request_complete_cb, task); 546 if (rc) { 547 if (rc == -ENOMEM) { 548 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 549 blk_request_queue_io(task); 550 } else { 551 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 552 return -1; 553 } 554 } 555 break; 556 case VIRTIO_BLK_T_WRITE_ZEROES: 557 desc = task->iovs[1].iov_base; 558 if (payload_len != sizeof(*desc)) { 559 SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); 560 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 561 return -1; 562 } 563 564 /* Unmap this range, SPDK doesn't support it, kernel will enable this flag by default 565 * without checking unmap feature is negociated or not, the flag isn't mandatory, so 566 * just print a warning. 567 */ 568 if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 569 SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n", 570 (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512); 571 } 572 573 rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel, 574 desc->sector * 512, desc->num_sectors * 512, 575 blk_request_complete_cb, task); 576 if (rc) { 577 if (rc == -ENOMEM) { 578 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 579 blk_request_queue_io(task); 580 } else { 581 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 582 return -1; 583 } 584 } 585 break; 586 case VIRTIO_BLK_T_FLUSH: 587 flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev); 588 if (req->sector != 0) { 589 SPDK_NOTICELOG("sector must be zero for flush command\n"); 590 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 591 return -1; 592 } 593 rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel, 594 0, flush_bytes, 595 blk_request_complete_cb, task); 596 if (rc) { 597 if (rc == -ENOMEM) { 598 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 599 blk_request_queue_io(task); 600 } else { 601 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 602 return -1; 603 } 604 } 605 break; 606 case VIRTIO_BLK_T_GET_ID: 607 if (!task->iovcnt || !payload_len) { 608 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 609 return -1; 610 } 611 task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); 612 spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev), 613 task->used_len, ' '); 614 blk_request_finish(true, task); 615 break; 616 default: 617 SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type); 618 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 619 return -1; 620 } 621 622 return 0; 623 } 624 625 static void 626 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) 627 { 628 struct spdk_vhost_blk_task *task; 629 int rc; 630 631 assert(vq->packed.packed_ring == false); 632 633 task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx]; 634 if (spdk_unlikely(task->used)) { 635 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 636 task->bvsession->vsession.name, req_idx); 637 task->used_len = 0; 638 blk_task_enqueue(task); 639 return; 640 } 641 642 task->bvsession->vsession.task_cnt++; 643 644 blk_task_init(task); 645 646 rc = blk_iovs_split_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, 647 &task->payload_size); 648 649 if (rc) { 650 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 651 /* Only READ and WRITE are supported for now. */ 652 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 653 return; 654 } 655 656 if (process_blk_request(task, task->bvsession) == 0) { 657 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 658 req_idx); 659 } else { 660 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx); 661 } 662 } 663 664 static void 665 process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) 666 { 667 struct spdk_vhost_blk_task *task; 668 uint16_t task_idx = req_idx, num_descs; 669 int rc; 670 671 assert(vq->packed.packed_ring); 672 673 /* Packed ring used the buffer_id as the task_idx to get task struct. 674 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value 675 * must be in the range of 0 ~ vring.size. The free_head value must be unique 676 * in the outstanding requests. 677 * We can't use the req_idx as the task_idx because the desc can be reused in 678 * the next phase even when it's not completed in the previous phase. For example, 679 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving 680 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used 681 * as task_idx because we will know task[0]->used is true at phase 1. 682 * The split queue is quite different, the desc would insert into the free list when 683 * device completes the request, the driver gets the desc from the free list which 684 * ensures the req_idx is unique in the outstanding requests. 685 */ 686 task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); 687 688 task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; 689 if (spdk_unlikely(task->used)) { 690 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 691 task->bvsession->vsession.name, task_idx); 692 task->used_len = 0; 693 blk_task_enqueue(task); 694 return; 695 } 696 697 task->req_idx = req_idx; 698 task->num_descs = num_descs; 699 task->buffer_id = task_idx; 700 701 rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx, 702 req_idx, (req_idx + num_descs - 1) % vq->vring.size, 703 &task->inflight_head); 704 705 task->bvsession->vsession.task_cnt++; 706 707 blk_task_init(task); 708 709 rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, 710 &task->payload_size); 711 if (rc) { 712 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 713 /* Only READ and WRITE are supported for now. */ 714 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 715 return; 716 } 717 718 if (process_blk_request(task, task->bvsession) == 0) { 719 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 720 task_idx); 721 } else { 722 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); 723 } 724 } 725 726 static void 727 process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq, 728 uint16_t req_idx) 729 { 730 spdk_vhost_inflight_desc *desc_array = vq->vring_inflight.inflight_packed->desc; 731 spdk_vhost_inflight_desc *desc = &desc_array[req_idx]; 732 struct spdk_vhost_blk_task *task; 733 uint16_t task_idx, num_descs; 734 int rc; 735 736 task_idx = desc_array[desc->last].id; 737 num_descs = desc->num; 738 /* In packed ring reconnection, we use the last_used_idx as the 739 * initial value. So when we process the inflight descs we still 740 * need to update the available ring index. 741 */ 742 vq->last_avail_idx += num_descs; 743 if (vq->last_avail_idx >= vq->vring.size) { 744 vq->last_avail_idx -= vq->vring.size; 745 vq->packed.avail_phase = !vq->packed.avail_phase; 746 } 747 748 task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; 749 if (spdk_unlikely(task->used)) { 750 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 751 task->bvsession->vsession.name, task_idx); 752 task->used_len = 0; 753 blk_task_enqueue(task); 754 return; 755 } 756 757 task->req_idx = req_idx; 758 task->num_descs = num_descs; 759 task->buffer_id = task_idx; 760 /* It's for cleaning inflight entries */ 761 task->inflight_head = req_idx; 762 763 task->bvsession->vsession.task_cnt++; 764 765 blk_task_init(task); 766 767 rc = blk_iovs_inflight_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, 768 &task->payload_size); 769 if (rc) { 770 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 771 /* Only READ and WRITE are supported for now. */ 772 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 773 return; 774 } 775 776 if (process_blk_request(task, task->bvsession) == 0) { 777 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 778 task_idx); 779 } else { 780 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); 781 } 782 } 783 784 static void 785 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, 786 struct spdk_vhost_virtqueue *vq) 787 { 788 struct spdk_vhost_session *vsession = &bvsession->vsession; 789 spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight; 790 spdk_vhost_resubmit_desc *resubmit_list; 791 uint16_t req_idx; 792 793 if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) { 794 return; 795 } 796 797 resubmit_list = resubmit->resubmit_list; 798 while (resubmit->resubmit_num-- > 0) { 799 req_idx = resubmit_list[resubmit->resubmit_num].index; 800 SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16"======\n", 801 req_idx); 802 803 if (spdk_unlikely(req_idx >= vq->vring.size)) { 804 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 805 vsession->name, req_idx, vq->vring.size); 806 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 807 continue; 808 } 809 810 if (vq->packed.packed_ring) { 811 process_packed_inflight_blk_task(vq, req_idx); 812 } else { 813 process_blk_task(vq, req_idx); 814 } 815 } 816 817 free(resubmit_list); 818 resubmit->resubmit_list = NULL; 819 } 820 821 static void 822 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 823 { 824 struct spdk_vhost_session *vsession = &bvsession->vsession; 825 uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS]; 826 uint16_t reqs_cnt, i; 827 828 submit_inflight_desc(bvsession, vq); 829 830 reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); 831 if (!reqs_cnt) { 832 return; 833 } 834 835 for (i = 0; i < reqs_cnt; i++) { 836 SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n", 837 reqs[i]); 838 839 if (spdk_unlikely(reqs[i] >= vq->vring.size)) { 840 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 841 vsession->name, reqs[i], vq->vring.size); 842 vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); 843 continue; 844 } 845 846 rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]); 847 848 process_blk_task(vq, reqs[i]); 849 } 850 } 851 852 static void 853 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 854 { 855 uint16_t i = 0; 856 857 submit_inflight_desc(bvsession, vq); 858 859 while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS && 860 vhost_vq_packed_ring_is_avail(vq)) { 861 SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n", 862 vq->last_avail_idx); 863 864 process_packed_blk_task(vq, vq->last_avail_idx); 865 } 866 } 867 868 static int 869 _vdev_vq_worker(struct spdk_vhost_virtqueue *vq) 870 { 871 struct spdk_vhost_session *vsession = vq->vsession; 872 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 873 bool packed_ring; 874 875 packed_ring = vq->packed.packed_ring; 876 if (packed_ring) { 877 process_packed_vq(bvsession, vq); 878 } else { 879 process_vq(bvsession, vq); 880 } 881 882 vhost_session_vq_used_signal(vq); 883 884 return SPDK_POLLER_BUSY; 885 886 } 887 888 static int 889 vdev_vq_worker(void *arg) 890 { 891 struct spdk_vhost_virtqueue *vq = arg; 892 893 return _vdev_vq_worker(vq); 894 } 895 896 static int 897 vdev_worker(void *arg) 898 { 899 struct spdk_vhost_blk_session *bvsession = arg; 900 struct spdk_vhost_session *vsession = &bvsession->vsession; 901 uint16_t q_idx; 902 903 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 904 _vdev_vq_worker(&vsession->virtqueue[q_idx]); 905 } 906 907 return SPDK_POLLER_BUSY; 908 } 909 910 static void 911 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 912 { 913 struct spdk_vhost_session *vsession = &bvsession->vsession; 914 struct iovec iovs[SPDK_VHOST_IOVS_MAX]; 915 uint32_t length; 916 uint16_t iovcnt, req_idx; 917 918 if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { 919 return; 920 } 921 922 iovcnt = SPDK_COUNTOF(iovs); 923 if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { 924 *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; 925 SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx); 926 } 927 928 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 929 } 930 931 static void 932 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 933 { 934 struct spdk_vhost_session *vsession = &bvsession->vsession; 935 struct spdk_vhost_blk_task *task; 936 uint32_t length; 937 uint16_t req_idx = vq->last_avail_idx; 938 uint16_t task_idx, num_descs; 939 940 if (!vhost_vq_packed_ring_is_avail(vq)) { 941 return; 942 } 943 944 task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); 945 task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; 946 if (spdk_unlikely(task->used)) { 947 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 948 vsession->name, req_idx); 949 vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, 950 task->buffer_id, task->used_len, 951 task->inflight_head); 952 return; 953 } 954 955 task->req_idx = req_idx; 956 task->num_descs = num_descs; 957 task->buffer_id = task_idx; 958 blk_task_init(task); 959 960 if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, 961 &length)) { 962 *(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR; 963 SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx); 964 } 965 966 task->used = false; 967 vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, 968 task->buffer_id, task->used_len, 969 task->inflight_head); 970 } 971 972 static int 973 _no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq) 974 { 975 struct spdk_vhost_session *vsession = vq->vsession; 976 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 977 bool packed_ring; 978 979 packed_ring = vq->packed.packed_ring; 980 if (packed_ring) { 981 no_bdev_process_packed_vq(bvsession, vq); 982 } else { 983 no_bdev_process_vq(bvsession, vq); 984 } 985 986 vhost_session_vq_used_signal(vq); 987 988 if (vsession->task_cnt == 0 && bvsession->io_channel) { 989 spdk_put_io_channel(bvsession->io_channel); 990 bvsession->io_channel = NULL; 991 } 992 993 return SPDK_POLLER_BUSY; 994 } 995 996 static int 997 no_bdev_vdev_vq_worker(void *arg) 998 { 999 struct spdk_vhost_virtqueue *vq = arg; 1000 1001 return _no_bdev_vdev_vq_worker(vq); 1002 } 1003 1004 static int 1005 no_bdev_vdev_worker(void *arg) 1006 { 1007 struct spdk_vhost_blk_session *bvsession = arg; 1008 struct spdk_vhost_session *vsession = &bvsession->vsession; 1009 uint16_t q_idx; 1010 1011 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 1012 _no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]); 1013 } 1014 1015 return SPDK_POLLER_BUSY; 1016 } 1017 1018 static void 1019 vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession) 1020 { 1021 struct spdk_vhost_session *vsession = &bvsession->vsession; 1022 struct spdk_vhost_virtqueue *vq; 1023 int i; 1024 1025 SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n"); 1026 for (i = 0; i < vsession->max_queues; i++) { 1027 vq = &vsession->virtqueue[i]; 1028 if (vq->intr == NULL) { 1029 break; 1030 } 1031 1032 SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n", 1033 i, vq->vring.kickfd); 1034 spdk_interrupt_unregister(&vq->intr); 1035 } 1036 } 1037 1038 static int 1039 vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession, 1040 spdk_interrupt_fn fn) 1041 { 1042 struct spdk_vhost_session *vsession = &bvsession->vsession; 1043 struct spdk_vhost_virtqueue *vq = NULL; 1044 int i; 1045 1046 SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n"); 1047 for (i = 0; i < vsession->max_queues; i++) { 1048 vq = &vsession->virtqueue[i]; 1049 SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n", 1050 i, vq->vring.kickfd); 1051 1052 vq->intr = SPDK_INTERRUPT_REGISTER(vq->vring.kickfd, fn, vq); 1053 if (vq->intr == NULL) { 1054 SPDK_ERRLOG("Fail to register req notifier handler.\n"); 1055 goto err; 1056 } 1057 } 1058 1059 return 0; 1060 1061 err: 1062 vhost_blk_session_unregister_interrupts(bvsession); 1063 1064 return -1; 1065 } 1066 1067 static struct spdk_vhost_blk_dev * 1068 to_blk_dev(struct spdk_vhost_dev *vdev) 1069 { 1070 if (vdev == NULL) { 1071 return NULL; 1072 } 1073 1074 if (vdev->backend != &vhost_blk_device_backend) { 1075 SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); 1076 return NULL; 1077 } 1078 1079 return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); 1080 } 1081 1082 static int 1083 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, 1084 struct spdk_vhost_session *vsession, 1085 void *ctx) 1086 { 1087 #if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0) 1088 SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); 1089 rte_vhost_slave_config_change(vsession->vid, false); 1090 #else 1091 SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); 1092 #endif 1093 1094 return 0; 1095 } 1096 1097 static void 1098 blk_resize_cb(void *resize_ctx) 1099 { 1100 struct spdk_vhost_blk_dev *bvdev = resize_ctx; 1101 1102 spdk_vhost_lock(); 1103 vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, 1104 NULL, NULL); 1105 spdk_vhost_unlock(); 1106 } 1107 1108 static void 1109 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) 1110 { 1111 1112 /* All sessions have been notified, time to close the bdev */ 1113 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1114 1115 assert(bvdev != NULL); 1116 spdk_put_io_channel(bvdev->dummy_io_channel); 1117 spdk_bdev_close(bvdev->bdev_desc); 1118 bvdev->bdev_desc = NULL; 1119 bvdev->bdev = NULL; 1120 } 1121 1122 static int 1123 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, 1124 struct spdk_vhost_session *vsession, 1125 void *ctx) 1126 { 1127 struct spdk_vhost_blk_session *bvsession; 1128 int rc; 1129 1130 bvsession = (struct spdk_vhost_blk_session *)vsession; 1131 if (bvsession->requestq_poller) { 1132 spdk_poller_unregister(&bvsession->requestq_poller); 1133 bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); 1134 } 1135 1136 if (vsession->virtqueue[0].intr) { 1137 vhost_blk_session_unregister_interrupts(bvsession); 1138 rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker); 1139 if (rc) { 1140 SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name); 1141 return -1; 1142 } 1143 1144 } 1145 1146 return 0; 1147 } 1148 1149 static void 1150 bdev_remove_cb(void *remove_ctx) 1151 { 1152 struct spdk_vhost_blk_dev *bvdev = remove_ctx; 1153 1154 SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", 1155 bvdev->vdev.name); 1156 1157 spdk_vhost_lock(); 1158 vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, 1159 vhost_dev_bdev_remove_cpl_cb, NULL); 1160 spdk_vhost_unlock(); 1161 } 1162 1163 static void 1164 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, 1165 void *event_ctx) 1166 { 1167 SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n", 1168 type, 1169 bdev->name); 1170 1171 switch (type) { 1172 case SPDK_BDEV_EVENT_REMOVE: 1173 SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); 1174 bdev_remove_cb(event_ctx); 1175 break; 1176 case SPDK_BDEV_EVENT_RESIZE: 1177 SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); 1178 blk_resize_cb(event_ctx); 1179 break; 1180 default: 1181 SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); 1182 break; 1183 } 1184 } 1185 1186 static void 1187 free_task_pool(struct spdk_vhost_blk_session *bvsession) 1188 { 1189 struct spdk_vhost_session *vsession = &bvsession->vsession; 1190 struct spdk_vhost_virtqueue *vq; 1191 uint16_t i; 1192 1193 for (i = 0; i < vsession->max_queues; i++) { 1194 vq = &vsession->virtqueue[i]; 1195 if (vq->tasks == NULL) { 1196 continue; 1197 } 1198 1199 spdk_free(vq->tasks); 1200 vq->tasks = NULL; 1201 } 1202 } 1203 1204 static int 1205 alloc_task_pool(struct spdk_vhost_blk_session *bvsession) 1206 { 1207 struct spdk_vhost_session *vsession = &bvsession->vsession; 1208 struct spdk_vhost_virtqueue *vq; 1209 struct spdk_vhost_blk_task *task; 1210 uint32_t task_cnt; 1211 uint16_t i; 1212 uint32_t j; 1213 1214 for (i = 0; i < vsession->max_queues; i++) { 1215 vq = &vsession->virtqueue[i]; 1216 if (vq->vring.desc == NULL) { 1217 continue; 1218 } 1219 1220 task_cnt = vq->vring.size; 1221 if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { 1222 /* sanity check */ 1223 SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", 1224 vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); 1225 free_task_pool(bvsession); 1226 return -1; 1227 } 1228 vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, 1229 SPDK_CACHE_LINE_SIZE, NULL, 1230 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1231 if (vq->tasks == NULL) { 1232 SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", 1233 vsession->name, task_cnt, i); 1234 free_task_pool(bvsession); 1235 return -1; 1236 } 1237 1238 for (j = 0; j < task_cnt; j++) { 1239 task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; 1240 task->bvsession = bvsession; 1241 task->req_idx = j; 1242 task->vq = vq; 1243 } 1244 } 1245 1246 return 0; 1247 } 1248 1249 static int 1250 vhost_blk_start_cb(struct spdk_vhost_dev *vdev, 1251 struct spdk_vhost_session *vsession, void *unused) 1252 { 1253 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 1254 struct spdk_vhost_blk_dev *bvdev; 1255 int i, rc = 0; 1256 1257 bvdev = to_blk_dev(vdev); 1258 assert(bvdev != NULL); 1259 bvsession->bvdev = bvdev; 1260 1261 /* validate all I/O queues are in a contiguous index range */ 1262 for (i = 0; i < vsession->max_queues; i++) { 1263 /* vring.desc and vring.desc_packed are in a union struct 1264 * so q->vring.desc can replace q->vring.desc_packed. 1265 */ 1266 if (vsession->virtqueue[i].vring.desc == NULL) { 1267 SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); 1268 rc = -1; 1269 goto out; 1270 } 1271 } 1272 1273 rc = alloc_task_pool(bvsession); 1274 if (rc != 0) { 1275 SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); 1276 goto out; 1277 } 1278 1279 if (bvdev->bdev) { 1280 bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); 1281 if (!bvsession->io_channel) { 1282 free_task_pool(bvsession); 1283 SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); 1284 rc = -1; 1285 goto out; 1286 } 1287 } 1288 1289 if (spdk_interrupt_mode_is_enabled()) { 1290 rc = vhost_blk_session_register_interrupts(bvsession, 1291 bvdev->bdev ? vdev_vq_worker : no_bdev_vdev_vq_worker); 1292 if (rc) { 1293 SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name); 1294 goto out; 1295 } 1296 SPDK_INFOLOG(vhost, "%s: started interrupt source on lcore %d\n", 1297 vsession->name, spdk_env_get_current_core()); 1298 } else { 1299 bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker, 1300 bvsession, 0); 1301 SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n", 1302 vsession->name, spdk_env_get_current_core()); 1303 } 1304 1305 out: 1306 vhost_session_start_done(vsession, rc); 1307 return rc; 1308 } 1309 1310 static int 1311 vhost_blk_start(struct spdk_vhost_session *vsession) 1312 { 1313 return vhost_session_send_event(vsession, vhost_blk_start_cb, 1314 3, "start session"); 1315 } 1316 1317 static int 1318 destroy_session_poller_cb(void *arg) 1319 { 1320 struct spdk_vhost_blk_session *bvsession = arg; 1321 struct spdk_vhost_session *vsession = &bvsession->vsession; 1322 int i; 1323 1324 if (vsession->task_cnt > 0) { 1325 return SPDK_POLLER_BUSY; 1326 } 1327 1328 if (spdk_vhost_trylock() != 0) { 1329 return SPDK_POLLER_BUSY; 1330 } 1331 1332 for (i = 0; i < vsession->max_queues; i++) { 1333 vsession->virtqueue[i].next_event_time = 0; 1334 vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); 1335 } 1336 1337 SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n", 1338 vsession->name, spdk_env_get_current_core()); 1339 1340 if (bvsession->io_channel) { 1341 spdk_put_io_channel(bvsession->io_channel); 1342 bvsession->io_channel = NULL; 1343 } 1344 1345 free_task_pool(bvsession); 1346 spdk_poller_unregister(&bvsession->stop_poller); 1347 vhost_session_stop_done(vsession, 0); 1348 1349 spdk_vhost_unlock(); 1350 return SPDK_POLLER_BUSY; 1351 } 1352 1353 static int 1354 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, 1355 struct spdk_vhost_session *vsession, void *unused) 1356 { 1357 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 1358 1359 spdk_poller_unregister(&bvsession->requestq_poller); 1360 1361 if (vsession->virtqueue[0].intr) { 1362 vhost_blk_session_unregister_interrupts(bvsession); 1363 } 1364 1365 bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, 1366 bvsession, 1000); 1367 return 0; 1368 } 1369 1370 static int 1371 vhost_blk_stop(struct spdk_vhost_session *vsession) 1372 { 1373 return vhost_session_send_event(vsession, vhost_blk_stop_cb, 1374 3, "stop session"); 1375 } 1376 1377 static void 1378 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 1379 { 1380 struct spdk_vhost_blk_dev *bvdev; 1381 1382 bvdev = to_blk_dev(vdev); 1383 assert(bvdev != NULL); 1384 1385 spdk_json_write_named_object_begin(w, "block"); 1386 1387 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 1388 1389 spdk_json_write_name(w, "bdev"); 1390 if (bvdev->bdev) { 1391 spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev)); 1392 } else { 1393 spdk_json_write_null(w); 1394 } 1395 1396 spdk_json_write_object_end(w); 1397 } 1398 1399 static void 1400 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 1401 { 1402 struct spdk_vhost_blk_dev *bvdev; 1403 1404 bvdev = to_blk_dev(vdev); 1405 assert(bvdev != NULL); 1406 1407 if (!bvdev->bdev) { 1408 return; 1409 } 1410 1411 spdk_json_write_object_begin(w); 1412 spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); 1413 1414 spdk_json_write_named_object_begin(w, "params"); 1415 spdk_json_write_named_string(w, "ctrlr", vdev->name); 1416 spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); 1417 spdk_json_write_named_string(w, "cpumask", 1418 spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); 1419 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 1420 spdk_json_write_object_end(w); 1421 1422 spdk_json_write_object_end(w); 1423 } 1424 1425 static int vhost_blk_destroy(struct spdk_vhost_dev *dev); 1426 1427 static int 1428 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, 1429 uint32_t len) 1430 { 1431 struct virtio_blk_config blkcfg; 1432 struct spdk_vhost_blk_dev *bvdev; 1433 struct spdk_bdev *bdev; 1434 uint32_t blk_size; 1435 uint64_t blkcnt; 1436 1437 memset(&blkcfg, 0, sizeof(blkcfg)); 1438 bvdev = to_blk_dev(vdev); 1439 assert(bvdev != NULL); 1440 bdev = bvdev->bdev; 1441 if (bdev == NULL) { 1442 /* We can't just return -1 here as this GET_CONFIG message might 1443 * be caused by a QEMU VM reboot. Returning -1 will indicate an 1444 * error to QEMU, who might then decide to terminate itself. 1445 * We don't want that. A simple reboot shouldn't break the system. 1446 * 1447 * Presenting a block device with block size 0 and block count 0 1448 * doesn't cause any problems on QEMU side and the virtio-pci 1449 * device is even still available inside the VM, but there will 1450 * be no block device created for it - the kernel drivers will 1451 * silently reject it. 1452 */ 1453 blk_size = 0; 1454 blkcnt = 0; 1455 } else { 1456 blk_size = spdk_bdev_get_block_size(bdev); 1457 blkcnt = spdk_bdev_get_num_blocks(bdev); 1458 if (spdk_bdev_get_buf_align(bdev) > 1) { 1459 blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; 1460 blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); 1461 } else { 1462 blkcfg.size_max = 131072; 1463 /* -2 for REQ and RESP and -1 for region boundary splitting */ 1464 blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; 1465 } 1466 } 1467 1468 blkcfg.blk_size = blk_size; 1469 /* minimum I/O size in blocks */ 1470 blkcfg.min_io_size = 1; 1471 /* expressed in 512 Bytes sectors */ 1472 blkcfg.capacity = (blkcnt * blk_size) / 512; 1473 /* QEMU can overwrite this value when started */ 1474 blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; 1475 1476 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1477 /* 16MiB, expressed in 512 Bytes */ 1478 blkcfg.max_discard_sectors = 32768; 1479 blkcfg.max_discard_seg = 1; 1480 blkcfg.discard_sector_alignment = blk_size / 512; 1481 } 1482 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1483 blkcfg.max_write_zeroes_sectors = 32768; 1484 blkcfg.max_write_zeroes_seg = 1; 1485 } 1486 1487 memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); 1488 1489 return 0; 1490 } 1491 1492 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { 1493 .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), 1494 .start_session = vhost_blk_start, 1495 .stop_session = vhost_blk_stop, 1496 .vhost_get_config = vhost_blk_get_config, 1497 .dump_info_json = vhost_blk_dump_info_json, 1498 .write_config_json = vhost_blk_write_config_json, 1499 .remove_device = vhost_blk_destroy, 1500 }; 1501 1502 int 1503 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, 1504 bool readonly, bool packed_ring) 1505 { 1506 struct spdk_vhost_blk_dev *bvdev = NULL; 1507 struct spdk_vhost_dev *vdev; 1508 struct spdk_bdev *bdev; 1509 int ret = 0; 1510 1511 spdk_vhost_lock(); 1512 1513 bvdev = calloc(1, sizeof(*bvdev)); 1514 if (bvdev == NULL) { 1515 ret = -ENOMEM; 1516 goto out; 1517 } 1518 1519 ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); 1520 if (ret != 0) { 1521 SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", 1522 name, dev_name, ret); 1523 goto out; 1524 } 1525 bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc); 1526 1527 vdev = &bvdev->vdev; 1528 vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; 1529 vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; 1530 vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; 1531 1532 vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED; 1533 1534 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1535 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); 1536 } 1537 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1538 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); 1539 } 1540 if (readonly) { 1541 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); 1542 } 1543 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { 1544 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); 1545 } 1546 1547 /* 1548 * When starting qemu with vhost-user-blk multiqueue, the vhost device will 1549 * be started/stopped many times, related to the queues num, as the 1550 * vhost-user backend doesn't know the exact number of queues used for this 1551 * device. The target have to stop and start the device once got a valid 1552 * IO queue. 1553 * When stoping and starting the vhost device, the backend bdev io device 1554 * will be deleted and created repeatedly. 1555 * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that 1556 * the io device will not be deleted. 1557 */ 1558 bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); 1559 1560 bvdev->bdev = bdev; 1561 bvdev->readonly = readonly; 1562 ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend); 1563 if (ret != 0) { 1564 spdk_put_io_channel(bvdev->dummy_io_channel); 1565 spdk_bdev_close(bvdev->bdev_desc); 1566 goto out; 1567 } 1568 1569 SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name); 1570 out: 1571 if (ret != 0 && bvdev) { 1572 free(bvdev); 1573 } 1574 spdk_vhost_unlock(); 1575 return ret; 1576 } 1577 1578 static int 1579 vhost_blk_destroy(struct spdk_vhost_dev *vdev) 1580 { 1581 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1582 int rc; 1583 1584 assert(bvdev != NULL); 1585 1586 rc = vhost_dev_unregister(&bvdev->vdev); 1587 if (rc != 0) { 1588 return rc; 1589 } 1590 1591 /* if the bdev is removed, don't need call spdk_put_io_channel. */ 1592 if (bvdev->bdev) { 1593 spdk_put_io_channel(bvdev->dummy_io_channel); 1594 } 1595 1596 if (bvdev->bdev_desc) { 1597 spdk_bdev_close(bvdev->bdev_desc); 1598 bvdev->bdev_desc = NULL; 1599 } 1600 bvdev->bdev = NULL; 1601 1602 free(bvdev); 1603 return 0; 1604 } 1605 1606 SPDK_LOG_REGISTER_COMPONENT(vhost_blk) 1607 SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data) 1608