1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <linux/virtio_blk.h> 35 36 #include "spdk/env.h" 37 #include "spdk/bdev.h" 38 #include "spdk/bdev_module.h" 39 #include "spdk/thread.h" 40 #include "spdk/likely.h" 41 #include "spdk/string.h" 42 #include "spdk/util.h" 43 #include "spdk/vhost.h" 44 45 #include "vhost_internal.h" 46 #include <rte_version.h> 47 48 /* Minimal set of features supported by every SPDK VHOST-BLK device */ 49 #define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ 50 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ 51 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ 52 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ 53 (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 54 (1ULL << VIRTIO_BLK_F_MQ)) 55 56 /* Not supported features */ 57 #define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ 58 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 59 (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) 60 61 /* Vhost-blk support protocol features */ 62 #define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ 63 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) 64 65 struct spdk_vhost_user_blk_task { 66 struct spdk_vhost_blk_task blk_task; 67 struct spdk_vhost_blk_session *bvsession; 68 struct spdk_vhost_virtqueue *vq; 69 70 uint16_t req_idx; 71 uint16_t num_descs; 72 uint16_t buffer_id; 73 uint16_t inflight_head; 74 75 /* If set, the task is currently used for I/O processing. */ 76 bool used; 77 }; 78 79 struct spdk_vhost_blk_dev { 80 struct spdk_vhost_dev vdev; 81 struct spdk_bdev *bdev; 82 struct spdk_bdev_desc *bdev_desc; 83 /* dummy_io_channel is used to hold a bdev reference */ 84 struct spdk_io_channel *dummy_io_channel; 85 bool readonly; 86 }; 87 88 struct spdk_vhost_blk_session { 89 /* The parent session must be the very first field in this struct */ 90 struct spdk_vhost_session vsession; 91 struct spdk_vhost_blk_dev *bvdev; 92 struct spdk_poller *requestq_poller; 93 struct spdk_io_channel *io_channel; 94 struct spdk_poller *stop_poller; 95 }; 96 97 struct rpc_vhost_blk { 98 bool readonly; 99 bool packed_ring; 100 bool packed_ring_recovery; 101 }; 102 103 static const struct spdk_json_object_decoder rpc_construct_vhost_blk[] = { 104 {"readonly", offsetof(struct rpc_vhost_blk, readonly), spdk_json_decode_bool, true}, 105 {"packed_ring", offsetof(struct rpc_vhost_blk, packed_ring), spdk_json_decode_bool, true}, 106 {"packed_ring_recovery", offsetof(struct rpc_vhost_blk, packed_ring_recovery), spdk_json_decode_bool, true}, 107 }; 108 109 /* forward declaration */ 110 static const struct spdk_vhost_dev_backend vhost_blk_device_backend; 111 112 static int 113 virtio_blk_process_request(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch, 114 struct spdk_vhost_blk_task *task); 115 116 static int 117 vhost_user_process_blk_request(struct spdk_vhost_user_blk_task *user_task) 118 { 119 struct spdk_vhost_blk_session *bvsession = user_task->bvsession; 120 struct spdk_vhost_dev *vdev = &bvsession->bvdev->vdev; 121 122 return virtio_blk_process_request(vdev, bvsession->io_channel, &user_task->blk_task); 123 } 124 125 static struct spdk_vhost_blk_dev * 126 to_blk_dev(struct spdk_vhost_dev *vdev) 127 { 128 if (vdev == NULL) { 129 return NULL; 130 } 131 132 if (vdev->backend->type != VHOST_BACKEND_BLK) { 133 SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); 134 return NULL; 135 } 136 137 return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); 138 } 139 140 static struct spdk_vhost_blk_session * 141 to_blk_session(struct spdk_vhost_session *vsession) 142 { 143 assert(vsession->vdev->backend->type == VHOST_BACKEND_BLK); 144 return (struct spdk_vhost_blk_session *)vsession; 145 } 146 147 static void 148 blk_task_finish(struct spdk_vhost_user_blk_task *task) 149 { 150 assert(task->bvsession->vsession.task_cnt > 0); 151 task->bvsession->vsession.task_cnt--; 152 task->used = false; 153 } 154 155 static void 156 blk_task_init(struct spdk_vhost_user_blk_task *task) 157 { 158 struct spdk_vhost_blk_task *blk_task = &task->blk_task; 159 160 task->used = true; 161 blk_task->iovcnt = SPDK_COUNTOF(blk_task->iovs); 162 blk_task->status = NULL; 163 blk_task->used_len = 0; 164 blk_task->payload_size = 0; 165 } 166 167 static void 168 blk_task_enqueue(struct spdk_vhost_user_blk_task *task) 169 { 170 if (task->vq->packed.packed_ring) { 171 vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq, 172 task->num_descs, 173 task->buffer_id, task->blk_task.used_len, 174 task->inflight_head); 175 } else { 176 vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, 177 task->req_idx, task->blk_task.used_len); 178 } 179 } 180 181 static void 182 vhost_user_blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task) 183 { 184 struct spdk_vhost_user_blk_task *user_task; 185 186 user_task = SPDK_CONTAINEROF(task, struct spdk_vhost_user_blk_task, blk_task); 187 188 blk_task_enqueue(user_task); 189 190 SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %" PRIu8"\n", 191 user_task, user_task->req_idx, status); 192 blk_task_finish(user_task); 193 } 194 195 static void 196 blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task) 197 { 198 199 if (task->status) { 200 *task->status = status; 201 } 202 203 vhost_user_blk_request_finish(status, task); 204 } 205 206 /* 207 * Process task's descriptor chain and setup data related fields. 208 * Return 209 * total size of supplied buffers 210 * 211 * FIXME: Make this function return to rd_cnt and wr_cnt 212 */ 213 static int 214 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession, 215 struct spdk_vhost_virtqueue *vq, 216 uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 217 { 218 struct spdk_vhost_session *vsession = &bvsession->vsession; 219 struct spdk_vhost_dev *vdev = vsession->vdev; 220 struct vring_desc *desc, *desc_table; 221 uint16_t out_cnt = 0, cnt = 0; 222 uint32_t desc_table_size, len = 0; 223 uint32_t desc_handled_cnt; 224 int rc; 225 226 rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); 227 if (rc != 0) { 228 SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 229 return -1; 230 } 231 232 desc_handled_cnt = 0; 233 while (1) { 234 /* 235 * Maximum cnt reached? 236 * Should not happen if request is well formatted, otherwise this is a BUG. 237 */ 238 if (spdk_unlikely(cnt == *iovs_cnt)) { 239 SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 240 vsession->name, req_idx); 241 return -1; 242 } 243 244 if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { 245 SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 246 vsession->name, req_idx, cnt); 247 return -1; 248 } 249 250 len += desc->len; 251 252 out_cnt += vhost_vring_desc_is_wr(desc); 253 254 rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); 255 if (rc != 0) { 256 SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", 257 vsession->name, req_idx); 258 return -1; 259 } else if (desc == NULL) { 260 break; 261 } 262 263 desc_handled_cnt++; 264 if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { 265 /* Break a cycle and report an error, if any. */ 266 SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", 267 vsession->name, desc_table_size, desc_handled_cnt); 268 return -1; 269 } 270 } 271 272 /* 273 * There must be least two descriptors. 274 * First contain request so it must be readable. 275 * Last descriptor contain buffer for response so it must be writable. 276 */ 277 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 278 return -1; 279 } 280 281 *length = len; 282 *iovs_cnt = cnt; 283 return 0; 284 } 285 286 static int 287 blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession, 288 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 289 struct vring_packed_desc *desc_table, uint16_t desc_table_size, 290 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 291 { 292 struct vring_packed_desc *desc; 293 uint16_t cnt = 0, out_cnt = 0; 294 uint32_t len = 0; 295 296 if (desc_table == NULL) { 297 desc = &vq->vring.desc_packed[req_idx]; 298 } else { 299 req_idx = 0; 300 desc = desc_table; 301 } 302 303 while (1) { 304 /* 305 * Maximum cnt reached? 306 * Should not happen if request is well formatted, otherwise this is a BUG. 307 */ 308 if (spdk_unlikely(cnt == *iovs_cnt)) { 309 SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 310 vsession->name, req_idx); 311 return -EINVAL; 312 } 313 314 if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) { 315 SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 316 vsession->name, req_idx, cnt); 317 return -EINVAL; 318 } 319 320 len += desc->len; 321 out_cnt += vhost_vring_packed_desc_is_wr(desc); 322 323 /* desc is NULL means we reach the last desc of this request */ 324 vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size); 325 if (desc == NULL) { 326 break; 327 } 328 } 329 330 /* 331 * There must be least two descriptors. 332 * First contain request so it must be readable. 333 * Last descriptor contain buffer for response so it must be writable. 334 */ 335 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 336 return -EINVAL; 337 } 338 339 *length = len; 340 *iovs_cnt = cnt; 341 342 return 0; 343 } 344 345 static int 346 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession, 347 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 348 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 349 { 350 struct spdk_vhost_session *vsession = &bvsession->vsession; 351 struct spdk_vhost_dev *vdev = vsession->vdev; 352 struct vring_packed_desc *desc = NULL, *desc_table; 353 uint32_t desc_table_size; 354 int rc; 355 356 rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc, 357 &desc_table, &desc_table_size); 358 if (spdk_unlikely(rc != 0)) { 359 SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 360 return rc; 361 } 362 363 return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size, 364 iovs, iovs_cnt, length); 365 } 366 367 static int 368 blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession, 369 struct spdk_vhost_virtqueue *vq, uint16_t req_idx, 370 struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 371 { 372 struct spdk_vhost_session *vsession = &bvsession->vsession; 373 struct spdk_vhost_dev *vdev = vsession->vdev; 374 spdk_vhost_inflight_desc *inflight_desc; 375 struct vring_packed_desc *desc_table; 376 uint16_t out_cnt = 0, cnt = 0; 377 uint32_t desc_table_size, len = 0; 378 int rc = 0; 379 380 rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc, 381 req_idx, &inflight_desc, &desc_table, &desc_table_size); 382 if (spdk_unlikely(rc != 0)) { 383 SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 384 return rc; 385 } 386 387 if (desc_table != NULL) { 388 return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size, 389 iovs, iovs_cnt, length); 390 } 391 392 while (1) { 393 /* 394 * Maximum cnt reached? 395 * Should not happen if request is well formatted, otherwise this is a BUG. 396 */ 397 if (spdk_unlikely(cnt == *iovs_cnt)) { 398 SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 399 vsession->name, req_idx); 400 return -EINVAL; 401 } 402 403 if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) { 404 SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 405 vsession->name, req_idx, cnt); 406 return -EINVAL; 407 } 408 409 len += inflight_desc->len; 410 out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc); 411 412 /* Without F_NEXT means it's the last desc */ 413 if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) { 414 break; 415 } 416 417 inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next]; 418 } 419 420 /* 421 * There must be least two descriptors. 422 * First contain request so it must be readable. 423 * Last descriptor contain buffer for response so it must be writable. 424 */ 425 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 426 return -EINVAL; 427 } 428 429 *length = len; 430 *iovs_cnt = cnt; 431 432 return 0; 433 } 434 435 static void 436 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 437 { 438 struct spdk_vhost_blk_task *task = cb_arg; 439 440 spdk_bdev_free_io(bdev_io); 441 blk_request_finish(success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR, task); 442 } 443 444 static void 445 blk_request_resubmit(void *arg) 446 { 447 struct spdk_vhost_blk_task *task = arg; 448 int rc = 0; 449 450 rc = virtio_blk_process_request(task->bdev_io_wait_vdev, task->bdev_io_wait_ch, task); 451 if (rc == 0) { 452 SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task); 453 } else { 454 SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task); 455 } 456 } 457 458 static inline void 459 blk_request_queue_io(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch, 460 struct spdk_vhost_blk_task *task) 461 { 462 int rc; 463 struct spdk_bdev *bdev = task->bdev_io->bdev; 464 465 task->bdev_io_wait.bdev = bdev; 466 task->bdev_io_wait.cb_fn = blk_request_resubmit; 467 task->bdev_io_wait.cb_arg = task; 468 task->bdev_io_wait_ch = ch; 469 task->bdev_io_wait_vdev = vdev; 470 471 rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait); 472 if (rc != 0) { 473 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 474 } 475 } 476 477 static int 478 virtio_blk_process_request(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch, 479 struct spdk_vhost_blk_task *task) 480 { 481 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 482 struct virtio_blk_outhdr req; 483 struct virtio_blk_discard_write_zeroes *desc; 484 struct iovec *iov; 485 uint32_t type; 486 uint64_t flush_bytes; 487 uint32_t payload_len; 488 uint16_t iovcnt; 489 int rc; 490 491 iov = &task->iovs[0]; 492 if (spdk_unlikely(iov->iov_len != sizeof(req))) { 493 SPDK_DEBUGLOG(vhost_blk, 494 "First descriptor size is %zu but expected %zu (task = %p).\n", 495 iov->iov_len, sizeof(req), task); 496 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 497 return -1; 498 } 499 500 /* Some SeaBIOS versions don't align the virtio_blk_outhdr on an 8-byte boundary, which 501 * triggers ubsan errors. So copy this small 16-byte structure to the stack to workaround 502 * this problem. 503 */ 504 memcpy(&req, iov->iov_base, sizeof(req)); 505 506 iov = &task->iovs[task->iovcnt - 1]; 507 if (spdk_unlikely(iov->iov_len != 1)) { 508 SPDK_DEBUGLOG(vhost_blk, 509 "Last descriptor size is %zu but expected %d (task = %p).\n", 510 iov->iov_len, 1, task); 511 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 512 return -1; 513 } 514 515 payload_len = task->payload_size; 516 task->status = iov->iov_base; 517 payload_len -= sizeof(req) + sizeof(*task->status); 518 iovcnt = task->iovcnt - 2; 519 520 type = req.type; 521 #ifdef VIRTIO_BLK_T_BARRIER 522 /* Don't care about barrier for now (as QEMU's virtio-blk do). */ 523 type &= ~VIRTIO_BLK_T_BARRIER; 524 #endif 525 526 switch (type) { 527 case VIRTIO_BLK_T_IN: 528 case VIRTIO_BLK_T_OUT: 529 if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { 530 SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (task = %p).\n", 531 type ? "WRITE" : "READ", task); 532 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 533 return -1; 534 } 535 536 if (type == VIRTIO_BLK_T_IN) { 537 task->used_len = payload_len + sizeof(*task->status); 538 rc = spdk_bdev_readv(bvdev->bdev_desc, ch, 539 &task->iovs[1], iovcnt, req.sector * 512, 540 payload_len, blk_request_complete_cb, task); 541 } else if (!bvdev->readonly) { 542 task->used_len = sizeof(*task->status); 543 rc = spdk_bdev_writev(bvdev->bdev_desc, ch, 544 &task->iovs[1], iovcnt, req.sector * 512, 545 payload_len, blk_request_complete_cb, task); 546 } else { 547 SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n"); 548 rc = -1; 549 } 550 551 if (rc) { 552 if (rc == -ENOMEM) { 553 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 554 blk_request_queue_io(vdev, ch, task); 555 } else { 556 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 557 return -1; 558 } 559 } 560 break; 561 case VIRTIO_BLK_T_DISCARD: 562 desc = task->iovs[1].iov_base; 563 if (payload_len != sizeof(*desc)) { 564 SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); 565 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 566 return -1; 567 } 568 569 if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 570 SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n"); 571 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 572 return -1; 573 } 574 575 rc = spdk_bdev_unmap(bvdev->bdev_desc, ch, 576 desc->sector * 512, desc->num_sectors * 512, 577 blk_request_complete_cb, task); 578 if (rc) { 579 if (rc == -ENOMEM) { 580 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 581 blk_request_queue_io(vdev, ch, task); 582 } else { 583 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 584 return -1; 585 } 586 } 587 break; 588 case VIRTIO_BLK_T_WRITE_ZEROES: 589 desc = task->iovs[1].iov_base; 590 if (payload_len != sizeof(*desc)) { 591 SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); 592 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 593 return -1; 594 } 595 596 /* Unmap this range, SPDK doesn't support it, kernel will enable this flag by default 597 * without checking unmap feature is negotiated or not, the flag isn't mandatory, so 598 * just print a warning. 599 */ 600 if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 601 SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n", 602 (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512); 603 } 604 605 rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, ch, 606 desc->sector * 512, desc->num_sectors * 512, 607 blk_request_complete_cb, task); 608 if (rc) { 609 if (rc == -ENOMEM) { 610 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 611 blk_request_queue_io(vdev, ch, task); 612 } else { 613 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 614 return -1; 615 } 616 } 617 break; 618 case VIRTIO_BLK_T_FLUSH: 619 flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev); 620 if (req.sector != 0) { 621 SPDK_NOTICELOG("sector must be zero for flush command\n"); 622 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 623 return -1; 624 } 625 rc = spdk_bdev_flush(bvdev->bdev_desc, ch, 626 0, flush_bytes, 627 blk_request_complete_cb, task); 628 if (rc) { 629 if (rc == -ENOMEM) { 630 SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n"); 631 blk_request_queue_io(vdev, ch, task); 632 } else { 633 blk_request_finish(VIRTIO_BLK_S_IOERR, task); 634 return -1; 635 } 636 } 637 break; 638 case VIRTIO_BLK_T_GET_ID: 639 if (!iovcnt || !payload_len) { 640 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 641 return -1; 642 } 643 task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); 644 spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_name(bvdev->bdev), 645 task->used_len, ' '); 646 blk_request_finish(VIRTIO_BLK_S_OK, task); 647 break; 648 default: 649 SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type); 650 blk_request_finish(VIRTIO_BLK_S_UNSUPP, task); 651 return -1; 652 } 653 654 return 0; 655 } 656 657 static void 658 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) 659 { 660 struct spdk_vhost_user_blk_task *task; 661 struct spdk_vhost_blk_task *blk_task; 662 int rc; 663 664 assert(vq->packed.packed_ring == false); 665 666 task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[req_idx]; 667 blk_task = &task->blk_task; 668 if (spdk_unlikely(task->used)) { 669 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 670 task->bvsession->vsession.name, req_idx); 671 blk_task->used_len = 0; 672 blk_task_enqueue(task); 673 return; 674 } 675 676 task->bvsession->vsession.task_cnt++; 677 678 blk_task_init(task); 679 680 rc = blk_iovs_split_queue_setup(task->bvsession, vq, task->req_idx, 681 blk_task->iovs, &blk_task->iovcnt, &blk_task->payload_size); 682 683 if (rc) { 684 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 685 /* Only READ and WRITE are supported for now. */ 686 vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task); 687 return; 688 } 689 690 if (vhost_user_process_blk_request(task) == 0) { 691 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 692 req_idx); 693 } else { 694 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx); 695 } 696 } 697 698 static void 699 process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) 700 { 701 struct spdk_vhost_user_blk_task *task; 702 struct spdk_vhost_blk_task *blk_task; 703 uint16_t task_idx = req_idx, num_descs; 704 int rc; 705 706 assert(vq->packed.packed_ring); 707 708 /* Packed ring used the buffer_id as the task_idx to get task struct. 709 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value 710 * must be in the range of 0 ~ vring.size. The free_head value must be unique 711 * in the outstanding requests. 712 * We can't use the req_idx as the task_idx because the desc can be reused in 713 * the next phase even when it's not completed in the previous phase. For example, 714 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving 715 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used 716 * as task_idx because we will know task[0]->used is true at phase 1. 717 * The split queue is quite different, the desc would insert into the free list when 718 * device completes the request, the driver gets the desc from the free list which 719 * ensures the req_idx is unique in the outstanding requests. 720 */ 721 task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); 722 723 task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx]; 724 blk_task = &task->blk_task; 725 if (spdk_unlikely(task->used)) { 726 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 727 task->bvsession->vsession.name, task_idx); 728 blk_task->used_len = 0; 729 blk_task_enqueue(task); 730 return; 731 } 732 733 task->req_idx = req_idx; 734 task->num_descs = num_descs; 735 task->buffer_id = task_idx; 736 737 rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx, 738 req_idx, (req_idx + num_descs - 1) % vq->vring.size, 739 &task->inflight_head); 740 741 task->bvsession->vsession.task_cnt++; 742 743 blk_task_init(task); 744 745 rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, blk_task->iovs, 746 &blk_task->iovcnt, 747 &blk_task->payload_size); 748 if (rc) { 749 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 750 /* Only READ and WRITE are supported for now. */ 751 vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task); 752 return; 753 } 754 755 if (vhost_user_process_blk_request(task) == 0) { 756 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 757 task_idx); 758 } else { 759 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); 760 } 761 } 762 763 static void 764 process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq, 765 uint16_t req_idx) 766 { 767 spdk_vhost_inflight_desc *desc_array = vq->vring_inflight.inflight_packed->desc; 768 spdk_vhost_inflight_desc *desc = &desc_array[req_idx]; 769 struct spdk_vhost_user_blk_task *task; 770 struct spdk_vhost_blk_task *blk_task; 771 uint16_t task_idx, num_descs; 772 int rc; 773 774 task_idx = desc_array[desc->last].id; 775 num_descs = desc->num; 776 /* In packed ring reconnection, we use the last_used_idx as the 777 * initial value. So when we process the inflight descs we still 778 * need to update the available ring index. 779 */ 780 vq->last_avail_idx += num_descs; 781 if (vq->last_avail_idx >= vq->vring.size) { 782 vq->last_avail_idx -= vq->vring.size; 783 vq->packed.avail_phase = !vq->packed.avail_phase; 784 } 785 786 task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx]; 787 blk_task = &task->blk_task; 788 if (spdk_unlikely(task->used)) { 789 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 790 task->bvsession->vsession.name, task_idx); 791 blk_task->used_len = 0; 792 blk_task_enqueue(task); 793 return; 794 } 795 796 task->req_idx = req_idx; 797 task->num_descs = num_descs; 798 task->buffer_id = task_idx; 799 /* It's for cleaning inflight entries */ 800 task->inflight_head = req_idx; 801 802 task->bvsession->vsession.task_cnt++; 803 804 blk_task_init(task); 805 806 rc = blk_iovs_inflight_queue_setup(task->bvsession, vq, task->req_idx, blk_task->iovs, 807 &blk_task->iovcnt, 808 &blk_task->payload_size); 809 if (rc) { 810 SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 811 /* Only READ and WRITE are supported for now. */ 812 vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task); 813 return; 814 } 815 816 if (vhost_user_process_blk_request(task) == 0) { 817 SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task, 818 task_idx); 819 } else { 820 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); 821 } 822 } 823 824 static void 825 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, 826 struct spdk_vhost_virtqueue *vq) 827 { 828 struct spdk_vhost_session *vsession; 829 spdk_vhost_resubmit_info *resubmit; 830 spdk_vhost_resubmit_desc *resubmit_list; 831 uint16_t req_idx; 832 int i; 833 834 resubmit = vq->vring_inflight.resubmit_inflight; 835 if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL || 836 resubmit->resubmit_num == 0)) { 837 return; 838 } 839 840 resubmit_list = resubmit->resubmit_list; 841 vsession = &bvsession->vsession; 842 843 for (i = resubmit->resubmit_num - 1; i >= 0; --i) { 844 req_idx = resubmit_list[i].index; 845 SPDK_DEBUGLOG(vhost_blk, "====== Start processing resubmit request idx %"PRIu16"======\n", 846 req_idx); 847 848 if (spdk_unlikely(req_idx >= vq->vring.size)) { 849 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 850 vsession->name, req_idx, vq->vring.size); 851 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 852 continue; 853 } 854 855 if (vq->packed.packed_ring) { 856 process_packed_inflight_blk_task(vq, req_idx); 857 } else { 858 process_blk_task(vq, req_idx); 859 } 860 } 861 resubmit->resubmit_num = 0; 862 } 863 864 static int 865 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 866 { 867 struct spdk_vhost_session *vsession = &bvsession->vsession; 868 uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS]; 869 uint16_t reqs_cnt, i; 870 871 submit_inflight_desc(bvsession, vq); 872 873 reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); 874 if (!reqs_cnt) { 875 return 0; 876 } 877 878 for (i = 0; i < reqs_cnt; i++) { 879 SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n", 880 reqs[i]); 881 882 if (spdk_unlikely(reqs[i] >= vq->vring.size)) { 883 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 884 vsession->name, reqs[i], vq->vring.size); 885 vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); 886 continue; 887 } 888 889 rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]); 890 891 process_blk_task(vq, reqs[i]); 892 } 893 894 return reqs_cnt; 895 } 896 897 static int 898 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 899 { 900 uint16_t i = 0; 901 uint16_t count = 0; 902 903 submit_inflight_desc(bvsession, vq); 904 905 while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS && 906 vhost_vq_packed_ring_is_avail(vq)) { 907 SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n", 908 vq->last_avail_idx); 909 count++; 910 process_packed_blk_task(vq, vq->last_avail_idx); 911 } 912 913 return count; 914 } 915 916 static int 917 _vdev_vq_worker(struct spdk_vhost_virtqueue *vq) 918 { 919 struct spdk_vhost_session *vsession = vq->vsession; 920 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 921 bool packed_ring; 922 int rc = 0; 923 924 packed_ring = vq->packed.packed_ring; 925 if (packed_ring) { 926 rc = process_packed_vq(bvsession, vq); 927 } else { 928 rc = process_vq(bvsession, vq); 929 } 930 931 vhost_session_vq_used_signal(vq); 932 933 return rc; 934 935 } 936 937 static int 938 vdev_vq_worker(void *arg) 939 { 940 struct spdk_vhost_virtqueue *vq = arg; 941 942 return _vdev_vq_worker(vq); 943 } 944 945 static int 946 vdev_worker(void *arg) 947 { 948 struct spdk_vhost_blk_session *bvsession = arg; 949 struct spdk_vhost_session *vsession = &bvsession->vsession; 950 uint16_t q_idx; 951 int rc = 0; 952 953 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 954 rc += _vdev_vq_worker(&vsession->virtqueue[q_idx]); 955 } 956 957 return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 958 } 959 960 static void 961 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 962 { 963 struct spdk_vhost_session *vsession = &bvsession->vsession; 964 struct iovec iovs[SPDK_VHOST_IOVS_MAX]; 965 uint32_t length; 966 uint16_t iovcnt, req_idx; 967 968 if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { 969 return; 970 } 971 972 iovcnt = SPDK_COUNTOF(iovs); 973 if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { 974 *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; 975 SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx); 976 } 977 978 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 979 } 980 981 static void 982 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 983 { 984 struct spdk_vhost_session *vsession = &bvsession->vsession; 985 struct spdk_vhost_user_blk_task *task; 986 struct spdk_vhost_blk_task *blk_task; 987 uint32_t length; 988 uint16_t req_idx = vq->last_avail_idx; 989 uint16_t task_idx, num_descs; 990 991 if (!vhost_vq_packed_ring_is_avail(vq)) { 992 return; 993 } 994 995 task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); 996 task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx]; 997 blk_task = &task->blk_task; 998 if (spdk_unlikely(task->used)) { 999 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 1000 vsession->name, req_idx); 1001 vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, 1002 task->buffer_id, blk_task->used_len, 1003 task->inflight_head); 1004 return; 1005 } 1006 1007 task->req_idx = req_idx; 1008 task->num_descs = num_descs; 1009 task->buffer_id = task_idx; 1010 blk_task_init(task); 1011 1012 if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, blk_task->iovs, &blk_task->iovcnt, 1013 &length)) { 1014 *(volatile uint8_t *)(blk_task->iovs[blk_task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR; 1015 SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx); 1016 } 1017 1018 task->used = false; 1019 vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, 1020 task->buffer_id, blk_task->used_len, 1021 task->inflight_head); 1022 } 1023 1024 static int 1025 _no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq) 1026 { 1027 struct spdk_vhost_session *vsession = vq->vsession; 1028 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 1029 bool packed_ring; 1030 1031 packed_ring = vq->packed.packed_ring; 1032 if (packed_ring) { 1033 no_bdev_process_packed_vq(bvsession, vq); 1034 } else { 1035 no_bdev_process_vq(bvsession, vq); 1036 } 1037 1038 vhost_session_vq_used_signal(vq); 1039 1040 if (vsession->task_cnt == 0 && bvsession->io_channel) { 1041 vhost_blk_put_io_channel(bvsession->io_channel); 1042 bvsession->io_channel = NULL; 1043 } 1044 1045 return SPDK_POLLER_BUSY; 1046 } 1047 1048 static int 1049 no_bdev_vdev_vq_worker(void *arg) 1050 { 1051 struct spdk_vhost_virtqueue *vq = arg; 1052 1053 return _no_bdev_vdev_vq_worker(vq); 1054 } 1055 1056 static int 1057 no_bdev_vdev_worker(void *arg) 1058 { 1059 struct spdk_vhost_blk_session *bvsession = arg; 1060 struct spdk_vhost_session *vsession = &bvsession->vsession; 1061 uint16_t q_idx; 1062 1063 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 1064 _no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]); 1065 } 1066 1067 return SPDK_POLLER_BUSY; 1068 } 1069 1070 static void 1071 vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession) 1072 { 1073 struct spdk_vhost_session *vsession = &bvsession->vsession; 1074 struct spdk_vhost_virtqueue *vq; 1075 int i; 1076 1077 SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n"); 1078 for (i = 0; i < vsession->max_queues; i++) { 1079 vq = &vsession->virtqueue[i]; 1080 if (vq->intr == NULL) { 1081 break; 1082 } 1083 1084 SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n", 1085 i, vq->vring.kickfd); 1086 spdk_interrupt_unregister(&vq->intr); 1087 } 1088 } 1089 1090 static int 1091 vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession, 1092 spdk_interrupt_fn fn, const char *name) 1093 { 1094 struct spdk_vhost_session *vsession = &bvsession->vsession; 1095 struct spdk_vhost_virtqueue *vq = NULL; 1096 int i; 1097 1098 SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n"); 1099 for (i = 0; i < vsession->max_queues; i++) { 1100 vq = &vsession->virtqueue[i]; 1101 SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n", 1102 i, vq->vring.kickfd); 1103 1104 vq->intr = spdk_interrupt_register(vq->vring.kickfd, fn, vq, name); 1105 if (vq->intr == NULL) { 1106 SPDK_ERRLOG("Fail to register req notifier handler.\n"); 1107 goto err; 1108 } 1109 } 1110 1111 return 0; 1112 1113 err: 1114 vhost_blk_session_unregister_interrupts(bvsession); 1115 1116 return -1; 1117 } 1118 1119 static void 1120 vhost_blk_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode) 1121 { 1122 struct spdk_vhost_blk_session *bvsession = cb_arg; 1123 1124 vhost_user_session_set_interrupt_mode(&bvsession->vsession, interrupt_mode); 1125 } 1126 1127 static int 1128 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, 1129 struct spdk_vhost_session *vsession, 1130 void *ctx) 1131 { 1132 #if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0) 1133 SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); 1134 rte_vhost_slave_config_change(vsession->vid, false); 1135 #else 1136 SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); 1137 #endif 1138 1139 return 0; 1140 } 1141 1142 static void 1143 blk_resize_cb(void *resize_ctx) 1144 { 1145 struct spdk_vhost_blk_dev *bvdev = resize_ctx; 1146 1147 spdk_vhost_lock(); 1148 vhost_user_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, 1149 NULL, NULL); 1150 spdk_vhost_unlock(); 1151 } 1152 1153 static void 1154 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) 1155 { 1156 1157 /* All sessions have been notified, time to close the bdev */ 1158 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1159 1160 assert(bvdev != NULL); 1161 spdk_put_io_channel(bvdev->dummy_io_channel); 1162 spdk_bdev_close(bvdev->bdev_desc); 1163 bvdev->bdev_desc = NULL; 1164 bvdev->bdev = NULL; 1165 } 1166 1167 static int 1168 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, 1169 struct spdk_vhost_session *vsession, 1170 void *ctx) 1171 { 1172 struct spdk_vhost_blk_session *bvsession; 1173 int rc; 1174 1175 bvsession = to_blk_session(vsession); 1176 if (bvsession->requestq_poller) { 1177 spdk_poller_unregister(&bvsession->requestq_poller); 1178 if (vsession->virtqueue[0].intr) { 1179 vhost_blk_session_unregister_interrupts(bvsession); 1180 rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker, 1181 "no_bdev_vdev_vq_worker"); 1182 if (rc) { 1183 SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name); 1184 return rc; 1185 } 1186 } 1187 1188 bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); 1189 spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode, 1190 bvsession); 1191 } 1192 1193 return 0; 1194 } 1195 1196 static void 1197 bdev_remove_cb(void *remove_ctx) 1198 { 1199 struct spdk_vhost_blk_dev *bvdev = remove_ctx; 1200 1201 SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", 1202 bvdev->vdev.name); 1203 1204 spdk_vhost_lock(); 1205 vhost_user_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, 1206 vhost_dev_bdev_remove_cpl_cb, NULL); 1207 spdk_vhost_unlock(); 1208 } 1209 1210 static void 1211 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, 1212 void *event_ctx) 1213 { 1214 SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n", 1215 type, 1216 bdev->name); 1217 1218 switch (type) { 1219 case SPDK_BDEV_EVENT_REMOVE: 1220 SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); 1221 bdev_remove_cb(event_ctx); 1222 break; 1223 case SPDK_BDEV_EVENT_RESIZE: 1224 SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); 1225 blk_resize_cb(event_ctx); 1226 break; 1227 default: 1228 SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); 1229 break; 1230 } 1231 } 1232 1233 static void 1234 free_task_pool(struct spdk_vhost_blk_session *bvsession) 1235 { 1236 struct spdk_vhost_session *vsession = &bvsession->vsession; 1237 struct spdk_vhost_virtqueue *vq; 1238 uint16_t i; 1239 1240 for (i = 0; i < vsession->max_queues; i++) { 1241 vq = &vsession->virtqueue[i]; 1242 if (vq->tasks == NULL) { 1243 continue; 1244 } 1245 1246 spdk_free(vq->tasks); 1247 vq->tasks = NULL; 1248 } 1249 } 1250 1251 static int 1252 alloc_task_pool(struct spdk_vhost_blk_session *bvsession) 1253 { 1254 struct spdk_vhost_session *vsession = &bvsession->vsession; 1255 struct spdk_vhost_virtqueue *vq; 1256 struct spdk_vhost_user_blk_task *task; 1257 uint32_t task_cnt; 1258 uint16_t i; 1259 uint32_t j; 1260 1261 for (i = 0; i < vsession->max_queues; i++) { 1262 vq = &vsession->virtqueue[i]; 1263 if (vq->vring.desc == NULL) { 1264 continue; 1265 } 1266 1267 task_cnt = vq->vring.size; 1268 if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { 1269 /* sanity check */ 1270 SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", 1271 vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); 1272 free_task_pool(bvsession); 1273 return -1; 1274 } 1275 vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_user_blk_task) * task_cnt, 1276 SPDK_CACHE_LINE_SIZE, NULL, 1277 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1278 if (vq->tasks == NULL) { 1279 SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", 1280 vsession->name, task_cnt, i); 1281 free_task_pool(bvsession); 1282 return -1; 1283 } 1284 1285 for (j = 0; j < task_cnt; j++) { 1286 task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[j]; 1287 task->bvsession = bvsession; 1288 task->req_idx = j; 1289 task->vq = vq; 1290 } 1291 } 1292 1293 return 0; 1294 } 1295 1296 static int 1297 vhost_blk_start_cb(struct spdk_vhost_dev *vdev, 1298 struct spdk_vhost_session *vsession, void *unused) 1299 { 1300 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 1301 struct spdk_vhost_blk_dev *bvdev; 1302 int i, rc = 0; 1303 1304 bvdev = to_blk_dev(vdev); 1305 assert(bvdev != NULL); 1306 bvsession->bvdev = bvdev; 1307 1308 /* validate all I/O queues are in a contiguous index range */ 1309 for (i = 0; i < vsession->max_queues; i++) { 1310 /* vring.desc and vring.desc_packed are in a union struct 1311 * so q->vring.desc can replace q->vring.desc_packed. 1312 */ 1313 if (vsession->virtqueue[i].vring.desc == NULL) { 1314 SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); 1315 rc = -1; 1316 goto out; 1317 } 1318 } 1319 1320 rc = alloc_task_pool(bvsession); 1321 if (rc != 0) { 1322 SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); 1323 goto out; 1324 } 1325 1326 if (bvdev->bdev) { 1327 bvsession->io_channel = vhost_blk_get_io_channel(vdev); 1328 if (!bvsession->io_channel) { 1329 free_task_pool(bvsession); 1330 SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); 1331 rc = -1; 1332 goto out; 1333 } 1334 } 1335 1336 if (spdk_interrupt_mode_is_enabled()) { 1337 if (bvdev->bdev) { 1338 rc = vhost_blk_session_register_interrupts(bvsession, 1339 vdev_vq_worker, 1340 "vdev_vq_worker"); 1341 } else { 1342 rc = vhost_blk_session_register_interrupts(bvsession, 1343 no_bdev_vdev_vq_worker, 1344 "no_bdev_vdev_vq_worker"); 1345 } 1346 1347 if (rc) { 1348 SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name); 1349 goto out; 1350 } 1351 } 1352 1353 if (bvdev->bdev) { 1354 bvsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, bvsession, 0); 1355 } else { 1356 bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); 1357 } 1358 SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n", 1359 vsession->name, spdk_env_get_current_core()); 1360 1361 spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode, 1362 bvsession); 1363 1364 out: 1365 vhost_user_session_start_done(vsession, rc); 1366 return rc; 1367 } 1368 1369 static int 1370 vhost_blk_start(struct spdk_vhost_session *vsession) 1371 { 1372 return vhost_user_session_send_event(vsession, vhost_blk_start_cb, 1373 3, "start session"); 1374 } 1375 1376 static int 1377 destroy_session_poller_cb(void *arg) 1378 { 1379 struct spdk_vhost_blk_session *bvsession = arg; 1380 struct spdk_vhost_session *vsession = &bvsession->vsession; 1381 int i; 1382 1383 if (vsession->task_cnt > 0 || spdk_vhost_trylock() != 0) { 1384 assert(vsession->stop_retry_count > 0); 1385 vsession->stop_retry_count--; 1386 if (vsession->stop_retry_count == 0) { 1387 SPDK_ERRLOG("%s: Timedout when destroy session (task_cnt %d)\n", vsession->name, 1388 vsession->task_cnt); 1389 spdk_poller_unregister(&bvsession->stop_poller); 1390 vhost_user_session_stop_done(vsession, -ETIMEDOUT); 1391 } 1392 1393 return SPDK_POLLER_BUSY; 1394 } 1395 1396 for (i = 0; i < vsession->max_queues; i++) { 1397 vsession->virtqueue[i].next_event_time = 0; 1398 vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); 1399 } 1400 1401 SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n", 1402 vsession->name, spdk_env_get_current_core()); 1403 1404 if (bvsession->io_channel) { 1405 vhost_blk_put_io_channel(bvsession->io_channel); 1406 bvsession->io_channel = NULL; 1407 } 1408 1409 free_task_pool(bvsession); 1410 spdk_poller_unregister(&bvsession->stop_poller); 1411 vhost_user_session_stop_done(vsession, 0); 1412 1413 spdk_vhost_unlock(); 1414 return SPDK_POLLER_BUSY; 1415 } 1416 1417 static int 1418 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, 1419 struct spdk_vhost_session *vsession, void *unused) 1420 { 1421 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 1422 1423 spdk_poller_unregister(&bvsession->requestq_poller); 1424 1425 if (vsession->virtqueue[0].intr) { 1426 vhost_blk_session_unregister_interrupts(bvsession); 1427 } 1428 1429 /* vhost_user_session_send_event timeout is 3 seconds, here set retry within 4 seconds */ 1430 bvsession->vsession.stop_retry_count = 4000; 1431 bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, 1432 bvsession, 1000); 1433 return 0; 1434 } 1435 1436 static int 1437 vhost_blk_stop(struct spdk_vhost_session *vsession) 1438 { 1439 return vhost_user_session_send_event(vsession, vhost_blk_stop_cb, 1440 3, "stop session"); 1441 } 1442 1443 static void 1444 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 1445 { 1446 struct spdk_vhost_blk_dev *bvdev; 1447 1448 bvdev = to_blk_dev(vdev); 1449 assert(bvdev != NULL); 1450 1451 spdk_json_write_named_object_begin(w, "block"); 1452 1453 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 1454 1455 spdk_json_write_name(w, "bdev"); 1456 if (bvdev->bdev) { 1457 spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev)); 1458 } else { 1459 spdk_json_write_null(w); 1460 } 1461 1462 spdk_json_write_object_end(w); 1463 } 1464 1465 static void 1466 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 1467 { 1468 struct spdk_vhost_blk_dev *bvdev; 1469 1470 bvdev = to_blk_dev(vdev); 1471 assert(bvdev != NULL); 1472 1473 if (!bvdev->bdev) { 1474 return; 1475 } 1476 1477 spdk_json_write_object_begin(w); 1478 spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); 1479 1480 spdk_json_write_named_object_begin(w, "params"); 1481 spdk_json_write_named_string(w, "ctrlr", vdev->name); 1482 spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); 1483 spdk_json_write_named_string(w, "cpumask", 1484 spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); 1485 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 1486 spdk_json_write_object_end(w); 1487 1488 spdk_json_write_object_end(w); 1489 } 1490 1491 static int vhost_blk_destroy(struct spdk_vhost_dev *dev); 1492 1493 static int 1494 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, 1495 uint32_t len) 1496 { 1497 struct virtio_blk_config blkcfg; 1498 struct spdk_vhost_blk_dev *bvdev; 1499 struct spdk_bdev *bdev; 1500 uint32_t blk_size; 1501 uint64_t blkcnt; 1502 1503 memset(&blkcfg, 0, sizeof(blkcfg)); 1504 bvdev = to_blk_dev(vdev); 1505 assert(bvdev != NULL); 1506 bdev = bvdev->bdev; 1507 if (bdev == NULL) { 1508 /* We can't just return -1 here as this GET_CONFIG message might 1509 * be caused by a QEMU VM reboot. Returning -1 will indicate an 1510 * error to QEMU, who might then decide to terminate itself. 1511 * We don't want that. A simple reboot shouldn't break the system. 1512 * 1513 * Presenting a block device with block size 0 and block count 0 1514 * doesn't cause any problems on QEMU side and the virtio-pci 1515 * device is even still available inside the VM, but there will 1516 * be no block device created for it - the kernel drivers will 1517 * silently reject it. 1518 */ 1519 blk_size = 0; 1520 blkcnt = 0; 1521 } else { 1522 blk_size = spdk_bdev_get_block_size(bdev); 1523 blkcnt = spdk_bdev_get_num_blocks(bdev); 1524 if (spdk_bdev_get_buf_align(bdev) > 1) { 1525 blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; 1526 blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); 1527 } else { 1528 blkcfg.size_max = 131072; 1529 /* -2 for REQ and RESP and -1 for region boundary splitting */ 1530 blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; 1531 } 1532 } 1533 1534 blkcfg.blk_size = blk_size; 1535 /* minimum I/O size in blocks */ 1536 blkcfg.min_io_size = 1; 1537 /* expressed in 512 Bytes sectors */ 1538 blkcfg.capacity = (blkcnt * blk_size) / 512; 1539 /* QEMU can overwrite this value when started */ 1540 blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; 1541 1542 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1543 /* 16MiB, expressed in 512 Bytes */ 1544 blkcfg.max_discard_sectors = 32768; 1545 blkcfg.max_discard_seg = 1; 1546 blkcfg.discard_sector_alignment = blk_size / 512; 1547 } 1548 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1549 blkcfg.max_write_zeroes_sectors = 32768; 1550 blkcfg.max_write_zeroes_seg = 1; 1551 } 1552 1553 memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); 1554 1555 return 0; 1556 } 1557 1558 static const struct spdk_vhost_user_dev_backend vhost_blk_user_device_backend = { 1559 .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), 1560 .start_session = vhost_blk_start, 1561 .stop_session = vhost_blk_stop, 1562 }; 1563 1564 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { 1565 .type = VHOST_BACKEND_BLK, 1566 .vhost_get_config = vhost_blk_get_config, 1567 .dump_info_json = vhost_blk_dump_info_json, 1568 .write_config_json = vhost_blk_write_config_json, 1569 .remove_device = vhost_blk_destroy, 1570 }; 1571 1572 int 1573 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, 1574 const struct spdk_json_val *params) 1575 { 1576 struct rpc_vhost_blk req = {0}; 1577 struct spdk_vhost_blk_dev *bvdev = NULL; 1578 struct spdk_vhost_dev *vdev; 1579 struct spdk_bdev *bdev; 1580 int ret = 0; 1581 1582 spdk_vhost_lock(); 1583 1584 if (spdk_json_decode_object_relaxed(params, rpc_construct_vhost_blk, 1585 SPDK_COUNTOF(rpc_construct_vhost_blk), 1586 &req)) { 1587 SPDK_DEBUGLOG(vhost_blk, "spdk_json_decode_object failed\n"); 1588 ret = -EINVAL; 1589 goto out; 1590 } 1591 1592 bvdev = calloc(1, sizeof(*bvdev)); 1593 if (bvdev == NULL) { 1594 ret = -ENOMEM; 1595 goto out; 1596 } 1597 1598 ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); 1599 if (ret != 0) { 1600 SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", 1601 name, dev_name, ret); 1602 goto out; 1603 } 1604 bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc); 1605 1606 vdev = &bvdev->vdev; 1607 vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; 1608 vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; 1609 vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; 1610 vdev->packed_ring_recovery = false; 1611 1612 if (req.packed_ring) { 1613 vdev->virtio_features |= (uint64_t)req.packed_ring << VIRTIO_F_RING_PACKED; 1614 vdev->packed_ring_recovery = req.packed_ring_recovery; 1615 } 1616 1617 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1618 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); 1619 } 1620 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1621 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); 1622 } 1623 if (req.readonly) { 1624 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); 1625 } 1626 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { 1627 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); 1628 } 1629 1630 /* 1631 * When starting qemu with multiqueue enable, the vhost device will 1632 * be started/stopped many times, related to the queues num, as the 1633 * exact number of queues used for this device is not known at the time. 1634 * The target has to stop and start the device once got a valid IO queue. 1635 * When stoping and starting the vhost device, the backend bdev io device 1636 * will be deleted and created repeatedly. 1637 * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that 1638 * the io device will not be deleted. 1639 */ 1640 bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); 1641 1642 bvdev->bdev = bdev; 1643 bvdev->readonly = req.readonly; 1644 ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend, 1645 &vhost_blk_user_device_backend); 1646 if (ret != 0) { 1647 spdk_put_io_channel(bvdev->dummy_io_channel); 1648 spdk_bdev_close(bvdev->bdev_desc); 1649 goto out; 1650 } 1651 1652 SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name); 1653 out: 1654 if (ret != 0 && bvdev) { 1655 free(bvdev); 1656 } 1657 spdk_vhost_unlock(); 1658 return ret; 1659 } 1660 1661 static int 1662 vhost_blk_destroy(struct spdk_vhost_dev *vdev) 1663 { 1664 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1665 int rc; 1666 1667 assert(bvdev != NULL); 1668 1669 rc = vhost_dev_unregister(&bvdev->vdev); 1670 if (rc != 0) { 1671 return rc; 1672 } 1673 1674 /* if the bdev is removed, don't need call spdk_put_io_channel. */ 1675 if (bvdev->bdev) { 1676 spdk_put_io_channel(bvdev->dummy_io_channel); 1677 } 1678 1679 if (bvdev->bdev_desc) { 1680 spdk_bdev_close(bvdev->bdev_desc); 1681 bvdev->bdev_desc = NULL; 1682 } 1683 bvdev->bdev = NULL; 1684 1685 free(bvdev); 1686 return 0; 1687 } 1688 1689 struct spdk_io_channel * 1690 vhost_blk_get_io_channel(struct spdk_vhost_dev *vdev) 1691 { 1692 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1693 1694 return spdk_bdev_get_io_channel(bvdev->bdev_desc); 1695 } 1696 1697 void 1698 vhost_blk_put_io_channel(struct spdk_io_channel *ch) 1699 { 1700 spdk_put_io_channel(ch); 1701 } 1702 1703 SPDK_LOG_REGISTER_COMPONENT(vhost_blk) 1704 SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data) 1705