1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <linux/virtio_blk.h> 35 36 #include "spdk/env.h" 37 #include "spdk/bdev.h" 38 #include "spdk/bdev_module.h" 39 #include "spdk/conf.h" 40 #include "spdk/thread.h" 41 #include "spdk/likely.h" 42 #include "spdk/string.h" 43 #include "spdk/util.h" 44 #include "spdk/vhost.h" 45 46 #include "vhost_internal.h" 47 48 /* Minimal set of features supported by every SPDK VHOST-BLK device */ 49 #define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ 50 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ 51 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ 52 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ 53 (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 54 (1ULL << VIRTIO_BLK_F_MQ)) 55 56 /* Not supported features */ 57 #define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ 58 (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ 59 (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) 60 61 /* Vhost-blk support protocol features */ 62 #ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB 63 #define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ 64 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) 65 #else 66 #define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG) 67 #endif 68 69 struct spdk_vhost_blk_task { 70 struct spdk_bdev_io *bdev_io; 71 struct spdk_vhost_blk_session *bvsession; 72 struct spdk_vhost_virtqueue *vq; 73 74 volatile uint8_t *status; 75 76 uint16_t req_idx; 77 78 /* for io wait */ 79 struct spdk_bdev_io_wait_entry bdev_io_wait; 80 81 /* If set, the task is currently used for I/O processing. */ 82 bool used; 83 84 /** Number of bytes that were written. */ 85 uint32_t used_len; 86 uint16_t iovcnt; 87 struct iovec iovs[SPDK_VHOST_IOVS_MAX]; 88 }; 89 90 struct spdk_vhost_blk_dev { 91 struct spdk_vhost_dev vdev; 92 struct spdk_bdev *bdev; 93 struct spdk_bdev_desc *bdev_desc; 94 bool readonly; 95 }; 96 97 struct spdk_vhost_blk_session { 98 /* The parent session must be the very first field in this struct */ 99 struct spdk_vhost_session vsession; 100 struct spdk_vhost_blk_dev *bvdev; 101 struct spdk_poller *requestq_poller; 102 struct spdk_io_channel *io_channel; 103 struct spdk_poller *stop_poller; 104 }; 105 106 /* forward declaration */ 107 static const struct spdk_vhost_dev_backend vhost_blk_device_backend; 108 109 static int 110 process_blk_request(struct spdk_vhost_blk_task *task, 111 struct spdk_vhost_blk_session *bvsession, 112 struct spdk_vhost_virtqueue *vq); 113 114 static void 115 blk_task_finish(struct spdk_vhost_blk_task *task) 116 { 117 assert(task->bvsession->vsession.task_cnt > 0); 118 task->bvsession->vsession.task_cnt--; 119 task->used = false; 120 } 121 122 static void 123 blk_task_init(struct spdk_vhost_blk_task *task) 124 { 125 task->used = true; 126 task->iovcnt = SPDK_COUNTOF(task->iovs); 127 task->status = NULL; 128 task->used_len = 0; 129 } 130 131 static void 132 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) 133 { 134 if (task->status) { 135 *task->status = status; 136 } 137 138 vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx, 139 task->used_len); 140 blk_task_finish(task); 141 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); 142 } 143 144 /* 145 * Process task's descriptor chain and setup data related fields. 146 * Return 147 * total size of suplied buffers 148 * 149 * FIXME: Make this function return to rd_cnt and wr_cnt 150 */ 151 static int 152 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq, 153 uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) 154 { 155 struct spdk_vhost_session *vsession = &bvsession->vsession; 156 struct spdk_vhost_dev *vdev = vsession->vdev; 157 struct vring_desc *desc, *desc_table; 158 uint16_t out_cnt = 0, cnt = 0; 159 uint32_t desc_table_size, len = 0; 160 uint32_t desc_handled_cnt; 161 int rc; 162 163 rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); 164 if (rc != 0) { 165 SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); 166 return -1; 167 } 168 169 desc_handled_cnt = 0; 170 while (1) { 171 /* 172 * Maximum cnt reached? 173 * Should not happen if request is well formatted, otherwise this is a BUG. 174 */ 175 if (spdk_unlikely(cnt == *iovs_cnt)) { 176 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", 177 vsession->name, req_idx); 178 return -1; 179 } 180 181 if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { 182 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", 183 vsession->name, req_idx, cnt); 184 return -1; 185 } 186 187 len += desc->len; 188 189 out_cnt += vhost_vring_desc_is_wr(desc); 190 191 rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); 192 if (rc != 0) { 193 SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", 194 vsession->name, req_idx); 195 return -1; 196 } else if (desc == NULL) { 197 break; 198 } 199 200 desc_handled_cnt++; 201 if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { 202 /* Break a cycle and report an error, if any. */ 203 SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", 204 vsession->name, desc_table_size, desc_handled_cnt); 205 return -1; 206 } 207 } 208 209 /* 210 * There must be least two descriptors. 211 * First contain request so it must be readable. 212 * Last descriptor contain buffer for response so it must be writable. 213 */ 214 if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { 215 return -1; 216 } 217 218 *length = len; 219 *iovs_cnt = cnt; 220 return 0; 221 } 222 223 static void 224 blk_request_finish(bool success, struct spdk_vhost_blk_task *task) 225 { 226 *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; 227 vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx, 228 task->used_len); 229 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, 230 task->req_idx, success ? "OK" : "FAIL"); 231 blk_task_finish(task); 232 } 233 234 static void 235 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 236 { 237 struct spdk_vhost_blk_task *task = cb_arg; 238 239 spdk_bdev_free_io(bdev_io); 240 blk_request_finish(success, task); 241 } 242 243 static void 244 blk_request_resubmit(void *arg) 245 { 246 struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; 247 int rc = 0; 248 249 blk_task_init(task); 250 251 rc = process_blk_request(task, task->bvsession, task->vq); 252 if (rc == 0) { 253 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); 254 } else { 255 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); 256 } 257 } 258 259 static inline void 260 blk_request_queue_io(struct spdk_vhost_blk_task *task) 261 { 262 int rc; 263 struct spdk_vhost_blk_session *bvsession = task->bvsession; 264 struct spdk_bdev *bdev = bvsession->bvdev->bdev; 265 266 task->bdev_io_wait.bdev = bdev; 267 task->bdev_io_wait.cb_fn = blk_request_resubmit; 268 task->bdev_io_wait.cb_arg = task; 269 270 rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait); 271 if (rc != 0) { 272 SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc); 273 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 274 } 275 } 276 277 static int 278 process_blk_request(struct spdk_vhost_blk_task *task, 279 struct spdk_vhost_blk_session *bvsession, 280 struct spdk_vhost_virtqueue *vq) 281 { 282 struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev; 283 const struct virtio_blk_outhdr *req; 284 struct virtio_blk_discard_write_zeroes *desc; 285 struct iovec *iov; 286 uint32_t type; 287 uint32_t payload_len; 288 uint64_t flush_bytes; 289 int rc; 290 291 if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) { 292 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); 293 /* Only READ and WRITE are supported for now. */ 294 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 295 return -1; 296 } 297 298 iov = &task->iovs[0]; 299 if (spdk_unlikely(iov->iov_len != sizeof(*req))) { 300 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, 301 "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", 302 iov->iov_len, sizeof(*req), task->req_idx); 303 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 304 return -1; 305 } 306 307 req = iov->iov_base; 308 309 iov = &task->iovs[task->iovcnt - 1]; 310 if (spdk_unlikely(iov->iov_len != 1)) { 311 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, 312 "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", 313 iov->iov_len, 1, task->req_idx); 314 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 315 return -1; 316 } 317 318 task->status = iov->iov_base; 319 payload_len -= sizeof(*req) + sizeof(*task->status); 320 task->iovcnt -= 2; 321 322 type = req->type; 323 #ifdef VIRTIO_BLK_T_BARRIER 324 /* Don't care about barier for now (as QEMU's virtio-blk do). */ 325 type &= ~VIRTIO_BLK_T_BARRIER; 326 #endif 327 328 switch (type) { 329 case VIRTIO_BLK_T_IN: 330 case VIRTIO_BLK_T_OUT: 331 if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { 332 SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", 333 type ? "WRITE" : "READ", task->req_idx); 334 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 335 return -1; 336 } 337 338 if (type == VIRTIO_BLK_T_IN) { 339 task->used_len = payload_len + sizeof(*task->status); 340 rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel, 341 &task->iovs[1], task->iovcnt, req->sector * 512, 342 payload_len, blk_request_complete_cb, task); 343 } else if (!bvdev->readonly) { 344 task->used_len = sizeof(*task->status); 345 rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel, 346 &task->iovs[1], task->iovcnt, req->sector * 512, 347 payload_len, blk_request_complete_cb, task); 348 } else { 349 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); 350 rc = -1; 351 } 352 353 if (rc) { 354 if (rc == -ENOMEM) { 355 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); 356 blk_request_queue_io(task); 357 } else { 358 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 359 return -1; 360 } 361 } 362 break; 363 case VIRTIO_BLK_T_DISCARD: 364 desc = task->iovs[1].iov_base; 365 if (payload_len != sizeof(*desc)) { 366 SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); 367 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 368 return -1; 369 } 370 371 rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel, 372 desc->sector * 512, desc->num_sectors * 512, 373 blk_request_complete_cb, task); 374 if (rc) { 375 if (rc == -ENOMEM) { 376 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); 377 blk_request_queue_io(task); 378 } else { 379 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 380 return -1; 381 } 382 } 383 break; 384 case VIRTIO_BLK_T_WRITE_ZEROES: 385 desc = task->iovs[1].iov_base; 386 if (payload_len != sizeof(*desc)) { 387 SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); 388 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 389 return -1; 390 } 391 392 /* Zeroed and Unmap the range, SPDK doen't support it. */ 393 if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 394 SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n"); 395 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 396 return -1; 397 } 398 399 rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel, 400 desc->sector * 512, desc->num_sectors * 512, 401 blk_request_complete_cb, task); 402 if (rc) { 403 if (rc == -ENOMEM) { 404 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); 405 blk_request_queue_io(task); 406 } else { 407 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 408 return -1; 409 } 410 } 411 break; 412 case VIRTIO_BLK_T_FLUSH: 413 flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev); 414 if (req->sector != 0) { 415 SPDK_NOTICELOG("sector must be zero for flush command\n"); 416 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 417 return -1; 418 } 419 rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel, 420 0, flush_bytes, 421 blk_request_complete_cb, task); 422 if (rc) { 423 if (rc == -ENOMEM) { 424 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); 425 blk_request_queue_io(task); 426 } else { 427 invalid_blk_request(task, VIRTIO_BLK_S_IOERR); 428 return -1; 429 } 430 } 431 break; 432 case VIRTIO_BLK_T_GET_ID: 433 if (!task->iovcnt || !payload_len) { 434 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 435 return -1; 436 } 437 task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); 438 spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev), 439 task->used_len, ' '); 440 blk_request_finish(true, task); 441 break; 442 default: 443 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type); 444 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); 445 return -1; 446 } 447 448 return 0; 449 } 450 451 static void 452 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx) 453 { 454 struct spdk_vhost_blk_task *task; 455 456 task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx]; 457 if (spdk_unlikely(task->used)) { 458 SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", 459 task->bvsession->vsession.name, req_idx); 460 task->used_len = 0; 461 vhost_vq_used_ring_enqueue(&task->bvsession->vsession, vq, req_idx, 0); 462 return; 463 } 464 465 task->bvsession->vsession.task_cnt++; 466 467 blk_task_init(task); 468 469 if (process_blk_request(task, task->bvsession, vq) == 0) { 470 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, 471 req_idx); 472 } else { 473 SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx); 474 } 475 } 476 477 static void 478 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, 479 struct spdk_vhost_virtqueue *vq) 480 { 481 struct spdk_vhost_session *vsession = &bvsession->vsession; 482 spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight; 483 spdk_vhost_resubmit_desc *resubmit_list; 484 uint16_t req_idx; 485 486 if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) { 487 return; 488 } 489 490 resubmit_list = resubmit->resubmit_list; 491 while (resubmit->resubmit_num-- > 0) { 492 req_idx = resubmit_list[resubmit->resubmit_num].index; 493 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n", 494 req_idx); 495 496 if (spdk_unlikely(req_idx >= vq->vring.size)) { 497 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 498 vsession->name, req_idx, vq->vring.size); 499 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 500 continue; 501 } 502 503 process_blk_task(vq, req_idx); 504 } 505 506 free(resubmit_list); 507 resubmit->resubmit_list = NULL; 508 } 509 510 static void 511 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 512 { 513 struct spdk_vhost_session *vsession = &bvsession->vsession; 514 uint16_t reqs[32]; 515 uint16_t reqs_cnt, i; 516 uint16_t vq_idx = vq->vring_idx; 517 518 submit_inflight_desc(bvsession, vq); 519 520 reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); 521 if (!reqs_cnt) { 522 return; 523 } 524 525 for (i = 0; i < reqs_cnt; i++) { 526 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", 527 reqs[i]); 528 529 if (spdk_unlikely(reqs[i] >= vq->vring.size)) { 530 SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", 531 vsession->name, reqs[i], vq->vring.size); 532 vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); 533 continue; 534 } 535 536 rte_vhost_set_inflight_desc_split(vsession->vid, vq_idx, reqs[i]); 537 538 process_blk_task(vq, reqs[i]); 539 } 540 } 541 542 static int 543 vdev_worker(void *arg) 544 { 545 struct spdk_vhost_blk_session *bvsession = arg; 546 struct spdk_vhost_session *vsession = &bvsession->vsession; 547 548 uint16_t q_idx; 549 550 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 551 process_vq(bvsession, &vsession->virtqueue[q_idx]); 552 } 553 554 vhost_session_used_signal(vsession); 555 556 return -1; 557 } 558 559 static void 560 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) 561 { 562 struct spdk_vhost_session *vsession = &bvsession->vsession; 563 struct iovec iovs[SPDK_VHOST_IOVS_MAX]; 564 uint32_t length; 565 uint16_t iovcnt, req_idx; 566 567 if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { 568 return; 569 } 570 571 iovcnt = SPDK_COUNTOF(iovs); 572 if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { 573 *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; 574 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); 575 } 576 577 vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); 578 } 579 580 static int 581 no_bdev_vdev_worker(void *arg) 582 { 583 struct spdk_vhost_blk_session *bvsession = arg; 584 struct spdk_vhost_session *vsession = &bvsession->vsession; 585 uint16_t q_idx; 586 587 for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { 588 no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]); 589 } 590 591 vhost_session_used_signal(vsession); 592 593 if (vsession->task_cnt == 0 && bvsession->io_channel) { 594 spdk_put_io_channel(bvsession->io_channel); 595 bvsession->io_channel = NULL; 596 } 597 598 return -1; 599 } 600 601 static struct spdk_vhost_blk_session * 602 to_blk_session(struct spdk_vhost_session *vsession) 603 { 604 assert(vsession->vdev->backend == &vhost_blk_device_backend); 605 return (struct spdk_vhost_blk_session *)vsession; 606 } 607 608 static struct spdk_vhost_blk_dev * 609 to_blk_dev(struct spdk_vhost_dev *vdev) 610 { 611 if (vdev == NULL) { 612 return NULL; 613 } 614 615 if (vdev->backend != &vhost_blk_device_backend) { 616 SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); 617 return NULL; 618 } 619 620 return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); 621 } 622 623 struct spdk_bdev * 624 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev) 625 { 626 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 627 628 assert(bvdev != NULL); 629 return bvdev->bdev; 630 } 631 632 static void 633 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) 634 { 635 636 /* All sessions have been notified, time to close the bdev */ 637 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 638 639 assert(bvdev != NULL); 640 spdk_bdev_close(bvdev->bdev_desc); 641 bvdev->bdev_desc = NULL; 642 bvdev->bdev = NULL; 643 } 644 645 static int 646 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, 647 struct spdk_vhost_session *vsession, 648 void *ctx) 649 { 650 struct spdk_vhost_blk_session *bvsession; 651 652 bvsession = (struct spdk_vhost_blk_session *)vsession; 653 if (bvsession->requestq_poller) { 654 spdk_poller_unregister(&bvsession->requestq_poller); 655 bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0); 656 } 657 658 return 0; 659 } 660 661 static void 662 bdev_remove_cb(void *remove_ctx) 663 { 664 struct spdk_vhost_blk_dev *bvdev = remove_ctx; 665 666 SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", 667 bvdev->vdev.name); 668 669 spdk_vhost_lock(); 670 vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, 671 vhost_dev_bdev_remove_cpl_cb, NULL); 672 spdk_vhost_unlock(); 673 } 674 675 static void 676 free_task_pool(struct spdk_vhost_blk_session *bvsession) 677 { 678 struct spdk_vhost_session *vsession = &bvsession->vsession; 679 struct spdk_vhost_virtqueue *vq; 680 uint16_t i; 681 682 for (i = 0; i < vsession->max_queues; i++) { 683 vq = &vsession->virtqueue[i]; 684 if (vq->tasks == NULL) { 685 continue; 686 } 687 688 spdk_free(vq->tasks); 689 vq->tasks = NULL; 690 } 691 } 692 693 static int 694 alloc_task_pool(struct spdk_vhost_blk_session *bvsession) 695 { 696 struct spdk_vhost_session *vsession = &bvsession->vsession; 697 struct spdk_vhost_virtqueue *vq; 698 struct spdk_vhost_blk_task *task; 699 uint32_t task_cnt; 700 uint16_t i; 701 uint32_t j; 702 703 for (i = 0; i < vsession->max_queues; i++) { 704 vq = &vsession->virtqueue[i]; 705 if (vq->vring.desc == NULL) { 706 continue; 707 } 708 709 task_cnt = vq->vring.size; 710 if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { 711 /* sanity check */ 712 SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", 713 vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); 714 free_task_pool(bvsession); 715 return -1; 716 } 717 vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, 718 SPDK_CACHE_LINE_SIZE, NULL, 719 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 720 if (vq->tasks == NULL) { 721 SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", 722 vsession->name, task_cnt, i); 723 free_task_pool(bvsession); 724 return -1; 725 } 726 727 for (j = 0; j < task_cnt; j++) { 728 task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; 729 task->bvsession = bvsession; 730 task->req_idx = j; 731 task->vq = vq; 732 } 733 } 734 735 return 0; 736 } 737 738 static int 739 vhost_blk_start_cb(struct spdk_vhost_dev *vdev, 740 struct spdk_vhost_session *vsession, void *unused) 741 { 742 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 743 struct spdk_vhost_blk_dev *bvdev; 744 int i, rc = 0; 745 746 bvdev = to_blk_dev(vdev); 747 assert(bvdev != NULL); 748 bvsession->bvdev = bvdev; 749 750 /* validate all I/O queues are in a contiguous index range */ 751 for (i = 0; i < vsession->max_queues; i++) { 752 if (vsession->virtqueue[i].vring.desc == NULL) { 753 SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); 754 rc = -1; 755 goto out; 756 } 757 } 758 759 rc = alloc_task_pool(bvsession); 760 if (rc != 0) { 761 SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); 762 goto out; 763 } 764 765 if (bvdev->bdev) { 766 bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); 767 if (!bvsession->io_channel) { 768 free_task_pool(bvsession); 769 SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); 770 rc = -1; 771 goto out; 772 } 773 } 774 775 bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker, 776 bvsession, 0); 777 SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", 778 vsession->name, spdk_env_get_current_core()); 779 out: 780 vhost_session_start_done(vsession, rc); 781 return rc; 782 } 783 784 static int 785 vhost_blk_start(struct spdk_vhost_session *vsession) 786 { 787 return vhost_session_send_event(vsession, vhost_blk_start_cb, 788 3, "start session"); 789 } 790 791 static int 792 destroy_session_poller_cb(void *arg) 793 { 794 struct spdk_vhost_blk_session *bvsession = arg; 795 struct spdk_vhost_session *vsession = &bvsession->vsession; 796 int i; 797 798 if (vsession->task_cnt > 0) { 799 return -1; 800 } 801 802 if (spdk_vhost_trylock() != 0) { 803 return -1; 804 } 805 806 for (i = 0; i < vsession->max_queues; i++) { 807 vsession->virtqueue[i].next_event_time = 0; 808 vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); 809 } 810 811 SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", 812 vsession->name, spdk_env_get_current_core()); 813 814 if (bvsession->io_channel) { 815 spdk_put_io_channel(bvsession->io_channel); 816 bvsession->io_channel = NULL; 817 } 818 819 free_task_pool(bvsession); 820 spdk_poller_unregister(&bvsession->stop_poller); 821 vhost_session_stop_done(vsession, 0); 822 823 spdk_vhost_unlock(); 824 return -1; 825 } 826 827 static int 828 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, 829 struct spdk_vhost_session *vsession, void *unused) 830 { 831 struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); 832 833 spdk_poller_unregister(&bvsession->requestq_poller); 834 bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb, 835 bvsession, 1000); 836 return 0; 837 } 838 839 static int 840 vhost_blk_stop(struct spdk_vhost_session *vsession) 841 { 842 return vhost_session_send_event(vsession, vhost_blk_stop_cb, 843 3, "stop session"); 844 } 845 846 static void 847 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 848 { 849 struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev); 850 struct spdk_vhost_blk_dev *bvdev; 851 852 bvdev = to_blk_dev(vdev); 853 assert(bvdev != NULL); 854 spdk_json_write_named_object_begin(w, "block"); 855 856 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 857 858 spdk_json_write_name(w, "bdev"); 859 if (bdev) { 860 spdk_json_write_string(w, spdk_bdev_get_name(bdev)); 861 } else { 862 spdk_json_write_null(w); 863 } 864 865 spdk_json_write_object_end(w); 866 } 867 868 static void 869 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) 870 { 871 struct spdk_vhost_blk_dev *bvdev; 872 873 bvdev = to_blk_dev(vdev); 874 assert(bvdev != NULL); 875 if (!bvdev->bdev) { 876 return; 877 } 878 879 spdk_json_write_object_begin(w); 880 spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); 881 882 spdk_json_write_named_object_begin(w, "params"); 883 spdk_json_write_named_string(w, "ctrlr", vdev->name); 884 spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); 885 spdk_json_write_named_string(w, "cpumask", 886 spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); 887 spdk_json_write_named_bool(w, "readonly", bvdev->readonly); 888 spdk_json_write_object_end(w); 889 890 spdk_json_write_object_end(w); 891 } 892 893 static int vhost_blk_destroy(struct spdk_vhost_dev *dev); 894 895 static int 896 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, 897 uint32_t len) 898 { 899 struct virtio_blk_config blkcfg; 900 struct spdk_vhost_blk_dev *bvdev; 901 struct spdk_bdev *bdev; 902 uint32_t blk_size; 903 uint64_t blkcnt; 904 905 memset(&blkcfg, 0, sizeof(blkcfg)); 906 bvdev = to_blk_dev(vdev); 907 assert(bvdev != NULL); 908 bdev = bvdev->bdev; 909 if (bdev == NULL) { 910 /* We can't just return -1 here as this GET_CONFIG message might 911 * be caused by a QEMU VM reboot. Returning -1 will indicate an 912 * error to QEMU, who might then decide to terminate itself. 913 * We don't want that. A simple reboot shouldn't break the system. 914 * 915 * Presenting a block device with block size 0 and block count 0 916 * doesn't cause any problems on QEMU side and the virtio-pci 917 * device is even still available inside the VM, but there will 918 * be no block device created for it - the kernel drivers will 919 * silently reject it. 920 */ 921 blk_size = 0; 922 blkcnt = 0; 923 } else { 924 blk_size = spdk_bdev_get_block_size(bdev); 925 blkcnt = spdk_bdev_get_num_blocks(bdev); 926 if (spdk_bdev_get_buf_align(bdev) > 1) { 927 blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; 928 blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); 929 } else { 930 blkcfg.size_max = 131072; 931 /* -2 for REQ and RESP and -1 for region boundary splitting */ 932 blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; 933 } 934 } 935 936 blkcfg.blk_size = blk_size; 937 /* minimum I/O size in blocks */ 938 blkcfg.min_io_size = 1; 939 /* expressed in 512 Bytes sectors */ 940 blkcfg.capacity = (blkcnt * blk_size) / 512; 941 /* QEMU can overwrite this value when started */ 942 blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; 943 944 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 945 /* 16MiB, expressed in 512 Bytes */ 946 blkcfg.max_discard_sectors = 32768; 947 blkcfg.max_discard_seg = 1; 948 blkcfg.discard_sector_alignment = blk_size / 512; 949 } 950 if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 951 blkcfg.max_write_zeroes_sectors = 32768; 952 blkcfg.max_write_zeroes_seg = 1; 953 } 954 955 memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); 956 957 return 0; 958 } 959 960 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { 961 .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), 962 .start_session = vhost_blk_start, 963 .stop_session = vhost_blk_stop, 964 .vhost_get_config = vhost_blk_get_config, 965 .dump_info_json = vhost_blk_dump_info_json, 966 .write_config_json = vhost_blk_write_config_json, 967 .remove_device = vhost_blk_destroy, 968 }; 969 970 int 971 vhost_blk_controller_construct(void) 972 { 973 struct spdk_conf_section *sp; 974 unsigned ctrlr_num; 975 char *bdev_name; 976 char *cpumask; 977 char *name; 978 bool readonly; 979 980 for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { 981 if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { 982 continue; 983 } 984 985 if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { 986 SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", 987 spdk_conf_section_get_name(sp)); 988 return -1; 989 } 990 991 name = spdk_conf_section_get_val(sp, "Name"); 992 if (name == NULL) { 993 SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); 994 return -1; 995 } 996 997 cpumask = spdk_conf_section_get_val(sp, "Cpumask"); 998 readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); 999 1000 bdev_name = spdk_conf_section_get_val(sp, "Dev"); 1001 if (bdev_name == NULL) { 1002 continue; 1003 } 1004 1005 if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) { 1006 return -1; 1007 } 1008 } 1009 1010 return 0; 1011 } 1012 1013 int 1014 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly) 1015 { 1016 struct spdk_vhost_blk_dev *bvdev = NULL; 1017 struct spdk_vhost_dev *vdev; 1018 struct spdk_bdev *bdev; 1019 int ret = 0; 1020 1021 spdk_vhost_lock(); 1022 bdev = spdk_bdev_get_by_name(dev_name); 1023 if (bdev == NULL) { 1024 SPDK_ERRLOG("%s: bdev '%s' not found\n", 1025 name, dev_name); 1026 ret = -ENODEV; 1027 goto out; 1028 } 1029 1030 bvdev = calloc(1, sizeof(*bvdev)); 1031 if (bvdev == NULL) { 1032 ret = -ENOMEM; 1033 goto out; 1034 } 1035 1036 vdev = &bvdev->vdev; 1037 vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; 1038 vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; 1039 vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; 1040 1041 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1042 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); 1043 } 1044 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1045 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); 1046 } 1047 if (readonly) { 1048 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); 1049 } 1050 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { 1051 vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); 1052 } 1053 1054 ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc); 1055 if (ret != 0) { 1056 SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", 1057 name, dev_name, ret); 1058 goto out; 1059 } 1060 1061 bvdev->bdev = bdev; 1062 bvdev->readonly = readonly; 1063 ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend); 1064 if (ret != 0) { 1065 spdk_bdev_close(bvdev->bdev_desc); 1066 goto out; 1067 } 1068 1069 SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name); 1070 out: 1071 if (ret != 0 && bvdev) { 1072 free(bvdev); 1073 } 1074 spdk_vhost_unlock(); 1075 return ret; 1076 } 1077 1078 static int 1079 vhost_blk_destroy(struct spdk_vhost_dev *vdev) 1080 { 1081 struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); 1082 int rc; 1083 1084 assert(bvdev != NULL); 1085 rc = vhost_dev_unregister(&bvdev->vdev); 1086 if (rc != 0) { 1087 return rc; 1088 } 1089 1090 if (bvdev->bdev_desc) { 1091 spdk_bdev_close(bvdev->bdev_desc); 1092 bvdev->bdev_desc = NULL; 1093 } 1094 bvdev->bdev = NULL; 1095 1096 free(bvdev); 1097 return 0; 1098 } 1099 1100 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) 1101 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) 1102