/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "vbdev_delay.h"
#include "spdk/rpc.h"
#include "spdk/env.h"
#include "spdk/endian.h"
#include "spdk/string.h"
#include "spdk/thread.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

static int vbdev_delay_init(void);
static int vbdev_delay_get_ctx_size(void);
static void vbdev_delay_examine(struct spdk_bdev *bdev);
static void vbdev_delay_finish(void);
static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);

static struct spdk_bdev_module delay_if = {
	.name = "delay",
	.module_init = vbdev_delay_init,
	.get_ctx_size = vbdev_delay_get_ctx_size,
	.examine_config = vbdev_delay_examine,
	.module_fini = vbdev_delay_finish,
	.config_json = vbdev_delay_config_json
};

SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)

/* Associative list to be used in examine */
struct bdev_association {
	char *vbdev_name;
	char *bdev_name;
	uint64_t avg_read_latency;
	uint64_t p99_read_latency;
	uint64_t avg_write_latency;
	uint64_t p99_write_latency;
	TAILQ_ENTRY(bdev_association) link;
};
static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
			g_bdev_associations);
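/* An association is normally created by the bdev_delay_create RPC; the
 * config_json callback below re-emits exactly these parameters so the state can
 * be replayed. A hypothetical invocation (flag spellings are illustrative
 * assumptions; the parameter names themselves match _delay_write_conf_values())
 * might look like:
 *
 *   rpc.py bdev_delay_create -b Malloc0 -d Delay0 \
 *          --avg-read-latency 100 --p99-read-latency 500 \
 *          --avg-write-latency 200 --p99-write-latency 1000
 *
 * Latencies are given in microseconds and converted to ticks at registration.
 */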
/* List of virtual bdevs and associated info for each. */
struct vbdev_delay {
	struct spdk_bdev *base_bdev; /* the thing we're attaching to */
	struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
	struct spdk_bdev delay_bdev; /* the delay virtual bdev */
	uint64_t average_read_latency_ticks; /* the average read delay */
	uint64_t p99_read_latency_ticks; /* the p99 read delay */
	uint64_t average_write_latency_ticks; /* the average write delay */
	uint64_t p99_write_latency_ticks; /* the p99 write delay */
	TAILQ_ENTRY(vbdev_delay) link;
	struct spdk_thread *thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);

struct delay_bdev_io {
	int status;

	uint64_t completion_tick;

	enum delay_io_type type;

	struct spdk_io_channel *ch;

	struct spdk_bdev_io_wait_entry bdev_io_wait;

	struct spdk_bdev_io *zcopy_bdev_io;

	STAILQ_ENTRY(delay_bdev_io) link;
};

struct delay_io_channel {
	struct spdk_io_channel *base_ch; /* IO channel of base device */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;
	struct spdk_poller *io_poller;
	unsigned int rand_seed;
};

static void vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);

/* Callback for unregistering the IO device. */
static void
_device_unregister_cb(void *io_device)
{
	struct vbdev_delay *delay_node = io_device;

	/* Done with this delay_node. */
	free(delay_node->delay_bdev.name);
	free(delay_node);
}

static void
_vbdev_delay_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}

static int
vbdev_delay_destruct(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* It is important to follow this exact sequence of steps for destroying
	 * a vbdev...
	 */

	TAILQ_REMOVE(&g_delay_nodes, delay_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(delay_node->base_bdev);

	/* Close the underlying bdev on the same thread where it was opened. */
	if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
	} else {
		spdk_bdev_close(delay_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(delay_node, _device_unregister_cb);

	return 0;
}

static int
_process_io_stailq(void *arg, uint64_t ticks)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;
	int completions = 0;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		if (io_ctx->completion_tick <= ticks) {
			STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
			completions++;
		} else {
			/* In the general case, I/O will become ready in FIFO order. When timeouts are
			 * dynamically changed, this is not necessarily the case. However, the normal
			 * behavior will be restored after the I/O outstanding at the time of the change
			 * has been completed. This essentially means that moving from a high to a low
			 * latency creates a dam for the new I/O submitted after the latency change.
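			 * For example, if the average latency is lowered from 1 ms to 100 us while I/O
			 * is in flight, the 1 ms entries already queued sit at the head of the list, and
			 * since this loop breaks at the first unexpired entry, the newer 100 us I/O
			 * behind them cannot complete until those older deadlines pass.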
			 * This is considered desirable behavior for the use case where we are trying
			 * to trigger a pre-defined timeout on an initiator.
			 */
			break;
		}
	}

	return completions;
}

static int
_delay_finish_io(void *arg)
{
	struct delay_io_channel *delay_ch = arg;
	uint64_t ticks = spdk_get_ticks();
	int completions = 0;

	completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);

	return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

/* Completion callback for I/O that was issued from this bdev. The original bdev_io
 * is passed in as an arg so we'll complete that one with the appropriate status
 * and then free the one that this module issued.
 */
static void
_delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);

	io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
		io_ctx->zcopy_bdev_io = bdev_io;
	} else {
		assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
		io_ctx->zcopy_bdev_io = NULL;
		spdk_bdev_free_io(bdev_io);
	}

	/* Put the I/O into the proper list for processing by the channel poller.
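	 * Each entry carries an absolute deadline: completion_tick is the current tick
	 * count plus the pre-computed latency in ticks, and the channel poller
	 * (_delay_finish_io()) completes the entry once spdk_get_ticks() reaches it.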
	 */
	switch (io_ctx->type) {
	case DELAY_AVG_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
		break;
	case DELAY_AVG_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
		break;
	case DELAY_P99_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
		break;
	case DELAY_P99_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
		break;
	case DELAY_NONE:
	default:
		spdk_bdev_io_complete(orig_io, io_ctx->status);
		break;
	}
}

static void
vbdev_delay_resubmit_io(void *arg)
{
	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;

	vbdev_delay_submit_request(io_ctx->ch, bdev_io);
}

static void
vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
{
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	int rc;

	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
	io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
	io_ctx->bdev_io_wait.cb_arg = bdev_io;

	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
	if (rc != 0) {
		SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
					 delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	rc = spdk_bdev_readv_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
				    bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
				    bdev_io->u.bdev.num_blocks, _delay_complete_io,
				    bdev_io);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
	int rc;

	rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
			     _delay_complete_io, bdev_io);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
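/* The helpers below tear down I/O still parked on the delay lists when a reset or
 * abort arrives. A delayed zcopy request still holds resources in the base bdev,
 * so it is first ended with commit=false via spdk_bdev_zcopy_end() before the
 * delayed I/O is completed with SPDK_BDEV_IO_STATUS_ABORTED.
 */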
static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
_abort_all_delayed_io(void *arg)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
		if (io_ctx->zcopy_bdev_io != NULL) {
			spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
		}
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
	}
}

static void
vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);

	_abort_all_delayed_io(&delay_ch->avg_read_io);
	_abort_all_delayed_io(&delay_ch->avg_write_io);
	_abort_all_delayed_io(&delay_ch->p99_read_io);
	_abort_all_delayed_io(&delay_ch->p99_write_io);

	spdk_for_each_channel_continue(i, 0);
}

static bool
abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
{
	STAILQ_HEAD(, delay_bdev_io) *head = _head;
	struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
	struct delay_bdev_io *io_ctx;

	STAILQ_FOREACH(io_ctx, head, link) {
		if (io_ctx == io_ctx_to_abort) {
			STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
			if (io_ctx->zcopy_bdev_io != NULL) {
				spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
			}
			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
			return true;
		}
	}

	return false;
}

static int
vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
		  struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;

	if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	}

	return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
			       _delay_complete_io, bdev_io);
}

static void
vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	int rc = 0;
	bool is_p99;

	is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false;

	io_ctx->ch = ch;
	io_ctx->type = DELAY_NONE;
	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
		io_ctx->zcopy_bdev_io = NULL;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
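	/* Reads first obtain data buffers via spdk_bdev_io_get_buf() above; writes
	 * already carry their payload, so they are forwarded to the base bdev directly.
	 */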
	case SPDK_BDEV_IO_TYPE_WRITE:
		io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		rc = spdk_bdev_writev_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
					     bdev_io->u.bdev.num_blocks, _delay_complete_io,
					     bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		/* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
		 * Hence we can simply abort all I/Os delayed to complete.
		 */
		spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
				      vbdev_delay_reset_dev);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.commit) {
			io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		} else if (bdev_io->u.bdev.zcopy.populate) {
			io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		}
		if (bdev_io->u.bdev.zcopy.start) {
			rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.zcopy.populate,
						   _delay_complete_io, bdev_io);
		} else {
			rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
						 _delay_complete_io, bdev_io);
		}
		break;
	default:
		SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
}

static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
	struct spdk_io_channel *delay_ch = NULL;

	delay_ch = spdk_get_io_channel(delay_node);

	return delay_ch;
}

static void
_delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
{
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
	spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
	spdk_json_write_named_int64(w, "avg_read_latency",
				    delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_read_latency",
				    delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "avg_write_latency",
				    delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_write_latency",
				    delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
}
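/* Illustrative output of the dump_info_json callback below for a node named
 * "Delay0" on base bdev "Malloc0" (latency values, in microseconds, are examples,
 * not defaults):
 *
 *   "delay": {
 *     "name": "Delay0",
 *     "base_bdev_name": "Malloc0",
 *     "avg_read_latency": 100,
 *     "p99_read_latency": 500,
 *     "avg_write_latency": 200,
 *     "p99_write_latency": 1000
 *   }
 */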
static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	spdk_json_write_name(w, "delay");
	spdk_json_write_object_begin(w);
	_delay_write_conf_values(delay_node, w);
	spdk_json_write_object_end(w);

	return 0;
}

/* This is used to generate JSON that can configure this module to its current state. */
static int
vbdev_delay_config_json(struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_delay_create");
		spdk_json_write_named_object_begin(w, "params");
		_delay_write_conf_values(delay_node, w);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
	return 0;
}

/* We provide this callback for the SPDK channel code to create a channel using
 * the channel struct we provided in our module get_io_channel() entry point. Here
 * we get and save off an underlying base channel of the device below us so that
 * we can communicate with the base bdev on a per channel basis. This vbdev does
 * need its own per-channel poller, so we register it here as well.
 */
static int
delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;
	struct vbdev_delay *delay_node = io_device;

	STAILQ_INIT(&delay_ch->avg_read_io);
	STAILQ_INIT(&delay_ch->p99_read_io);
	STAILQ_INIT(&delay_ch->avg_write_io);
	STAILQ_INIT(&delay_ch->p99_write_io);

	delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
	delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
	delay_ch->rand_seed = time(NULL);

	return 0;
}

/* We provide this callback for the SPDK channel code to destroy a channel
 * created with our create callback. We just need to undo anything we did
 * when we created it, which includes unregistering the per-channel poller.
 */
static void
delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;

	spdk_poller_unregister(&delay_ch->io_poller);
	spdk_put_io_channel(delay_ch->base_ch);
}
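/* Design note: each channel runs its own poller with a period of 0 microseconds
 * (i.e., it executes on every iteration of the owning SPDK thread) and keeps an
 * independent rand_seed, so the roughly 1-in-100 p99 selection made in
 * vbdev_delay_submit_request() is per-channel.
 */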
/* Create the delay association from the bdev and vbdev name and insert it
 * on the global list.
 */
static int
vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
			       uint64_t avg_read_latency, uint64_t p99_read_latency,
			       uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	struct bdev_association *assoc;

	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
			SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
			return -EEXIST;
		}
	}

	assoc = calloc(1, sizeof(struct bdev_association));
	if (!assoc) {
		SPDK_ERRLOG("could not allocate bdev_association\n");
		return -ENOMEM;
	}

	assoc->bdev_name = strdup(bdev_name);
	if (!assoc->bdev_name) {
		SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
		free(assoc);
		return -ENOMEM;
	}

	assoc->vbdev_name = strdup(vbdev_name);
	if (!assoc->vbdev_name) {
		SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
		free(assoc->bdev_name);
		free(assoc);
		return -ENOMEM;
	}

	assoc->avg_read_latency = avg_read_latency;
	assoc->p99_read_latency = p99_read_latency;
	assoc->avg_write_latency = avg_write_latency;
	assoc->p99_write_latency = p99_write_latency;

	TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);

	return 0;
}

int
vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
{
	struct vbdev_delay *delay_node;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
			break;
		}
	}

	if (delay_node == NULL) {
		return -ENODEV;
	}

	switch (type) {
	case DELAY_AVG_READ:
		delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_AVG_WRITE:
		delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_READ:
		delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_WRITE:
		delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int
vbdev_delay_init(void)
{
	/* Not allowing for .ini style configuration. */
	return 0;
}

static void
vbdev_delay_finish(void)
{
	struct bdev_association *assoc;

	while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
		TAILQ_REMOVE(&g_bdev_associations, assoc, link);
		free(assoc->bdev_name);
		free(assoc->vbdev_name);
		free(assoc);
	}
}

static int
vbdev_delay_get_ctx_size(void)
{
	return sizeof(struct delay_bdev_io);
}

static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed; vbdev_delay_config_json() above re-emits the
	 * bdev_delay_create calls at the module level.
	 */
}

static int
vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* Delay bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */
	return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
}
/* When we register our bdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
	.destruct = vbdev_delay_destruct,
	.submit_request = vbdev_delay_submit_request,
	.io_type_supported = vbdev_delay_io_type_supported,
	.get_io_channel = vbdev_delay_get_io_channel,
	.dump_info_json = vbdev_delay_dump_info_json,
	.write_config_json = vbdev_delay_write_config_json,
	.get_memory_domains = vbdev_delay_get_memory_domains,
};

static void
vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
{
	struct vbdev_delay *delay_node, *tmp;

	TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
		if (bdev_find == delay_node->base_bdev) {
			spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
		}
	}
}

/* Called when the underlying base bdev triggers an asynchronous event such as bdev removal. */
static void
vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
			       void *event_ctx)
{
	switch (type) {
	case SPDK_BDEV_EVENT_REMOVE:
		vbdev_delay_base_bdev_hotremove_cb(bdev);
		break;
	default:
		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
		break;
	}
}

/* Create and register the delay vbdev if we find it in our list of bdev names.
 * This can be called either by the examine path or the RPC method.
 */
static int
vbdev_delay_register(const char *bdev_name)
{
	struct bdev_association *assoc;
	struct vbdev_delay *delay_node;
	struct spdk_bdev *bdev;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the delay_node & bdev accordingly.
	 */
	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(assoc->bdev_name, bdev_name) != 0) {
			continue;
		}

		delay_node = calloc(1, sizeof(struct vbdev_delay));
		if (!delay_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_node\n");
			break;
		}
		delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
		if (!delay_node->delay_bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_bdev name\n");
			free(delay_node);
			break;
		}
		delay_node->delay_bdev.product_name = "delay";

		/* The base bdev that we're attaching to. */
		rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
					NULL, &delay_node->base_desc);
		if (rc) {
			if (rc != -ENODEV) {
				SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
			}
			free(delay_node->delay_bdev.name);
			free(delay_node);
			break;
		}

		bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
		delay_node->base_bdev = bdev;

		delay_node->delay_bdev.write_cache = bdev->write_cache;
		delay_node->delay_bdev.required_alignment = bdev->required_alignment;
		delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
		delay_node->delay_bdev.blocklen = bdev->blocklen;
		delay_node->delay_bdev.blockcnt = bdev->blockcnt;

		delay_node->delay_bdev.ctxt = delay_node;
		delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
		delay_node->delay_bdev.module = &delay_if;

		/* Store the number of ticks you need to add to get the I/O expiration time.
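		 * ticks_mhz is ticks per microsecond (spdk_get_ticks_hz() / SPDK_SEC_TO_USEC,
		 * where SPDK_SEC_TO_USEC is 1000000). As a worked example with an
		 * illustrative 2.4 GHz tick rate: 2,400,000,000 / 1,000,000 = 2,400 ticks/us,
		 * so a 100 us average read latency is stored as 240,000 ticks.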
		 */
		delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
		delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
		delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
		delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;

		spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
					sizeof(struct delay_io_channel),
					assoc->vbdev_name);

		/* Save the thread where the base device is opened */
		delay_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
			goto error_close;
		}

		rc = spdk_bdev_register(&delay_node->delay_bdev);
		if (rc) {
			SPDK_ERRLOG("could not register delay_bdev\n");
			spdk_bdev_module_release_bdev(delay_node->base_bdev);
			goto error_close;
		}

		TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
	}

	return rc;

error_close:
	spdk_bdev_close(delay_node->base_desc);
	spdk_io_device_unregister(delay_node, NULL);
	free(delay_node->delay_bdev.name);
	free(delay_node);
	return rc;
}

int
create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency,
		  uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	int rc = 0;

	if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
		SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
		return -EINVAL;
	}

	rc = vbdev_delay_insert_association(bdev_name, vbdev_name, avg_read_latency, p99_read_latency,
					    avg_write_latency, p99_write_latency);
	if (rc) {
		return rc;
	}

	rc = vbdev_delay_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error; we tracked the name above and it may still
		 * show up later.
		 */
		SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
		rc = 0;
	}

	return rc;
}

void
delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct bdev_association *assoc;
	int rc;

	rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
	if (rc == 0) {
		TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
			if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
				TAILQ_REMOVE(&g_bdev_associations, assoc, link);
				free(assoc->bdev_name);
				free(assoc->vbdev_name);
				free(assoc);
				break;
			}
		}
	} else {
		cb_fn(cb_arg, rc);
	}
}

static void
vbdev_delay_examine(struct spdk_bdev *bdev)
{
	vbdev_delay_register(bdev->name);

	spdk_bdev_module_examine_done(&delay_if);
}

SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)
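/*
 * A minimal usage sketch (assumed context: called on an SPDK app thread; bdev
 * names and microsecond latencies are illustrative, and unregister_done_cb is a
 * hypothetical completion callback supplied by the caller):
 *
 *   rc = create_delay_disk("Malloc0", "Delay0", 100, 500, 200, 1000);
 *   ...
 *   delete_delay_disk("Delay0", unregister_done_cb, NULL);
 *
 * If the base bdev does not exist yet, create_delay_disk() records the
 * association and vbdev_delay_examine() completes registration when the base
 * bdev appears.
 */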