/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "vbdev_delay.h"
#include "spdk/rpc.h"
#include "spdk/env.h"
#include "spdk/endian.h"
#include "spdk/string.h"
#include "spdk/thread.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"


static int vbdev_delay_init(void);
static int vbdev_delay_get_ctx_size(void);
static void vbdev_delay_examine(struct spdk_bdev *bdev);
static void vbdev_delay_finish(void);
static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);

static struct spdk_bdev_module delay_if = {
	.name = "delay",
	.module_init = vbdev_delay_init,
	.get_ctx_size = vbdev_delay_get_ctx_size,
	.examine_config = vbdev_delay_examine,
	.module_fini = vbdev_delay_finish,
	.config_json = vbdev_delay_config_json
};

SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)

/* Associative list to be used in examine. Latencies are stored here in
 * microseconds and converted to ticks when the vbdev is registered. */
struct bdev_association {
	char *vbdev_name;
	char *bdev_name;
	uint64_t avg_read_latency;
	uint64_t p99_read_latency;
	uint64_t avg_write_latency;
	uint64_t p99_write_latency;
	TAILQ_ENTRY(bdev_association) link;
};
static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
			g_bdev_associations);

/* List of virtual bdevs and associated info for each. */
struct vbdev_delay {
	struct spdk_bdev *base_bdev; /* the thing we're attaching to */
	struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
	struct spdk_bdev delay_bdev; /* the delay virtual bdev */
	uint64_t average_read_latency_ticks; /* the average read delay */
	uint64_t p99_read_latency_ticks; /* the p99 read delay */
	uint64_t average_write_latency_ticks; /* the average write delay */
	uint64_t p99_write_latency_ticks; /* the p99 write delay */
	TAILQ_ENTRY(vbdev_delay) link;
	struct spdk_thread *thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);

struct delay_bdev_io {
	int status;

	uint64_t completion_tick;

	enum delay_io_type type;

	struct spdk_io_channel *ch;

	struct spdk_bdev_io_wait_entry bdev_io_wait;

	struct spdk_bdev_io *zcopy_bdev_io;

	STAILQ_ENTRY(delay_bdev_io) link;
};

struct delay_io_channel {
	struct spdk_io_channel *base_ch; /* IO channel of base device */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;
	struct spdk_poller *io_poller;
	unsigned int rand_seed;
};
static void
vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);


/* Callback for unregistering the IO device. */
static void
_device_unregister_cb(void *io_device)
{
	struct vbdev_delay *delay_node = io_device;

	/* Done with this delay_node. */
	free(delay_node->delay_bdev.name);
	free(delay_node);
}

static void
_vbdev_delay_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}

static int
vbdev_delay_destruct(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* It is important to follow this exact sequence of steps for destroying
	 * a vbdev...
	 */

	TAILQ_REMOVE(&g_delay_nodes, delay_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(delay_node->base_bdev);

	/* Close the underlying bdev on its same opened thread. */
	if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
	} else {
		spdk_bdev_close(delay_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(delay_node, _device_unregister_cb);

	return 0;
}

static int
_process_io_stailq(void *arg, uint64_t ticks)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;
	int completions = 0;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		if (io_ctx->completion_tick <= ticks) {
			STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
			completions++;
		} else {
			/* In the general case, I/O will become ready in FIFO order. When timeouts are dynamically
			 * changed, this is not necessarily the case. However, the normal behavior will be restored
			 * after the outstanding I/O at the time of the change have been completed.
			 * This essentially means that moving from a high to a low latency creates a dam for the new I/O
			 * submitted after the latency change. This is considered desirable behavior for the use case where
			 * we are trying to trigger a pre-defined timeout on an initiator.
			 */
			break;
		}
	}

	return completions;
}
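/* Channel poller: sweep the four delay queues and complete anything whose
 * deadline has passed. Returning SPDK_POLLER_BUSY only when at least one I/O
 * completed lets the SPDK thread layer account the poll as useful work rather
 * than an idle spin.
 */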
static int
_delay_finish_io(void *arg)
{
	struct delay_io_channel *delay_ch = arg;
	uint64_t ticks = spdk_get_ticks();
	int completions = 0;

	completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);

	return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

/* Completion callback for I/O that were issued from this bdev. The original bdev_io
 * is passed in as an arg so we'll complete that one with the appropriate status
 * and then free the one that this module issued.
 */
static void
_delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);

	io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
		io_ctx->zcopy_bdev_io = bdev_io;
	} else {
		assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
		io_ctx->zcopy_bdev_io = NULL;
		spdk_bdev_free_io(bdev_io);
	}

	/* Put the I/O into the proper list for processing by the channel poller. */
	switch (io_ctx->type) {
	case DELAY_AVG_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
		break;
	case DELAY_AVG_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
		break;
	case DELAY_P99_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
		break;
	case DELAY_P99_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
		break;
	case DELAY_NONE:
	default:
		spdk_bdev_io_complete(orig_io, io_ctx->status);
		break;
	}
}

static void
vbdev_delay_resubmit_io(void *arg)
{
	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;

	vbdev_delay_submit_request(io_ctx->ch, bdev_io);
}

/* The base bdev ran out of request resources (-ENOMEM); register a wait entry
 * so the I/O is resubmitted once the base bdev frees one up. */
static void
vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
{
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	int rc;

	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
	io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
	io_ctx->bdev_io_wait.cb_arg = bdev_io;

	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
	if (rc != 0) {
		SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
					 delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	rc = spdk_bdev_readv_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
				    bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
				    bdev_io->u.bdev.num_blocks, _delay_complete_io,
				    bdev_io);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
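/* Final step of a reset: every channel has already flushed its delayed I/O
 * (see vbdev_delay_reset_channel below), so now pass the reset down to the
 * base bdev.
 */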
static void
vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
	int rc;

	rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
			     _delay_complete_io, bdev_io);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
_abort_all_delayed_io(void *arg)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
		if (io_ctx->zcopy_bdev_io != NULL) {
			spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
		}
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
	}
}

static void
vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);

	_abort_all_delayed_io(&delay_ch->avg_read_io);
	_abort_all_delayed_io(&delay_ch->avg_write_io);
	_abort_all_delayed_io(&delay_ch->p99_read_io);
	_abort_all_delayed_io(&delay_ch->p99_write_io);

	spdk_for_each_channel_continue(i, 0);
}

static bool
abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
{
	STAILQ_HEAD(, delay_bdev_io) *head = _head;
	struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
	struct delay_bdev_io *io_ctx;

	STAILQ_FOREACH(io_ctx, head, link) {
		if (io_ctx == io_ctx_to_abort) {
			STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
			if (io_ctx->zcopy_bdev_io != NULL) {
				spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
			}
			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
			return true;
		}
	}

	return false;
}

static int
vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
		  struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;

	/* If the I/O to abort is still parked on one of this channel's delay
	 * queues, abort it locally; otherwise it is already outstanding on the
	 * base bdev, so forward the abort down. */
	if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	}

	return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
			       _delay_complete_io, bdev_io);
}
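/* Submit path. Roughly 1 in 100 I/Os is tagged as a p99 I/O: rand_r() reduced
 * modulo 100 hits 0 with ~1% probability, so that I/O receives the (longer)
 * p99 latency while the rest receive the average latency.
 */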
static void
vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	int rc = 0;
	bool is_p99;

	is_p99 = (rand_r(&delay_ch->rand_seed) % 100) == 0;

	io_ctx->ch = ch;
	io_ctx->type = DELAY_NONE;
	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
		io_ctx->zcopy_bdev_io = NULL;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		rc = spdk_bdev_writev_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
					     bdev_io->u.bdev.num_blocks, _delay_complete_io,
					     bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		/* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
		 * Hence we can simply abort all I/Os delayed to complete.
		 */
		spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
				      vbdev_delay_reset_dev);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
		break;
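	/* Zcopy is a two-phase operation: the start phase populates (read) or
	 * reserves (write) a buffer owned by the base bdev, and the end phase
	 * commits it. _delay_complete_io() stashes the bdev_io returned by the
	 * start phase in io_ctx->zcopy_bdev_io so the end phase can find it. */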
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.commit) {
			io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		} else if (bdev_io->u.bdev.zcopy.populate) {
			io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		}
		if (bdev_io->u.bdev.zcopy.start) {
			rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.zcopy.populate,
						   _delay_complete_io, bdev_io);
		} else {
			rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
						 _delay_complete_io, bdev_io);
		}
		break;
	default:
		SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
}

static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
	struct spdk_io_channel *delay_ch = NULL;

	delay_ch = spdk_get_io_channel(delay_node);

	return delay_ch;
}

static void
_delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
{
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
	spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
	spdk_json_write_named_int64(w, "avg_read_latency",
				    delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_read_latency",
				    delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "avg_write_latency",
				    delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_write_latency",
				    delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
}

static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	spdk_json_write_name(w, "delay");
	spdk_json_write_object_begin(w);
	_delay_write_conf_values(delay_node, w);
	spdk_json_write_object_end(w);

	return 0;
}

/* This is used to generate JSON that can configure this module to its current state. */
static int
vbdev_delay_config_json(struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_delay_create");
		spdk_json_write_named_object_begin(w, "params");
		_delay_write_conf_values(delay_node, w);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
	return 0;
}
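/* For illustration, one emitted object looks roughly like the following. The
 * key names come from _delay_write_conf_values() above; the bdev names and
 * latency values (reported in microseconds) are made up:
 *
 * {
 *   "method": "bdev_delay_create",
 *   "params": {
 *     "name": "Delay0",
 *     "base_bdev_name": "Null0",
 *     "avg_read_latency": 100,
 *     "p99_read_latency": 1000,
 *     "avg_write_latency": 50,
 *     "p99_write_latency": 500
 *   }
 * }
 */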
/* We provide this callback for the SPDK channel code to create a channel using
 * the channel struct we provided in our module get_io_channel() entry point. Here
 * we get and save off an underlying base channel of the device below us so that
 * we can communicate with the base bdev on a per channel basis. This is also
 * where we register the per-channel poller that completes delayed I/O.
 */
static int
delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;
	struct vbdev_delay *delay_node = io_device;

	STAILQ_INIT(&delay_ch->avg_read_io);
	STAILQ_INIT(&delay_ch->p99_read_io);
	STAILQ_INIT(&delay_ch->avg_write_io);
	STAILQ_INIT(&delay_ch->p99_write_io);

	delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
	delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
	delay_ch->rand_seed = time(NULL);

	return 0;
}

/* We provide this callback for the SPDK channel code to destroy a channel
 * created with our create callback. We just need to undo anything we did
 * when we created, i.e. unregister the poller and release the base channel.
 */
static void
delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;

	spdk_poller_unregister(&delay_ch->io_poller);
	spdk_put_io_channel(delay_ch->base_ch);
}

/* Create the delay association from the bdev and vbdev name and insert
 * on the global list. */
static int
vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
			       uint64_t avg_read_latency, uint64_t p99_read_latency,
			       uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	struct bdev_association *assoc;

	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
			SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
			return -EEXIST;
		}
	}

	assoc = calloc(1, sizeof(struct bdev_association));
	if (!assoc) {
		SPDK_ERRLOG("could not allocate bdev_association\n");
		return -ENOMEM;
	}

	assoc->bdev_name = strdup(bdev_name);
	if (!assoc->bdev_name) {
		SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
		free(assoc);
		return -ENOMEM;
	}

	assoc->vbdev_name = strdup(vbdev_name);
	if (!assoc->vbdev_name) {
		SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
		free(assoc->bdev_name);
		free(assoc);
		return -ENOMEM;
	}

	assoc->avg_read_latency = avg_read_latency;
	assoc->p99_read_latency = p99_read_latency;
	assoc->avg_write_latency = avg_write_latency;
	assoc->p99_write_latency = p99_write_latency;

	TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);

	return 0;
}
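/* Latency values arrive in microseconds and are stored internally as TSC
 * ticks: ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC is the number of
 * ticks per microsecond. As a worked example (illustrative numbers), with a
 * 2.3 GHz tick rate, ticks_mhz = 2300, so a 100 us average read latency is
 * stored as 230,000 ticks.
 */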
int
vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
{
	struct vbdev_delay *delay_node;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
			break;
		}
	}

	if (delay_node == NULL) {
		return -ENODEV;
	}

	switch (type) {
	case DELAY_AVG_READ:
		delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_AVG_WRITE:
		delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_READ:
		delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_WRITE:
		delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int
vbdev_delay_init(void)
{
	/* Not allowing for .ini style configuration. */
	return 0;
}

static void
vbdev_delay_finish(void)
{
	struct bdev_association *assoc;

	while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
		TAILQ_REMOVE(&g_bdev_associations, assoc, link);
		free(assoc->bdev_name);
		free(assoc->vbdev_name);
		free(assoc);
	}
}

static int
vbdev_delay_get_ctx_size(void)
{
	return sizeof(struct delay_bdev_io);
}

static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static int
vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* The delay bdev never touches the data buffers itself, so it supports
	 * any memory domain the base bdev supports. */
	return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
}

/* When we register our bdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
	.destruct = vbdev_delay_destruct,
	.submit_request = vbdev_delay_submit_request,
	.io_type_supported = vbdev_delay_io_type_supported,
	.get_io_channel = vbdev_delay_get_io_channel,
	.dump_info_json = vbdev_delay_dump_info_json,
	.write_config_json = vbdev_delay_write_config_json,
	.get_memory_domains = vbdev_delay_get_memory_domains,
};

static void
vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
{
	struct vbdev_delay *delay_node, *tmp;

	TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
		if (bdev_find == delay_node->base_bdev) {
			spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
		}
	}
}

/* Called when the underlying base bdev triggers asynchronous event such as bdev removal. */
static void
vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
			       void *event_ctx)
{
	switch (type) {
	case SPDK_BDEV_EVENT_REMOVE:
		vbdev_delay_base_bdev_hotremove_cb(bdev);
		break;
	default:
		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
		break;
	}
}
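/* Registration below happens in four steps, and the error paths unwind them
 * in reverse: open the base bdev, register the io_device, claim the base
 * bdev for this module, then register the virtual bdev itself.
 */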
/* Create and register the delay vbdev if we find it in our list of bdev names.
 * This can be called either by the examine path or RPC method.
 */
static int
vbdev_delay_register(const char *bdev_name)
{
	struct bdev_association *assoc;
	struct vbdev_delay *delay_node;
	struct spdk_bdev *bdev;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the delay_node & bdev accordingly.
	 */
	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(assoc->bdev_name, bdev_name) != 0) {
			continue;
		}

		delay_node = calloc(1, sizeof(struct vbdev_delay));
		if (!delay_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_node\n");
			break;
		}
		delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
		if (!delay_node->delay_bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_bdev name\n");
			free(delay_node);
			break;
		}
		delay_node->delay_bdev.product_name = "delay";

		/* The base bdev that we're attaching to. */
		rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
					NULL, &delay_node->base_desc);
		if (rc) {
			if (rc != -ENODEV) {
				SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
			}
			free(delay_node->delay_bdev.name);
			free(delay_node);
			break;
		}

		bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
		delay_node->base_bdev = bdev;

		/* Copy the geometry and I/O properties of the base bdev. */
		delay_node->delay_bdev.write_cache = bdev->write_cache;
		delay_node->delay_bdev.required_alignment = bdev->required_alignment;
		delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
		delay_node->delay_bdev.blocklen = bdev->blocklen;
		delay_node->delay_bdev.blockcnt = bdev->blockcnt;

		delay_node->delay_bdev.ctxt = delay_node;
		delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
		delay_node->delay_bdev.module = &delay_if;

		/* Store the number of ticks you need to add to get the I/O expiration time. */
		delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
		delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
		delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
		delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;

		spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
					sizeof(struct delay_io_channel),
					assoc->vbdev_name);

		/* Save the thread where the base device is opened */
		delay_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
			goto error_close;
		}

		rc = spdk_bdev_register(&delay_node->delay_bdev);
		if (rc) {
			SPDK_ERRLOG("could not register delay_bdev\n");
			spdk_bdev_module_release_bdev(delay_node->base_bdev);
			goto error_close;
		}

		TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
	}

	return rc;

error_close:
	spdk_bdev_close(delay_node->base_desc);
	spdk_io_device_unregister(delay_node, NULL);
	free(delay_node->delay_bdev.name);
	free(delay_node);
	return rc;
}
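/* Example usage (hypothetical bdev names and latency values, in
 * microseconds): create a delay bdev "Delay0" on top of "Null0" with
 * 100/1000 us avg/p99 read latency and 50/500 us avg/p99 write latency. If
 * "Null0" does not exist yet, the association is kept and the vbdev appears
 * once the base bdev arrives and is examined:
 *
 *   rc = create_delay_disk("Null0", "Delay0", 100, 1000, 50, 500);
 */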
int
create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency,
		  uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	int rc = 0;

	if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
		SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
		return -EINVAL;
	}

	rc = vbdev_delay_insert_association(bdev_name, vbdev_name, avg_read_latency, p99_read_latency,
					    avg_write_latency, p99_write_latency);
	if (rc) {
		return rc;
	}

	rc = vbdev_delay_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error; we saved the association above, so the base
		 * bdev may still show up later.
		 */
		SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
		rc = 0;
	}

	return rc;
}

void
delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct bdev_association *assoc;
	int rc;

	rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
	if (rc == 0) {
		TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
			if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
				TAILQ_REMOVE(&g_bdev_associations, assoc, link);
				free(assoc->bdev_name);
				free(assoc->vbdev_name);
				free(assoc);
				break;
			}
		}
	} else {
		cb_fn(cb_arg, rc);
	}
}

static void
vbdev_delay_examine(struct spdk_bdev *bdev)
{
	vbdev_delay_register(bdev->name);

	spdk_bdev_module_examine_done(&delay_if);
}

SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)