xref: /spdk/module/bdev/delay/vbdev_delay.c (revision 12fbe739a31b09aff0d05f354d4f3bbef99afc55)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "vbdev_delay.h"
10 #include "spdk/rpc.h"
11 #include "spdk/env.h"
12 #include "spdk/endian.h"
13 #include "spdk/string.h"
14 #include "spdk/thread.h"
15 #include "spdk/util.h"
16 
17 #include "spdk/bdev_module.h"
18 #include "spdk/log.h"
19 
20 /* This namespace UUID was generated using uuid_generate() method. */
21 #define BDEV_DELAY_NAMESPACE_UUID "4009b574-6430-4f1b-bc40-ace811091027"
22 
23 static int vbdev_delay_init(void);
24 static int vbdev_delay_get_ctx_size(void);
25 static void vbdev_delay_examine(struct spdk_bdev *bdev);
26 static void vbdev_delay_finish(void);
27 static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);
28 
29 static struct spdk_bdev_module delay_if = {
30 	.name = "delay",
31 	.module_init = vbdev_delay_init,
32 	.get_ctx_size = vbdev_delay_get_ctx_size,
33 	.examine_config = vbdev_delay_examine,
34 	.module_fini = vbdev_delay_finish,
35 	.config_json = vbdev_delay_config_json
36 };
37 
38 SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)
39 
40 /* Associative list to be used in examine */
41 struct bdev_association {
42 	char			*vbdev_name;
43 	char			*bdev_name;
44 	struct spdk_uuid	uuid;
45 	uint64_t		avg_read_latency;
46 	uint64_t		p99_read_latency;
47 	uint64_t		avg_write_latency;
48 	uint64_t		p99_write_latency;
49 	TAILQ_ENTRY(bdev_association)	link;
50 };
51 static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
52 			g_bdev_associations);
53 
54 /* List of virtual bdevs and associated info for each. */
55 struct vbdev_delay {
56 	struct spdk_bdev		*base_bdev; /* the thing we're attaching to */
57 	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
58 	struct spdk_bdev		delay_bdev;    /* the delay virtual bdev */
59 	uint64_t			average_read_latency_ticks; /* the average read delay */
60 	uint64_t			p99_read_latency_ticks; /* the p99 read delay */
61 	uint64_t			average_write_latency_ticks; /* the average write delay */
62 	uint64_t			p99_write_latency_ticks; /* the p99 write delay */
63 	TAILQ_ENTRY(vbdev_delay)	link;
64 	struct spdk_thread		*thread;    /* thread where base device is opened */
65 };
66 static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);
67 
68 struct delay_bdev_io {
69 	int status;
70 
71 	uint64_t completion_tick;
72 
73 	enum delay_io_type type;
74 
75 	struct spdk_io_channel *ch;
76 
77 	struct spdk_bdev_io_wait_entry bdev_io_wait;
78 
79 	struct spdk_bdev_io *zcopy_bdev_io;
80 
81 	STAILQ_ENTRY(delay_bdev_io) link;
82 };
83 
/* Per-channel context of a delay vbdev. */
struct delay_io_channel {
	/* I/O channel of the base bdev, used to forward requests. */
	struct spdk_io_channel	*base_ch;
	/* I/O waiting for their delay to expire, one queue per delay class. */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;
	/* Poller that completes I/O whose completion_tick has been reached. */
	struct spdk_poller *io_poller;
	/* rand_r() state used to pick between average and p99 latency. */
	unsigned int rand_seed;
};
93 
94 static void vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
95 
96 
97 /* Callback for unregistering the IO device. */
98 static void
99 _device_unregister_cb(void *io_device)
100 {
101 	struct vbdev_delay *delay_node  = io_device;
102 
103 	/* Done with this delay_node. */
104 	free(delay_node->delay_bdev.name);
105 	free(delay_node);
106 }
107 
/* Thread message handler used by vbdev_delay_destruct(): closes the base bdev
 * descriptor passed in ctx.
 */
static void
_vbdev_delay_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
115 
116 static int
117 vbdev_delay_destruct(void *ctx)
118 {
119 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
120 
121 	/* It is important to follow this exact sequence of steps for destroying
122 	 * a vbdev...
123 	 */
124 
125 	TAILQ_REMOVE(&g_delay_nodes, delay_node, link);
126 
127 	/* Unclaim the underlying bdev. */
128 	spdk_bdev_module_release_bdev(delay_node->base_bdev);
129 
130 	/* Close the underlying bdev on its same opened thread. */
131 	if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
132 		spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
133 	} else {
134 		spdk_bdev_close(delay_node->base_desc);
135 	}
136 
137 	/* Unregister the io_device. */
138 	spdk_io_device_unregister(delay_node, _device_unregister_cb);
139 
140 	return 0;
141 }
142 
143 static int
144 _process_io_stailq(void *arg, uint64_t ticks)
145 {
146 	STAILQ_HEAD(, delay_bdev_io) *head = arg;
147 	struct delay_bdev_io *io_ctx, *tmp;
148 	int completions = 0;
149 
150 	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
151 		if (io_ctx->completion_tick <= ticks) {
152 			STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
153 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
154 			completions++;
155 		} else {
156 			/* In the general case, I/O will become ready in an fifo order. When timeouts are dynamically
157 			 * changed, this is not necessarily the case. However, the normal behavior will be restored
158 			 * after the outstanding I/O at the time of the change have been completed.
159 			 * This essentially means that moving from a high to low latency creates a dam for the new I/O
160 			 * submitted after the latency change. This is considered desirable behavior for the use case where
161 			 * we are trying to trigger a pre-defined timeout on an initiator.
162 			 */
163 			break;
164 		}
165 	}
166 
167 	return completions;
168 }
169 
170 static int
171 _delay_finish_io(void *arg)
172 {
173 	struct delay_io_channel *delay_ch = arg;
174 	uint64_t ticks = spdk_get_ticks();
175 	int completions = 0;
176 
177 	completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
178 	completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
179 	completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
180 	completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);
181 
182 	return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
183 }
184 
185 /* Completion callback for IO that were issued from this bdev. The original bdev_io
186  * is passed in as an arg so we'll complete that one with the appropriate status
187  * and then free the one that this module issued.
188  */
189 static void
190 _delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
191 {
192 	struct spdk_bdev_io *orig_io = cb_arg;
193 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
194 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
195 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
196 
197 	io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
198 
199 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
200 		io_ctx->zcopy_bdev_io = bdev_io;
201 	} else {
202 		assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
203 		io_ctx->zcopy_bdev_io = NULL;
204 		spdk_bdev_free_io(bdev_io);
205 	}
206 
207 	/* Put the I/O into the proper list for processing by the channel poller. */
208 	switch (io_ctx->type) {
209 	case DELAY_AVG_READ:
210 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
211 		STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
212 		break;
213 	case DELAY_AVG_WRITE:
214 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
215 		STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
216 		break;
217 	case DELAY_P99_READ:
218 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
219 		STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
220 		break;
221 	case DELAY_P99_WRITE:
222 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
223 		STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
224 		break;
225 	case DELAY_NONE:
226 	default:
227 		spdk_bdev_io_complete(orig_io, io_ctx->status);
228 		break;
229 	}
230 }
231 
232 static void
233 vbdev_delay_resubmit_io(void *arg)
234 {
235 	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
236 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
237 
238 	vbdev_delay_submit_request(io_ctx->ch, bdev_io);
239 }
240 
241 static void
242 vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
243 {
244 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
245 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
246 	int rc;
247 
248 	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
249 	io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
250 	io_ctx->bdev_io_wait.cb_arg = bdev_io;
251 
252 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
253 	if (rc != 0) {
254 		SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
255 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
256 	}
257 }
258 
259 static void
260 delay_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
261 {
262 	memset(opts, 0, sizeof(*opts));
263 	opts->size = sizeof(*opts);
264 	opts->memory_domain = bdev_io->u.bdev.memory_domain;
265 	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
266 	opts->metadata = bdev_io->u.bdev.md_buf;
267 }
268 
269 static void
270 delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
271 {
272 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
273 					 delay_bdev);
274 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
275 	struct spdk_bdev_ext_io_opts io_opts;
276 	int rc;
277 
278 	if (!success) {
279 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
280 		return;
281 	}
282 
283 	delay_init_ext_io_opts(bdev_io, &io_opts);
284 	rc = spdk_bdev_readv_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
285 					bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
286 					bdev_io->u.bdev.num_blocks, _delay_complete_io,
287 					bdev_io, &io_opts);
288 
289 	if (rc == -ENOMEM) {
290 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
291 		vbdev_delay_queue_io(bdev_io);
292 	} else if (rc != 0) {
293 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
294 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
295 	}
296 }
297 
298 static void
299 vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
300 {
301 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
302 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
303 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
304 	struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
305 	int rc;
306 
307 	rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
308 			     _delay_complete_io, bdev_io);
309 
310 	if (rc == -ENOMEM) {
311 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
312 		vbdev_delay_queue_io(bdev_io);
313 	} else if (rc != 0) {
314 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
315 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
316 	}
317 }
318 
/* Completion callback for the zcopy end issued while aborting a delayed I/O:
 * only the base bdev_io needs to be released; the outcome is not inspected.
 */
static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}
324 
325 static void
326 _abort_all_delayed_io(void *arg)
327 {
328 	STAILQ_HEAD(, delay_bdev_io) *head = arg;
329 	struct delay_bdev_io *io_ctx, *tmp;
330 
331 	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
332 		STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
333 		if (io_ctx->zcopy_bdev_io != NULL) {
334 			spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
335 		}
336 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
337 	}
338 }
339 
340 static void
341 vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
342 {
343 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
344 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
345 
346 	_abort_all_delayed_io(&delay_ch->avg_read_io);
347 	_abort_all_delayed_io(&delay_ch->avg_write_io);
348 	_abort_all_delayed_io(&delay_ch->p99_read_io);
349 	_abort_all_delayed_io(&delay_ch->p99_write_io);
350 
351 	spdk_for_each_channel_continue(i, 0);
352 }
353 
354 static bool
355 abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
356 {
357 	STAILQ_HEAD(, delay_bdev_io) *head = _head;
358 	struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
359 	struct delay_bdev_io *io_ctx;
360 
361 	STAILQ_FOREACH(io_ctx, head, link) {
362 		if (io_ctx == io_ctx_to_abort) {
363 			STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
364 			if (io_ctx->zcopy_bdev_io != NULL) {
365 				spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
366 			}
367 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
368 			return true;
369 		}
370 	}
371 
372 	return false;
373 }
374 
375 static int
376 vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
377 		  struct spdk_bdev_io *bdev_io)
378 {
379 	struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
380 
381 	if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
382 	    abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
383 	    abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
384 	    abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
385 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
386 		return 0;
387 	}
388 
389 	return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
390 			       _delay_complete_io, bdev_io);
391 }
392 
393 static void
394 vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
395 {
396 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
397 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
398 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
399 	struct spdk_bdev_ext_io_opts io_opts;
400 	int rc = 0;
401 	bool is_p99;
402 
403 	is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false;
404 
405 	io_ctx->ch = ch;
406 	io_ctx->type = DELAY_NONE;
407 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
408 		io_ctx->zcopy_bdev_io = NULL;
409 	}
410 
411 	switch (bdev_io->type) {
412 	case SPDK_BDEV_IO_TYPE_READ:
413 		io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
414 		spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
415 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
416 		break;
417 	case SPDK_BDEV_IO_TYPE_WRITE:
418 		io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
419 		delay_init_ext_io_opts(bdev_io, &io_opts);
420 		rc = spdk_bdev_writev_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
421 						 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
422 						 bdev_io->u.bdev.num_blocks, _delay_complete_io,
423 						 bdev_io, &io_opts);
424 		break;
425 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
426 		rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
427 						   bdev_io->u.bdev.offset_blocks,
428 						   bdev_io->u.bdev.num_blocks,
429 						   _delay_complete_io, bdev_io);
430 		break;
431 	case SPDK_BDEV_IO_TYPE_UNMAP:
432 		rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
433 					    bdev_io->u.bdev.offset_blocks,
434 					    bdev_io->u.bdev.num_blocks,
435 					    _delay_complete_io, bdev_io);
436 		break;
437 	case SPDK_BDEV_IO_TYPE_FLUSH:
438 		rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
439 					    bdev_io->u.bdev.offset_blocks,
440 					    bdev_io->u.bdev.num_blocks,
441 					    _delay_complete_io, bdev_io);
442 		break;
443 	case SPDK_BDEV_IO_TYPE_RESET:
444 		/* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
445 		 * Hence we can simply abort all I/Os delayed to complete.
446 		 */
447 		spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
448 				      vbdev_delay_reset_dev);
449 		break;
450 	case SPDK_BDEV_IO_TYPE_ABORT:
451 		rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
452 		break;
453 	case SPDK_BDEV_IO_TYPE_ZCOPY:
454 		if (bdev_io->u.bdev.zcopy.commit) {
455 			io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
456 		} else if (bdev_io->u.bdev.zcopy.populate) {
457 			io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
458 		}
459 		if (bdev_io->u.bdev.zcopy.start) {
460 			rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
461 						   bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
462 						   bdev_io->u.bdev.offset_blocks,
463 						   bdev_io->u.bdev.num_blocks,
464 						   bdev_io->u.bdev.zcopy.populate,
465 						   _delay_complete_io, bdev_io);
466 		} else {
467 			rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
468 						 _delay_complete_io, bdev_io);
469 		}
470 		break;
471 	default:
472 		SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
473 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
474 		return;
475 	}
476 
477 	if (rc == -ENOMEM) {
478 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
479 		vbdev_delay_queue_io(bdev_io);
480 	} else if (rc != 0) {
481 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
482 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
483 	}
484 }
485 
486 static bool
487 vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
488 {
489 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
490 
491 	return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
492 }
493 
/* get_io_channel entry point: returns an I/O channel for the io_device that is
 * the struct vbdev_delay passed in ctx.
 */
static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *node = ctx;

	return spdk_get_io_channel(node);
}
504 
505 static void
506 _delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
507 {
508 	struct spdk_uuid *uuid = &delay_node->delay_bdev.uuid;
509 	char uuid_str[SPDK_UUID_STRING_LEN];
510 
511 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
512 	spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
513 	if (!spdk_uuid_is_null(uuid)) {
514 		spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), uuid);
515 		spdk_json_write_named_string(w, "uuid", uuid_str);
516 	}
517 	spdk_json_write_named_int64(w, "avg_read_latency",
518 				    delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
519 	spdk_json_write_named_int64(w, "p99_read_latency",
520 				    delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
521 	spdk_json_write_named_int64(w, "avg_write_latency",
522 				    delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
523 	spdk_json_write_named_int64(w, "p99_write_latency",
524 				    delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
525 }
526 
/* dump_info_json entry point: emits a "delay" object holding this vbdev's
 * configuration values. ctx is the struct vbdev_delay. Always returns 0.
 */
static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *node = ctx;

	spdk_json_write_named_object_begin(w, "delay");
	_delay_write_conf_values(node, w);
	spdk_json_write_object_end(w);

	return 0;
}
539 
540 /* This is used to generate JSON that can configure this module to its current state. */
541 static int
542 vbdev_delay_config_json(struct spdk_json_write_ctx *w)
543 {
544 	struct vbdev_delay *delay_node;
545 
546 	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
547 		spdk_json_write_object_begin(w);
548 		spdk_json_write_named_string(w, "method", "bdev_delay_create");
549 		spdk_json_write_named_object_begin(w, "params");
550 		_delay_write_conf_values(delay_node, w);
551 		spdk_json_write_object_end(w);
552 		spdk_json_write_object_end(w);
553 	}
554 	return 0;
555 }
556 
557 /* We provide this callback for the SPDK channel code to create a channel using
558  * the channel struct we provided in our module get_io_channel() entry point. Here
559  * we get and save off an underlying base channel of the device below us so that
560  * we can communicate with the base bdev on a per channel basis.  If we needed
561  * our own poller for this vbdev, we'd register it here.
562  */
563 static int
564 delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
565 {
566 	struct delay_io_channel *delay_ch = ctx_buf;
567 	struct vbdev_delay *delay_node = io_device;
568 
569 	STAILQ_INIT(&delay_ch->avg_read_io);
570 	STAILQ_INIT(&delay_ch->p99_read_io);
571 	STAILQ_INIT(&delay_ch->avg_write_io);
572 	STAILQ_INIT(&delay_ch->p99_write_io);
573 
574 	delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
575 	delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
576 	delay_ch->rand_seed = time(NULL);
577 
578 	return 0;
579 }
580 
581 /* We provide this callback for the SPDK channel code to destroy a channel
582  * created with our create callback. We just need to undo anything we did
583  * when we created. If this bdev used its own poller, we'd unregister it here.
584  */
585 static void
586 delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
587 {
588 	struct delay_io_channel *delay_ch = ctx_buf;
589 
590 	spdk_poller_unregister(&delay_ch->io_poller);
591 	spdk_put_io_channel(delay_ch->base_ch);
592 }
593 
594 /* Create the delay association from the bdev and vbdev name and insert
595  * on the global list. */
596 static int
597 vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
598 			       struct spdk_uuid *uuid,
599 			       uint64_t avg_read_latency, uint64_t p99_read_latency,
600 			       uint64_t avg_write_latency, uint64_t p99_write_latency)
601 {
602 	struct bdev_association *assoc;
603 
604 	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
605 		if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
606 			SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
607 			return -EEXIST;
608 		}
609 	}
610 
611 	assoc = calloc(1, sizeof(struct bdev_association));
612 	if (!assoc) {
613 		SPDK_ERRLOG("could not allocate bdev_association\n");
614 		return -ENOMEM;
615 	}
616 
617 	assoc->bdev_name = strdup(bdev_name);
618 	if (!assoc->bdev_name) {
619 		SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
620 		free(assoc);
621 		return -ENOMEM;
622 	}
623 
624 	assoc->vbdev_name = strdup(vbdev_name);
625 	if (!assoc->vbdev_name) {
626 		SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
627 		free(assoc->bdev_name);
628 		free(assoc);
629 		return -ENOMEM;
630 	}
631 
632 	assoc->avg_read_latency = avg_read_latency;
633 	assoc->p99_read_latency = p99_read_latency;
634 	assoc->avg_write_latency = avg_write_latency;
635 	assoc->p99_write_latency = p99_write_latency;
636 	spdk_uuid_copy(&assoc->uuid, uuid);
637 
638 	TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);
639 
640 	return 0;
641 }
642 
643 int
644 vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
645 {
646 	struct vbdev_delay *delay_node;
647 	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
648 
649 	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
650 		if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
651 			break;
652 		}
653 	}
654 
655 	if (delay_node == NULL) {
656 		return -ENODEV;
657 	}
658 
659 	switch (type) {
660 	case DELAY_AVG_READ:
661 		delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
662 		break;
663 	case DELAY_AVG_WRITE:
664 		delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
665 		break;
666 	case DELAY_P99_READ:
667 		delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
668 		break;
669 	case DELAY_P99_WRITE:
670 		delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
671 		break;
672 	default:
673 		return -EINVAL;
674 	}
675 
676 	return 0;
677 }
678 
/* Module init hook. There is nothing to set up (.ini style configuration is
 * not supported), so this always succeeds with 0.
 */
static int
vbdev_delay_init(void)
{
	const int rc = 0;

	return rc;
}
685 
686 static void
687 vbdev_delay_finish(void)
688 {
689 	struct bdev_association *assoc;
690 
691 	while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
692 		TAILQ_REMOVE(&g_bdev_associations, assoc, link);
693 		free(assoc->bdev_name);
694 		free(assoc->vbdev_name);
695 		free(assoc);
696 	}
697 }
698 
699 static int
700 vbdev_delay_get_ctx_size(void)
701 {
702 	return sizeof(struct delay_bdev_io);
703 }
704 
/* Per-bdev write_config_json entry point. Intentionally empty: the
 * configuration is emitted at module level by vbdev_delay_config_json().
 */
static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* Nothing to write for an individual bdev. */
}
710 
711 static int
712 vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
713 {
714 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
715 
716 	/* Delay bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */
717 	return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
718 }
719 
720 /* When we register our bdev this is how we specify our entry points. */
721 static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
722 	.destruct		= vbdev_delay_destruct,
723 	.submit_request		= vbdev_delay_submit_request,
724 	.io_type_supported	= vbdev_delay_io_type_supported,
725 	.get_io_channel		= vbdev_delay_get_io_channel,
726 	.dump_info_json		= vbdev_delay_dump_info_json,
727 	.write_config_json	= vbdev_delay_write_config_json,
728 	.get_memory_domains	= vbdev_delay_get_memory_domains,
729 };
730 
731 static void
732 vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
733 {
734 	struct vbdev_delay *delay_node, *tmp;
735 
736 	TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
737 		if (bdev_find == delay_node->base_bdev) {
738 			spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
739 		}
740 	}
741 }
742 
743 /* Called when the underlying base bdev triggers asynchronous event such as bdev removal. */
744 static void
745 vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
746 			       void *event_ctx)
747 {
748 	switch (type) {
749 	case SPDK_BDEV_EVENT_REMOVE:
750 		vbdev_delay_base_bdev_hotremove_cb(bdev);
751 		break;
752 	default:
753 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
754 		break;
755 	}
756 }
757 
758 /* Create and register the delay vbdev if we find it in our list of bdev names.
759  * This can be called either by the examine path or RPC method.
760  */
761 static int
762 vbdev_delay_register(const char *bdev_name)
763 {
764 	struct bdev_association *assoc;
765 	struct vbdev_delay *delay_node;
766 	struct spdk_bdev *bdev;
767 	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
768 	struct spdk_uuid ns_uuid;
769 	int rc = 0;
770 
771 	spdk_uuid_parse(&ns_uuid, BDEV_DELAY_NAMESPACE_UUID);
772 
773 	/* Check our list of names from config versus this bdev and if
774 	 * there's a match, create the delay_node & bdev accordingly.
775 	 */
776 	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
777 		if (strcmp(assoc->bdev_name, bdev_name) != 0) {
778 			continue;
779 		}
780 
781 		delay_node = calloc(1, sizeof(struct vbdev_delay));
782 		if (!delay_node) {
783 			rc = -ENOMEM;
784 			SPDK_ERRLOG("could not allocate delay_node\n");
785 			break;
786 		}
787 		delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
788 		if (!delay_node->delay_bdev.name) {
789 			rc = -ENOMEM;
790 			SPDK_ERRLOG("could not allocate delay_bdev name\n");
791 			free(delay_node);
792 			break;
793 		}
794 		delay_node->delay_bdev.product_name = "delay";
795 
796 		/* The base bdev that we're attaching to. */
797 		rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
798 					NULL, &delay_node->base_desc);
799 		if (rc) {
800 			if (rc != -ENODEV) {
801 				SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
802 			}
803 			free(delay_node->delay_bdev.name);
804 			free(delay_node);
805 			break;
806 		}
807 
808 		bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
809 		delay_node->base_bdev = bdev;
810 
811 		delay_node->delay_bdev.write_cache = bdev->write_cache;
812 		delay_node->delay_bdev.required_alignment = bdev->required_alignment;
813 		delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
814 		delay_node->delay_bdev.blocklen = bdev->blocklen;
815 		delay_node->delay_bdev.blockcnt = bdev->blockcnt;
816 
817 		delay_node->delay_bdev.ctxt = delay_node;
818 		delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
819 		delay_node->delay_bdev.module = &delay_if;
820 
821 		/* Store the number of ticks you need to add to get the I/O expiration time. */
822 		delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
823 		delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
824 		delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
825 		delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;
826 
827 		if (spdk_uuid_is_null(&assoc->uuid)) {
828 			/* Generate UUID based on namespace UUID + base bdev UUID */
829 			rc = spdk_uuid_generate_sha1(&delay_node->delay_bdev.uuid, &ns_uuid,
830 						     (const char *)&bdev->uuid, sizeof(struct spdk_uuid));
831 			if (rc) {
832 				spdk_bdev_close(delay_node->base_desc);
833 				free(delay_node->delay_bdev.name);
834 				free(delay_node);
835 				break;
836 			}
837 		} else {
838 			spdk_uuid_copy(&delay_node->delay_bdev.uuid, &assoc->uuid);
839 		}
840 
841 		spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
842 					sizeof(struct delay_io_channel),
843 					assoc->vbdev_name);
844 
845 		/* Save the thread where the base device is opened */
846 		delay_node->thread = spdk_get_thread();
847 
848 		rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
849 		if (rc) {
850 			SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
851 			goto error_close;
852 		}
853 
854 		rc = spdk_bdev_register(&delay_node->delay_bdev);
855 		if (rc) {
856 			SPDK_ERRLOG("could not register delay_bdev\n");
857 			spdk_bdev_module_release_bdev(delay_node->base_bdev);
858 			goto error_close;
859 		}
860 
861 		TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
862 	}
863 
864 	return rc;
865 
866 error_close:
867 	spdk_bdev_close(delay_node->base_desc);
868 	spdk_io_device_unregister(delay_node, NULL);
869 	free(delay_node->delay_bdev.name);
870 	free(delay_node);
871 	return rc;
872 }
873 
/* Create a delay vbdev named vbdev_name on top of the base bdev bdev_name.
 *
 * The association is recorded first and registration is then attempted. If
 * the base bdev does not exist yet (-ENODEV), creation is deferred until it
 * shows up and 0 is returned.
 *
 * Returns 0 on success or deferral, -EINVAL if a p99 latency is lower than
 * the matching average latency, or the error from recording or registering
 * the association.
 */
int
create_delay_disk(const char *bdev_name, const char *vbdev_name, struct spdk_uuid *uuid,
		  uint64_t avg_read_latency,
		  uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	int rc;

	if (avg_read_latency > p99_read_latency || avg_write_latency > p99_write_latency) {
		SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
		return -EINVAL;
	}

	rc = vbdev_delay_insert_association(bdev_name, vbdev_name, uuid, avg_read_latency,
					    p99_read_latency, avg_write_latency, p99_write_latency);
	if (rc != 0) {
		return rc;
	}

	rc = vbdev_delay_register(bdev_name);
	if (rc != -ENODEV) {
		return rc;
	}

	/* This is not an error, we tracked the name above and it still
	 * may show up later.
	 */
	SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
	return 0;
}
903 
904 void
905 delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
906 {
907 	struct bdev_association *assoc;
908 	int rc;
909 
910 	rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
911 	if (rc == 0) {
912 		TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
913 			if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
914 				TAILQ_REMOVE(&g_bdev_associations, assoc, link);
915 				free(assoc->bdev_name);
916 				free(assoc->vbdev_name);
917 				free(assoc);
918 				break;
919 			}
920 		}
921 	} else {
922 		cb_fn(cb_arg, rc);
923 	}
924 }
925 
926 static void
927 vbdev_delay_examine(struct spdk_bdev *bdev)
928 {
929 	vbdev_delay_register(bdev->name);
930 
931 	spdk_bdev_module_examine_done(&delay_if);
932 }
933 
934 SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)
935