xref: /spdk/module/bdev/delay/vbdev_delay.c (revision b6875e1ce57743f3b1416016b9c624d79a862af9)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "vbdev_delay.h"
10 #include "spdk/rpc.h"
11 #include "spdk/env.h"
12 #include "spdk/endian.h"
13 #include "spdk/string.h"
14 #include "spdk/thread.h"
15 #include "spdk/util.h"
16 
17 #include "spdk/bdev_module.h"
18 #include "spdk/log.h"
19 
20 /* This namespace UUID was generated using uuid_generate() method. */
21 #define BDEV_DELAY_NAMESPACE_UUID "4009b574-6430-4f1b-bc40-ace811091027"
22 
/* Forward declarations of the module-level entry points referenced by delay_if below. */
static int vbdev_delay_init(void);
static int vbdev_delay_get_ctx_size(void);
static void vbdev_delay_examine(struct spdk_bdev *bdev);
static void vbdev_delay_finish(void);
static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);

/* Module descriptor for the "delay" bdev module; it wires the callbacks
 * declared above into the generic bdev layer.
 */
static struct spdk_bdev_module delay_if = {
	.name = "delay",
	.module_init = vbdev_delay_init,
	.get_ctx_size = vbdev_delay_get_ctx_size,
	.examine_config = vbdev_delay_examine,
	.module_fini = vbdev_delay_finish,
	.config_json = vbdev_delay_config_json
};

/* Register the module descriptor with the bdev layer. */
SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)
39 
/* Associative list to be used in examine.
 *
 * Each entry records one requested delay bdev: the name of the base bdev it
 * should sit on, the name it should be registered under, and the latencies
 * to apply. vbdev_delay_register() walks this list to find the entries that
 * match a given base bdev name.
 */
struct bdev_association {
	char			*vbdev_name;		/* name of the delay bdev to create (heap allocated) */
	char			*bdev_name;		/* name of the base bdev (heap allocated) */
	struct spdk_uuid	uuid;			/* UUID for the delay bdev; when null one is generated at register time */
	uint64_t		avg_read_latency;	/* average read latency, in microseconds */
	uint64_t		p99_read_latency;	/* p99 read latency, in microseconds */
	uint64_t		avg_write_latency;	/* average write latency, in microseconds */
	uint64_t		p99_write_latency;	/* p99 write latency, in microseconds */
	TAILQ_ENTRY(bdev_association)	link;		/* linkage on g_bdev_associations */
};
/* Global list of all recorded associations. */
static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
			g_bdev_associations);
53 
/* List of virtual bdevs and associated info for each.
 *
 * The latency fields hold tick counts: they are added to the current tick
 * value when a base I/O completes to obtain the tick at which the delayed
 * completion is reported upwards.
 */
struct vbdev_delay {
	struct spdk_bdev		*base_bdev; /* the thing we're attaching to */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct spdk_bdev		delay_bdev;    /* the delay virtual bdev */
	uint64_t			average_read_latency_ticks; /* the average read delay */
	uint64_t			p99_read_latency_ticks; /* the p99 read delay */
	uint64_t			average_write_latency_ticks; /* the average write delay */
	uint64_t			p99_write_latency_ticks; /* the p99 write delay */
	TAILQ_ENTRY(vbdev_delay)	link;       /* linkage on g_delay_nodes */
	struct spdk_thread		*thread;    /* thread where base device is opened */
};
/* Global list of all registered delay nodes. */
static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);
67 
/* Per-I/O context of the delay module. Its size is what
 * vbdev_delay_get_ctx_size() reports, and it is accessed through the
 * driver_ctx area of each bdev_io submitted to a delay bdev.
 */
struct delay_bdev_io {
	/* Status recorded when the base I/O completes; reported to the
	 * submitter once the delay has elapsed.
	 */
	int status;

	/* Tick value from which the channel poller may complete this I/O. */
	uint64_t completion_tick;

	/* Delay class picked at submission; selects the per-channel queue. */
	enum delay_io_type type;

	/* Channel the I/O was submitted on. */
	struct spdk_io_channel *ch;

	/* Wait entry used to retry the submission after an -ENOMEM. */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* Base bdev_io kept from a successful zcopy start until the matching
	 * zcopy end (or an abort) releases it; NULL otherwise.
	 */
	struct spdk_bdev_io *zcopy_bdev_io;

	/* Linkage on one of the delay_io_channel queues. */
	STAILQ_ENTRY(delay_bdev_io) link;
};
83 
/* Per-channel context of a delay bdev. Completed base I/O wait on one of the
 * four queues below until the channel poller finds that their completion
 * tick has been reached.
 */
struct delay_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;	/* reads delayed by the average read latency */
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;	/* reads delayed by the p99 read latency */
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;	/* writes delayed by the average write latency */
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;	/* writes delayed by the p99 write latency */
	struct spdk_poller *io_poller;			/* poller running _delay_finish_io on this channel */
	unsigned int rand_seed;				/* rand_r() state used to pick p99 vs. average */
};

/* Forward declaration: the retry path needs it before the definition. */
static void vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
95 
96 
97 /* Callback for unregistering the IO device. */
98 static void
99 _device_unregister_cb(void *io_device)
100 {
101 	struct vbdev_delay *delay_node  = io_device;
102 
103 	/* Done with this delay_node. */
104 	free(delay_node->delay_bdev.name);
105 	free(delay_node);
106 }
107 
/* Thread message handler: closes the base bdev descriptor handed over as ctx. */
static void
_vbdev_delay_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
115 
116 static int
117 vbdev_delay_destruct(void *ctx)
118 {
119 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
120 
121 	/* It is important to follow this exact sequence of steps for destroying
122 	 * a vbdev...
123 	 */
124 
125 	TAILQ_REMOVE(&g_delay_nodes, delay_node, link);
126 
127 	/* Unclaim the underlying bdev. */
128 	spdk_bdev_module_release_bdev(delay_node->base_bdev);
129 
130 	/* Close the underlying bdev on its same opened thread. */
131 	if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
132 		spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
133 	} else {
134 		spdk_bdev_close(delay_node->base_desc);
135 	}
136 
137 	/* Unregister the io_device. */
138 	spdk_io_device_unregister(delay_node, _device_unregister_cb);
139 
140 	return 0;
141 }
142 
143 static int
144 _process_io_stailq(void *arg, uint64_t ticks)
145 {
146 	STAILQ_HEAD(, delay_bdev_io) *head = arg;
147 	struct delay_bdev_io *io_ctx, *tmp;
148 	int completions = 0;
149 
150 	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
151 		if (io_ctx->completion_tick <= ticks) {
152 			STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
153 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
154 			completions++;
155 		} else {
156 			/* In the general case, I/O will become ready in an fifo order. When timeouts are dynamically
157 			 * changed, this is not necessarily the case. However, the normal behavior will be restored
158 			 * after the outstanding I/O at the time of the change have been completed.
159 			 * This essentially means that moving from a high to low latency creates a dam for the new I/O
160 			 * submitted after the latency change. This is considered desirable behavior for the use case where
161 			 * we are trying to trigger a pre-defined timeout on an initiator.
162 			 */
163 			break;
164 		}
165 	}
166 
167 	return completions;
168 }
169 
170 static int
171 _delay_finish_io(void *arg)
172 {
173 	struct delay_io_channel *delay_ch = arg;
174 	uint64_t ticks = spdk_get_ticks();
175 	int completions = 0;
176 
177 	completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
178 	completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
179 	completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
180 	completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);
181 
182 	return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
183 }
184 
185 /* Completion callback for IO that were issued from this bdev. The original bdev_io
186  * is passed in as an arg so we'll complete that one with the appropriate status
187  * and then free the one that this module issued.
188  */
189 static void
190 _delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
191 {
192 	struct spdk_bdev_io *orig_io = cb_arg;
193 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
194 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
195 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
196 
197 	io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
198 
199 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
200 		io_ctx->zcopy_bdev_io = bdev_io;
201 	} else {
202 		assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
203 		io_ctx->zcopy_bdev_io = NULL;
204 		spdk_bdev_free_io(bdev_io);
205 	}
206 
207 	/* Put the I/O into the proper list for processing by the channel poller. */
208 	switch (io_ctx->type) {
209 	case DELAY_AVG_READ:
210 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
211 		STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
212 		break;
213 	case DELAY_AVG_WRITE:
214 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
215 		STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
216 		break;
217 	case DELAY_P99_READ:
218 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
219 		STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
220 		break;
221 	case DELAY_P99_WRITE:
222 		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
223 		STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
224 		break;
225 	case DELAY_NONE:
226 	default:
227 		spdk_bdev_io_complete(orig_io, io_ctx->status);
228 		break;
229 	}
230 }
231 
232 static void
233 vbdev_delay_resubmit_io(void *arg)
234 {
235 	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
236 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
237 
238 	vbdev_delay_submit_request(io_ctx->ch, bdev_io);
239 }
240 
241 static void
242 vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
243 {
244 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
245 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
246 	int rc;
247 
248 	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
249 	io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
250 	io_ctx->bdev_io_wait.cb_arg = bdev_io;
251 
252 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
253 	if (rc != 0) {
254 		SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
255 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
256 	}
257 }
258 
259 static void
260 delay_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
261 {
262 	memset(opts, 0, sizeof(*opts));
263 	opts->size = sizeof(*opts);
264 	opts->memory_domain = bdev_io->u.bdev.memory_domain;
265 	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
266 	opts->metadata = bdev_io->u.bdev.md_buf;
267 }
268 
269 static void
270 delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
271 {
272 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
273 					 delay_bdev);
274 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
275 	struct spdk_bdev_ext_io_opts io_opts;
276 	int rc;
277 
278 	if (!success) {
279 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
280 		return;
281 	}
282 
283 	delay_init_ext_io_opts(bdev_io, &io_opts);
284 	rc = spdk_bdev_readv_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
285 					bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
286 					bdev_io->u.bdev.num_blocks, _delay_complete_io,
287 					bdev_io, &io_opts);
288 
289 	if (rc == -ENOMEM) {
290 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
291 		vbdev_delay_queue_io(bdev_io);
292 	} else if (rc != 0) {
293 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
294 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
295 	}
296 }
297 
298 static void
299 vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
300 {
301 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
302 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
303 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
304 	struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
305 	int rc;
306 
307 	rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
308 			     _delay_complete_io, bdev_io);
309 
310 	if (rc == -ENOMEM) {
311 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
312 		vbdev_delay_queue_io(bdev_io);
313 	} else if (rc != 0) {
314 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
315 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
316 	}
317 }
318 
/* Completion callback of the zcopy end issued while aborting a delayed I/O:
 * the base bdev_io only needs to be released, whatever the outcome.
 */
static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	(void)success;
	(void)cb_arg;

	spdk_bdev_free_io(bdev_io);
}
324 
325 static void
326 _abort_all_delayed_io(void *arg)
327 {
328 	STAILQ_HEAD(, delay_bdev_io) *head = arg;
329 	struct delay_bdev_io *io_ctx, *tmp;
330 
331 	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
332 		STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
333 		if (io_ctx->zcopy_bdev_io != NULL) {
334 			spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
335 		}
336 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
337 	}
338 }
339 
340 static void
341 vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
342 {
343 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
344 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
345 
346 	_abort_all_delayed_io(&delay_ch->avg_read_io);
347 	_abort_all_delayed_io(&delay_ch->avg_write_io);
348 	_abort_all_delayed_io(&delay_ch->p99_read_io);
349 	_abort_all_delayed_io(&delay_ch->p99_write_io);
350 
351 	spdk_for_each_channel_continue(i, 0);
352 }
353 
354 static bool
355 abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
356 {
357 	STAILQ_HEAD(, delay_bdev_io) *head = _head;
358 	struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
359 	struct delay_bdev_io *io_ctx;
360 
361 	STAILQ_FOREACH(io_ctx, head, link) {
362 		if (io_ctx == io_ctx_to_abort) {
363 			STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
364 			if (io_ctx->zcopy_bdev_io != NULL) {
365 				spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
366 			}
367 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
368 			return true;
369 		}
370 	}
371 
372 	return false;
373 }
374 
375 static int
376 vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
377 		  struct spdk_bdev_io *bdev_io)
378 {
379 	struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
380 
381 	if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
382 	    abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
383 	    abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
384 	    abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
385 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
386 		return 0;
387 	}
388 
389 	return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
390 			       _delay_complete_io, bdev_io);
391 }
392 
393 static void
394 vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
395 {
396 	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
397 	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
398 	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
399 	struct spdk_bdev_ext_io_opts io_opts;
400 	int rc = 0;
401 	bool is_p99;
402 
403 	is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false;
404 
405 	io_ctx->ch = ch;
406 	io_ctx->type = DELAY_NONE;
407 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
408 		io_ctx->zcopy_bdev_io = NULL;
409 	}
410 
411 	switch (bdev_io->type) {
412 	case SPDK_BDEV_IO_TYPE_READ:
413 		io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
414 		spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
415 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
416 		break;
417 	case SPDK_BDEV_IO_TYPE_WRITE:
418 		io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
419 		delay_init_ext_io_opts(bdev_io, &io_opts);
420 		rc = spdk_bdev_writev_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
421 						 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
422 						 bdev_io->u.bdev.num_blocks, _delay_complete_io,
423 						 bdev_io, &io_opts);
424 		break;
425 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
426 		rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
427 						   bdev_io->u.bdev.offset_blocks,
428 						   bdev_io->u.bdev.num_blocks,
429 						   _delay_complete_io, bdev_io);
430 		break;
431 	case SPDK_BDEV_IO_TYPE_UNMAP:
432 		rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
433 					    bdev_io->u.bdev.offset_blocks,
434 					    bdev_io->u.bdev.num_blocks,
435 					    _delay_complete_io, bdev_io);
436 		break;
437 	case SPDK_BDEV_IO_TYPE_FLUSH:
438 		rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
439 					    bdev_io->u.bdev.offset_blocks,
440 					    bdev_io->u.bdev.num_blocks,
441 					    _delay_complete_io, bdev_io);
442 		break;
443 	case SPDK_BDEV_IO_TYPE_RESET:
444 		/* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
445 		 * Hence we can simply abort all I/Os delayed to complete.
446 		 */
447 		spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
448 				      vbdev_delay_reset_dev);
449 		break;
450 	case SPDK_BDEV_IO_TYPE_ABORT:
451 		rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
452 		break;
453 	case SPDK_BDEV_IO_TYPE_ZCOPY:
454 		if (bdev_io->u.bdev.zcopy.commit) {
455 			io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
456 		} else if (bdev_io->u.bdev.zcopy.populate) {
457 			io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
458 		}
459 		if (bdev_io->u.bdev.zcopy.start) {
460 			rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
461 						   bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
462 						   bdev_io->u.bdev.offset_blocks,
463 						   bdev_io->u.bdev.num_blocks,
464 						   bdev_io->u.bdev.zcopy.populate,
465 						   _delay_complete_io, bdev_io);
466 		} else {
467 			rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
468 						 _delay_complete_io, bdev_io);
469 		}
470 		break;
471 	default:
472 		SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
473 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
474 		return;
475 	}
476 
477 	if (rc == -ENOMEM) {
478 		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
479 		vbdev_delay_queue_io(bdev_io);
480 	} else if (rc != 0) {
481 		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
482 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
483 	}
484 }
485 
486 static bool
487 vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
488 {
489 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
490 
491 	return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
492 }
493 
/* get_io_channel entry point: returns an I/O channel for the io_device
 * registered with the delay node (ctx) as its handle.
 */
static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *node = ctx;

	return spdk_get_io_channel(node);
}
504 
505 static void
506 _delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
507 {
508 	struct spdk_uuid *uuid = &delay_node->delay_bdev.uuid;
509 
510 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
511 	spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
512 	if (!spdk_uuid_is_null(uuid)) {
513 		spdk_json_write_named_uuid(w, "uuid", uuid);
514 	}
515 	spdk_json_write_named_int64(w, "avg_read_latency",
516 				    delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
517 	spdk_json_write_named_int64(w, "p99_read_latency",
518 				    delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
519 	spdk_json_write_named_int64(w, "avg_write_latency",
520 				    delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
521 	spdk_json_write_named_int64(w, "p99_write_latency",
522 				    delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
523 }
524 
/* dump_info_json entry point: emits a "delay" object holding the
 * configuration values of the delay bdev (ctx). Always returns 0.
 */
static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *node = ctx;

	spdk_json_write_named_object_begin(w, "delay");
	_delay_write_conf_values(node, w);
	spdk_json_write_object_end(w);

	return 0;
}
537 
538 /* This is used to generate JSON that can configure this module to its current state. */
539 static int
540 vbdev_delay_config_json(struct spdk_json_write_ctx *w)
541 {
542 	struct vbdev_delay *delay_node;
543 
544 	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
545 		spdk_json_write_object_begin(w);
546 		spdk_json_write_named_string(w, "method", "bdev_delay_create");
547 		spdk_json_write_named_object_begin(w, "params");
548 		_delay_write_conf_values(delay_node, w);
549 		spdk_json_write_object_end(w);
550 		spdk_json_write_object_end(w);
551 	}
552 	return 0;
553 }
554 
555 /* We provide this callback for the SPDK channel code to create a channel using
556  * the channel struct we provided in our module get_io_channel() entry point. Here
557  * we get and save off an underlying base channel of the device below us so that
558  * we can communicate with the base bdev on a per channel basis.  If we needed
559  * our own poller for this vbdev, we'd register it here.
560  */
561 static int
562 delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
563 {
564 	struct delay_io_channel *delay_ch = ctx_buf;
565 	struct vbdev_delay *delay_node = io_device;
566 
567 	STAILQ_INIT(&delay_ch->avg_read_io);
568 	STAILQ_INIT(&delay_ch->p99_read_io);
569 	STAILQ_INIT(&delay_ch->avg_write_io);
570 	STAILQ_INIT(&delay_ch->p99_write_io);
571 
572 	delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
573 	delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
574 	delay_ch->rand_seed = time(NULL);
575 
576 	return 0;
577 }
578 
579 /* We provide this callback for the SPDK channel code to destroy a channel
580  * created with our create callback. We just need to undo anything we did
581  * when we created. If this bdev used its own poller, we'd unregister it here.
582  */
583 static void
584 delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
585 {
586 	struct delay_io_channel *delay_ch = ctx_buf;
587 
588 	spdk_poller_unregister(&delay_ch->io_poller);
589 	spdk_put_io_channel(delay_ch->base_ch);
590 }
591 
592 /* Create the delay association from the bdev and vbdev name and insert
593  * on the global list. */
594 static int
595 vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
596 			       struct spdk_uuid *uuid,
597 			       uint64_t avg_read_latency, uint64_t p99_read_latency,
598 			       uint64_t avg_write_latency, uint64_t p99_write_latency)
599 {
600 	struct bdev_association *assoc;
601 
602 	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
603 		if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
604 			SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
605 			return -EEXIST;
606 		}
607 	}
608 
609 	assoc = calloc(1, sizeof(struct bdev_association));
610 	if (!assoc) {
611 		SPDK_ERRLOG("could not allocate bdev_association\n");
612 		return -ENOMEM;
613 	}
614 
615 	assoc->bdev_name = strdup(bdev_name);
616 	if (!assoc->bdev_name) {
617 		SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
618 		free(assoc);
619 		return -ENOMEM;
620 	}
621 
622 	assoc->vbdev_name = strdup(vbdev_name);
623 	if (!assoc->vbdev_name) {
624 		SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
625 		free(assoc->bdev_name);
626 		free(assoc);
627 		return -ENOMEM;
628 	}
629 
630 	assoc->avg_read_latency = avg_read_latency;
631 	assoc->p99_read_latency = p99_read_latency;
632 	assoc->avg_write_latency = avg_write_latency;
633 	assoc->p99_write_latency = p99_write_latency;
634 	spdk_uuid_copy(&assoc->uuid, uuid);
635 
636 	TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);
637 
638 	return 0;
639 }
640 
641 int
642 vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
643 {
644 	struct vbdev_delay *delay_node;
645 	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
646 
647 	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
648 		if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
649 			break;
650 		}
651 	}
652 
653 	if (delay_node == NULL) {
654 		return -ENODEV;
655 	}
656 
657 	switch (type) {
658 	case DELAY_AVG_READ:
659 		delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
660 		break;
661 	case DELAY_AVG_WRITE:
662 		delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
663 		break;
664 	case DELAY_P99_READ:
665 		delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
666 		break;
667 	case DELAY_P99_WRITE:
668 		delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
669 		break;
670 	default:
671 		return -EINVAL;
672 	}
673 
674 	return 0;
675 }
676 
/* module_init entry point of the delay module. There is nothing to set up
 * here; it always returns 0.
 */
static int
vbdev_delay_init(void)
{
	/* Not allowing for .ini style configuration. */
	return 0;
}
683 
684 static void
685 vbdev_delay_finish(void)
686 {
687 	struct bdev_association *assoc;
688 
689 	while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
690 		TAILQ_REMOVE(&g_bdev_associations, assoc, link);
691 		free(assoc->bdev_name);
692 		free(assoc->vbdev_name);
693 		free(assoc);
694 	}
695 }
696 
/* get_ctx_size entry point: reports the size of the per-I/O context
 * (struct delay_bdev_io) this module needs for each bdev_io.
 */
static int
vbdev_delay_get_ctx_size(void)
{
	return sizeof(struct delay_bdev_io);
}
702 
/* write_config_json entry point. Intentionally empty: the configuration of
 * all delay bdevs is written by the module-level vbdev_delay_config_json().
 */
static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}
708 
709 static int
710 vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
711 {
712 	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
713 
714 	/* Delay bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */
715 	return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
716 }
717 
/* When we register our bdev this is how we specify our entry points.
 * vbdev_delay_register() stores a pointer to this table in every delay bdev.
 */
static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
	.destruct		= vbdev_delay_destruct,
	.submit_request		= vbdev_delay_submit_request,
	.io_type_supported	= vbdev_delay_io_type_supported,
	.get_io_channel		= vbdev_delay_get_io_channel,
	.dump_info_json		= vbdev_delay_dump_info_json,
	.write_config_json	= vbdev_delay_write_config_json,
	.get_memory_domains	= vbdev_delay_get_memory_domains,
};
728 
729 static void
730 vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
731 {
732 	struct vbdev_delay *delay_node, *tmp;
733 
734 	TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
735 		if (bdev_find == delay_node->base_bdev) {
736 			spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
737 		}
738 	}
739 }
740 
741 /* Called when the underlying base bdev triggers asynchronous event such as bdev removal. */
742 static void
743 vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
744 			       void *event_ctx)
745 {
746 	switch (type) {
747 	case SPDK_BDEV_EVENT_REMOVE:
748 		vbdev_delay_base_bdev_hotremove_cb(bdev);
749 		break;
750 	default:
751 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
752 		break;
753 	}
754 }
755 
756 /* Create and register the delay vbdev if we find it in our list of bdev names.
757  * This can be called either by the examine path or RPC method.
758  */
759 static int
760 vbdev_delay_register(const char *bdev_name)
761 {
762 	struct bdev_association *assoc;
763 	struct vbdev_delay *delay_node;
764 	struct spdk_bdev *bdev;
765 	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
766 	struct spdk_uuid ns_uuid;
767 	int rc = 0;
768 
769 	spdk_uuid_parse(&ns_uuid, BDEV_DELAY_NAMESPACE_UUID);
770 
771 	/* Check our list of names from config versus this bdev and if
772 	 * there's a match, create the delay_node & bdev accordingly.
773 	 */
774 	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
775 		if (strcmp(assoc->bdev_name, bdev_name) != 0) {
776 			continue;
777 		}
778 
779 		delay_node = calloc(1, sizeof(struct vbdev_delay));
780 		if (!delay_node) {
781 			rc = -ENOMEM;
782 			SPDK_ERRLOG("could not allocate delay_node\n");
783 			break;
784 		}
785 		delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
786 		if (!delay_node->delay_bdev.name) {
787 			rc = -ENOMEM;
788 			SPDK_ERRLOG("could not allocate delay_bdev name\n");
789 			free(delay_node);
790 			break;
791 		}
792 		delay_node->delay_bdev.product_name = "delay";
793 
794 		/* The base bdev that we're attaching to. */
795 		rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
796 					NULL, &delay_node->base_desc);
797 		if (rc) {
798 			if (rc != -ENODEV) {
799 				SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
800 			}
801 			free(delay_node->delay_bdev.name);
802 			free(delay_node);
803 			break;
804 		}
805 
806 		bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
807 		delay_node->base_bdev = bdev;
808 
809 		delay_node->delay_bdev.write_cache = bdev->write_cache;
810 		delay_node->delay_bdev.required_alignment = bdev->required_alignment;
811 		delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
812 		delay_node->delay_bdev.blocklen = bdev->blocklen;
813 		delay_node->delay_bdev.blockcnt = bdev->blockcnt;
814 
815 		delay_node->delay_bdev.md_interleave = bdev->md_interleave;
816 		delay_node->delay_bdev.md_len = bdev->md_len;
817 		delay_node->delay_bdev.dif_type = bdev->dif_type;
818 		delay_node->delay_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md;
819 		delay_node->delay_bdev.dif_check_flags = bdev->dif_check_flags;
820 		delay_node->delay_bdev.dif_pi_format = bdev->dif_pi_format;
821 
822 		delay_node->delay_bdev.ctxt = delay_node;
823 		delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
824 		delay_node->delay_bdev.module = &delay_if;
825 
826 		/* Store the number of ticks you need to add to get the I/O expiration time. */
827 		delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
828 		delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
829 		delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
830 		delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;
831 
832 		if (spdk_uuid_is_null(&assoc->uuid)) {
833 			/* Generate UUID based on namespace UUID + base bdev UUID */
834 			rc = spdk_uuid_generate_sha1(&delay_node->delay_bdev.uuid, &ns_uuid,
835 						     (const char *)&bdev->uuid, sizeof(struct spdk_uuid));
836 			if (rc) {
837 				spdk_bdev_close(delay_node->base_desc);
838 				free(delay_node->delay_bdev.name);
839 				free(delay_node);
840 				break;
841 			}
842 		} else {
843 			spdk_uuid_copy(&delay_node->delay_bdev.uuid, &assoc->uuid);
844 		}
845 
846 		spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
847 					sizeof(struct delay_io_channel),
848 					assoc->vbdev_name);
849 
850 		/* Save the thread where the base device is opened */
851 		delay_node->thread = spdk_get_thread();
852 
853 		rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
854 		if (rc) {
855 			SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
856 			goto error_close;
857 		}
858 
859 		rc = spdk_bdev_register(&delay_node->delay_bdev);
860 		if (rc) {
861 			SPDK_ERRLOG("could not register delay_bdev\n");
862 			spdk_bdev_module_release_bdev(delay_node->base_bdev);
863 			goto error_close;
864 		}
865 
866 		TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
867 	}
868 
869 	return rc;
870 
871 error_close:
872 	spdk_bdev_close(delay_node->base_desc);
873 	spdk_io_device_unregister(delay_node, NULL);
874 	free(delay_node->delay_bdev.name);
875 	free(delay_node);
876 	return rc;
877 }
878 
879 int
880 create_delay_disk(const char *bdev_name, const char *vbdev_name, struct spdk_uuid *uuid,
881 		  uint64_t avg_read_latency,
882 		  uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
883 {
884 	int rc = 0;
885 
886 	if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
887 		SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
888 		return -EINVAL;
889 	}
890 
891 	rc = vbdev_delay_insert_association(bdev_name, vbdev_name, uuid, avg_read_latency, p99_read_latency,
892 					    avg_write_latency, p99_write_latency);
893 	if (rc) {
894 		return rc;
895 	}
896 
897 	rc = vbdev_delay_register(bdev_name);
898 	if (rc == -ENODEV) {
899 		/* This is not an error, we tracked the name above and it still
900 		 * may show up later.
901 		 */
902 		SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
903 		rc = 0;
904 	}
905 
906 	return rc;
907 }
908 
909 void
910 delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
911 {
912 	struct bdev_association *assoc;
913 	int rc;
914 
915 	rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
916 	if (rc == 0) {
917 		TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
918 			if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
919 				TAILQ_REMOVE(&g_bdev_associations, assoc, link);
920 				free(assoc->bdev_name);
921 				free(assoc->vbdev_name);
922 				free(assoc);
923 				break;
924 			}
925 		}
926 	} else {
927 		cb_fn(cb_arg, rc);
928 	}
929 }
930 
931 static void
932 vbdev_delay_examine(struct spdk_bdev *bdev)
933 {
934 	vbdev_delay_register(bdev->name);
935 
936 	spdk_bdev_module_examine_done(&delay_if);
937 }
938 
939 SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)
940