/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/env.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
#define BUF_SMALL_POOL_SIZE	8192
#define BUF_LARGE_POOL_SIZE	1024

typedef TAILQ_HEAD(, spdk_bdev_io) need_buf_tailq_t;

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	TAILQ_HEAD(, spdk_bdev_module_if) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	spdk_bdev_poller_start_cb start_poller_fn;
	spdk_bdev_poller_stop_cb stop_poller_fn;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.start_poller_fn = NULL,
	.stop_poller_fn = NULL,
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb	g_cb_fn = NULL;
static void			*g_cb_arg = NULL;


struct spdk_bdev_mgmt_channel {
	need_buf_tailq_t need_buf_small;
	need_buf_tailq_t need_buf_large;
};

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	spdk_bdev_remove_cb_t		remove_cb;
	void				*remove_ctx;
	bool				write;
	TAILQ_ENTRY(spdk_bdev_desc)	link;
};

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Channel for the bdev manager */
	struct spdk_io_channel *mgmt_channel;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif

};

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

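/*
 * Walk forward from the given bdev and return the first "leaf" - a bdev
 * that has no virtual bdevs built on top of it (its vbdevs list is empty).
 */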
static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (TAILQ_EMPTY(&bdev->vbdevs)) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}
		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	bdev_io->buf = buf;
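	/*
	 * Pool elements are allocated 512 bytes larger than the maximum I/O
	 * size (see spdk_bdev_initialize()), so the data pointer can always
	 * be rounded up to the next 512-byte boundary within the element.
	 */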
	bdev_io->u.read.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.read.iovs[0].iov_len = bdev_io->u.read.len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	need_buf_tailq_t *tailq;
	uint64_t length;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.read.iovcnt == 1);

	length = bdev_io->u.read.len;
	buf = bdev_io->buf;

	ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel);

	if (length <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		tailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		tailq = &ch->need_buf_large;
	}

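	/*
	 * If an I/O is already queued waiting for a buffer of this size, hand
	 * the buffer straight to it rather than returning it to the pool.
	 */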
	if (TAILQ_EMPTY(tailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = TAILQ_FIRST(tailq);
		TAILQ_REMOVE(tailq, tmp, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb)
{
	uint64_t len = bdev_io->u.read.len;
	struct spdk_mempool *pool;
	need_buf_tailq_t *tailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *ch;

	assert(cb != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.read.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel);

	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		tailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		tailq = &ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

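	/*
	 * If the pool is exhausted, queue this I/O; spdk_bdev_io_put_buf()
	 * will hand it a buffer as soon as another I/O releases one.
	 */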
	if (!buf) {
		TAILQ_INSERT_TAIL(tailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}

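/*
 * Return the largest per-I/O context size required by any registered module,
 * so that a single bdev_io pool can serve all of them.
 */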
static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	TAILQ_INIT(&ch->need_buf_small);
	TAILQ_INIT(&ch->need_buf_large);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->need_buf_small) || !TAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n");
	}
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_cb_fn;
	void *cb_arg = g_cb_arg;

	g_bdev_mgr.init_complete = true;
	g_cb_fn = NULL;
	g_cb_arg = NULL;

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_init_complete(int rc)
{
	struct spdk_bdev_module_if *m;

	g_bdev_mgr.module_init_complete = true;

	if (rc != 0) {
		spdk_bdev_init_complete(rc);
		return;
	}

	/*
	 * Check all bdev modules for examinations in progress.  If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->examine_in_progress > 0) {
			return;
		}
	}

	spdk_bdev_init_complete(0);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module_if *module;
	int rc;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

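/*
 * The bdev library has no event loop of its own.  Poller registration is
 * delegated to the start/stop callbacks the application supplied to
 * spdk_bdev_initialize().
 */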
void
spdk_bdev_poller_start(struct spdk_bdev_poller **ppoller,
		       spdk_bdev_poller_fn fn,
		       void *arg,
		       uint32_t lcore,
		       uint64_t period_microseconds)
{
	g_bdev_mgr.start_poller_fn(ppoller, fn, arg, lcore, period_microseconds);
}

void
spdk_bdev_poller_stop(struct spdk_bdev_poller **ppoller)
{
	g_bdev_mgr.stop_poller_fn(ppoller);
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg,
		     spdk_bdev_poller_start_cb start_poller_fn,
		     spdk_bdev_poller_stop_cb stop_poller_fn)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_cb_fn = cb_fn;
	g_cb_arg = cb_arg;

	g_bdev_mgr.start_poller_fn = start_poller_fn;
	g_bdev_mgr.stop_poller_fn = stop_poller_fn;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  64,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches,
	 *   using spdk_env_get_core_count() to determine how many local caches
	 *   we need to account for.
	 */
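	/* e.g. with 4 cores and BUF_SMALL_POOL_SIZE 8192: 8192 / (2 * 4) = 1024 buffers per core cache. */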
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create buf small pool failed\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create buf large pool failed\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	spdk_bdev_module_init_complete(rc);
}

int
spdk_bdev_finish(void)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);

	spdk_io_device_unregister(&g_bdev_mgr, NULL);

	return 0;
}

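/*
 * Allocate an spdk_bdev_io from the global pool.  Exhaustion of the pool is
 * treated as fatal here; callers rely on this never returning NULL.
 */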
struct spdk_bdev_io *
spdk_bdev_get_io(void)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	if (!bdev_io) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		abort();
	}

	memset(bdev_io, 0, sizeof(*bdev_io));

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		return;
	}

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
}

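/*
 * in_submit_request is set for the duration of the module's submit_request()
 * call so that spdk_bdev_io_complete() can detect completions that occur
 * synchronously within it and defer the user callback to avoid recursion.
 */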
static void
__submit_request(struct spdk_bdev *bdev, struct spdk_bdev_io *bdev_io)
{
	struct spdk_io_channel *ch;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->ch->channel;

	bdev_io->ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static int
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	__submit_request(bdev, bdev_io);
	return 0;
}

void
spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev_desc *new_bdev_desc)
{
	struct spdk_bdev *new_bdev = new_bdev_desc->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
	bdev_io->bdev = new_bdev;

	/*
	 * These fields are normally set during spdk_bdev_io_init(), but since bdev is
	 * being switched, they need to be reinitialized.
	 */
	bdev_io->gencnt = new_bdev->gencnt;

	/*
	 * This bdev_io was already submitted so decrement io_outstanding to ensure it
	 *  does not get double-counted.
	 */
	assert(bdev_io->ch->io_outstanding > 0);
	bdev_io->ch->io_outstanding--;
	__submit_request(new_bdev, bdev_io);
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->gencnt = bdev->gencnt;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_config_json) {
		return bdev->fn_table->dump_config_json(bdev->ctxt, w);
	}

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev		*bdev = io_device;
	struct spdk_bdev_channel	*ch = ctx_buf;

	ch->bdev = io_device;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	ch->mgmt_channel = spdk_get_io_channel(&g_bdev_mgr);
	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	return 0;
}

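/*
 * Fail every I/O on the given buffer-wait queue that belongs to this channel.
 * Used when a channel is destroyed or a reset aborts queued I/O.
 */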
static void
_spdk_bdev_abort_io(need_buf_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, buf_link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, buf_link);
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;

	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

	_spdk_bdev_abort_io(&mgmt_channel->need_buf_small, ch);
	_spdk_bdev_abort_io(&mgmt_channel->need_buf_large, ch);

	spdk_put_io_channel(ch->channel);
	spdk_put_io_channel(ch->mgmt_channel);
	assert(ch->io_outstanding == 0);
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(desc->bdev);
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

static bool
spdk_bdev_io_valid(struct spdk_bdev *bdev, uint64_t offset, uint64_t nbytes)
{
	/* Return failure if offset is not a multiple of bdev->blocklen */
	if (offset % bdev->blocklen) {
		return false;
	}

	/* Return failure if nbytes is not a multiple of bdev->blocklen */
	if (nbytes % bdev->blocklen) {
		return false;
	}

	/* Return failure if offset + nbytes is less than offset; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset + nbytes < offset) {
		return false;
	}

	/* Return failure if offset + nbytes exceeds the size of the bdev */
	if (offset + nbytes > bdev->blockcnt * bdev->blocklen) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!spdk_bdev_io_valid(bdev, offset, nbytes)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iov.iov_base = buf;
	bdev_io->u.read.iov.iov_len = nbytes;
	bdev_io->u.read.iovs = &bdev_io->u.read.iov;
	bdev_io->u.read.iovcnt = 1;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!spdk_bdev_io_valid(bdev, offset, nbytes)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during readv\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iovs = iov;
	bdev_io->u.read.iovcnt = iovcnt;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid(bdev, offset, nbytes)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iov.iov_base = buf;
	bdev_io->u.write.iov.iov_len = nbytes;
	bdev_io->u.write.iovs = &bdev_io->u.write.iov;
	bdev_io->u.write.iovcnt = 1;
	bdev_io->u.write.len = nbytes;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid(bdev, offset, len)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iovs = iov;
	bdev_io->u.write.iovcnt = iovcnt;
	bdev_io->u.write.len = len;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	int rc;
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	/* Write zeroes modifies the device, so require a write descriptor
	 * like the other write-type operations. */
	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid(bdev, offset, len)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.write.len = len;
	bdev_io->u.write.offset = offset;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid(bdev, offset, nbytes)) {
		return -EINVAL;
	}

	if (nbytes == 0) {
		SPDK_ERRLOG("Can't unmap 0 bytes\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.unmap.offset = offset;
	bdev_io->u.unmap.len = nbytes;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.flush.offset = offset;
	bdev_io->u.flush.length = length;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

static void
_spdk_bdev_reset_dev(void *io_device, void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	int rc;

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		SPDK_ERRLOG("reset failed\n");
		/*
		 * Complete the I/O without returning it to the pool here; the
		 * caller frees it via spdk_bdev_free_io() from its completion
		 * callback.  Freeing it first would be a use-after-free.
		 */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
_spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,
			       void *ctx)
{
	struct spdk_bdev_channel	*channel;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;

	channel = spdk_io_channel_get_ctx(ch);
	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);

	_spdk_bdev_abort_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_io(&mgmt_channel->need_buf_large, channel);
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	spdk_for_each_channel(bdev_io->bdev, _spdk_bdev_reset_abort_channel,
			      bdev_io, _spdk_bdev_reset_dev);
}

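/*
 * Resets are serialized: only one may be outstanding per bdev.  Additional
 * reset requests wait on queued_resets until the current one completes.
 */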
static void
_spdk_bdev_start_next_reset(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->mutex);

	if (bdev->reset_in_progress || TAILQ_EMPTY(&bdev->queued_resets)) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	} else {
		bdev_io = TAILQ_FIRST(&bdev->queued_resets);
		TAILQ_REMOVE(&bdev->queued_resets, bdev_io, link);
		bdev->reset_in_progress = true;
		thread = spdk_io_channel_get_thread(bdev_io->ch->channel);
		spdk_thread_send_msg(thread, _spdk_bdev_start_reset, bdev_io);
	}

	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&bdev->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_start_next_reset(bdev);

	return 0;
}

void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
#ifdef SPDK_CONFIG_VTUNE
	SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n");
	memset(stat, 0, sizeof(*stat));
	return;
#endif

	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	*stat = channel->stat;
	memset(&channel->stat, 0, sizeof(channel->stat));
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 *  to easily determine if the command is a read or write, but for now just
		 *  do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->cb != NULL);
	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, bdev_io->caller_ctx);
}

void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	bdev_io->status = status;

	assert(bdev_io->ch->io_outstanding > 0);
	bdev_io->ch->io_outstanding--;
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* Successful reset */
		if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			/* Increase the bdev generation */
			bdev_io->bdev->gencnt++;
		}
		bdev_io->bdev->reset_in_progress = false;
		_spdk_bdev_start_next_reset(bdev_io->bdev);
	} else {
		/*
		 * Check the gencnt, to see if this I/O was issued before the most
		 * recent reset. If the gencnt is not equal, then just free the I/O
		 * without calling the callback, since the caller will have already
		 * freed its context for this I/O.
		 */
		if (bdev_io->bdev->gencnt != bdev_io->gencnt) {
			spdk_bdev_put_io(bdev_io);
			return;
		}
	}

	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_io->ch->stat.bytes_read += bdev_io->u.read.len;
			bdev_io->ch->stat.num_read_ops++;
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_io->ch->stat.bytes_written += bdev_io->u.write.len;
			bdev_io->ch->stat.num_write_ops++;
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_io->ch->stat.num_read_ops;
		data[1] = bdev_io->ch->stat.bytes_read;
		data[2] = bdev_io->ch->stat.num_write_ops;
		data[3] = bdev_io->ch->stat.bytes_written;
		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0;

		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle,
				   __itt_metadata_u64, 5, data);

		memset(&bdev_io->ch->stat, 0, sizeof(bdev_io->ch->stat));
		bdev_io->ch->start_tsc = now_tsc;
	}
#endif

	if (bdev_io->in_submit_request || bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
	} else {
		_spdk_bdev_io_complete(bdev_io);
	}
}

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

static void
_spdk_bdev_register(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module_if *module;

	assert(bdev->module != NULL);

	bdev->status = SPDK_BDEV_STATUS_READY;

	/* initialize the reset generation value to zero */
	bdev->gencnt = 0;
	TAILQ_INIT(&bdev->open_descs);
	bdev->bdev_opened_for_write = false;

	TAILQ_INIT(&bdev->vbdevs);
	TAILQ_INIT(&bdev->base_bdevs);

	bdev->reset_in_progress = false;
	TAILQ_INIT(&bdev->queued_resets);

	spdk_io_device_register(bdev, spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

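	/*
	 * Offer the new bdev to every module with an examine callback.  Each
	 * module must call spdk_bdev_module_examine_done() once its (possibly
	 * asynchronous) examination finishes.
	 */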
	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->examine_in_progress++;
			module->examine(bdev);
		}
	}
}

void
spdk_bdev_register(struct spdk_bdev *bdev)
{
	_spdk_bdev_register(bdev);
}

void
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int i;

	_spdk_bdev_register(vbdev);
	for (i = 0; i < base_bdev_count; i++) {
		assert(base_bdevs[i] != NULL);
		TAILQ_INSERT_TAIL(&vbdev->base_bdevs, base_bdevs[i], base_bdev_link);
		TAILQ_INSERT_TAIL(&base_bdevs[i]->vbdevs, vbdev, vbdev_link);
	}
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc	*desc, *tmp;
	int			rc;
	bool			do_destruct = true;

	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);

	bdev->status = SPDK_BDEV_STATUS_REMOVING;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			pthread_mutex_unlock(&bdev->mutex);
			do_destruct = false;
			desc->remove_cb(desc->remove_ctx);
			pthread_mutex_lock(&bdev->mutex);
		}
	}

	if (!do_destruct) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	}

	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(bdev, NULL);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
}

void
spdk_vbdev_unregister(struct spdk_bdev *vbdev)
{
	struct spdk_bdev *base_bdev;

	assert(!TAILQ_EMPTY(&vbdev->base_bdevs));
	TAILQ_FOREACH(base_bdev, &vbdev->base_bdevs, base_bdev_link) {
		TAILQ_REMOVE(&base_bdev->vbdevs, vbdev, vbdev_link);
	}
	spdk_bdev_unregister(vbdev);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module_if *module)
{
	struct spdk_bdev_module_if *m;

	assert(module->examine_in_progress > 0);
	module->examine_in_progress--;

	/*
	 * Check all bdev modules for examinations in progress.  If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->examine_in_progress > 0) {
			return;
		}
	}

	if (g_bdev_mgr.module_init_complete && !g_bdev_mgr.init_complete) {
		/*
		 * Modules already finished initialization - now that all
		 * the bdev modules have finished their asynchronous I/O
		 * processing, the entire bdev layer can be marked as complete.
		 */
		spdk_bdev_init_complete(0);
	}
}

int
spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
	       void *remove_ctx, struct spdk_bdev_desc **_desc)
{
	struct spdk_bdev_desc *desc;

	desc = calloc(1, sizeof(*desc));
	if (desc == NULL) {
		return -ENOMEM;
	}

	pthread_mutex_lock(&bdev->mutex);

	if (write && (bdev->bdev_opened_for_write || bdev->claim_module)) {
		SPDK_ERRLOG("failed, %s already opened for write or claimed\n", bdev->name);
		free(desc);
		pthread_mutex_unlock(&bdev->mutex);
		return -EPERM;
	}

	TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link);

	if (write) {
		bdev->bdev_opened_for_write = true;
	}

	desc->bdev = bdev;
	desc->remove_cb = remove_cb;
	desc->remove_ctx = remove_ctx;
	desc->write = write;
	*_desc = desc;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);

	if (desc->write) {
		assert(bdev->bdev_opened_for_write);
		bdev->bdev_opened_for_write = false;
	}

	TAILQ_REMOVE(&bdev->open_descs, desc, link);
	free(desc);

	if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module_if *module)
{
	if (bdev->claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->claim_module->name);
		return -EPERM;
	}

	if ((!desc || !desc->write) && bdev->bdev_opened_for_write) {
		SPDK_ERRLOG("bdev %s already opened with write access\n", bdev->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		bdev->bdev_opened_for_write = true;
		desc->write = true;
	}

	bdev->claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->claim_module != NULL);
	bdev->claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.read.iovs;
		iovcnt = bdev_io->u.read.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.write.iovs;
		iovcnt = bdev_io->u.write.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

void
spdk_bdev_module_list_add(struct spdk_bdev_module_if *bdev_module)
{
	/*
	 * Modules with examine callbacks must be initialized first, so they are
	 *  ready to handle examine callbacks from later modules that will
	 *  register physical bdevs.
	 */
	if (bdev_module->examine != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	}
}