/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/env.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
#define BUF_SMALL_POOL_SIZE	8192
#define BUF_LARGE_POOL_SIZE	1024

typedef TAILQ_HEAD(, spdk_bdev_io) need_buf_tailq_t;

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	TAILQ_HEAD(, spdk_bdev_module_if) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	spdk_bdev_poller_start_cb start_poller_fn;
	spdk_bdev_poller_stop_cb stop_poller_fn;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.start_poller_fn = NULL,
	.stop_poller_fn = NULL,
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb	g_cb_fn = NULL;
static void			*g_cb_arg = NULL;

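/*
 * Per-thread management channel context.  Each thread keeps wait queues of
 *  bdev_ios that could not get a data buffer from the small or large buffer
 *  pool and are waiting for one to be returned.
 */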
struct spdk_bdev_mgmt_channel {
	need_buf_tailq_t need_buf_small;
	need_buf_tailq_t need_buf_large;
};

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	spdk_bdev_remove_cb_t		remove_cb;
	void				*remove_ctx;
	bool				write;
	TAILQ_ENTRY(spdk_bdev_desc)	link;
};

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Channel for the bdev manager */
	struct spdk_io_channel *mgmt_channel;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif

};

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (TAILQ_EMPTY(&bdev->vbdevs)) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}
		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

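/*
 * Attach a data buffer from one of the shared pools to a read that was
 *  waiting for one.  The iovec is pointed at a 512-byte aligned address
 *  within the raw buffer; the pools allocate an extra 512 bytes per buffer
 *  to leave room for this alignment.
 */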
static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.read.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.read.iovs[0].iov_len = bdev_io->u.read.len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

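/*
 * Release a bdev_io's data buffer.  If another I/O on this thread is queued
 *  waiting for a buffer of the same size class, hand the buffer directly to
 *  that I/O; otherwise return it to its mempool.
 */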
static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	need_buf_tailq_t *tailq;
	uint64_t length;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.read.iovcnt == 1);

	length = bdev_io->u.read.len;
	buf = bdev_io->buf;

	ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel);

	if (length <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		tailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		tailq = &ch->need_buf_large;
	}

	if (TAILQ_EMPTY(tailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = TAILQ_FIRST(tailq);
		TAILQ_REMOVE(tailq, tmp, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

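/*
 * Ensure a read has a data buffer before cb is invoked.  If the caller
 *  already supplied one, cb is called immediately; otherwise a buffer is
 *  taken from the appropriate pool, or the I/O is queued on this thread's
 *  wait queue until spdk_bdev_io_put_buf() frees one up.
 */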
void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb)
{
	uint64_t len = bdev_io->u.read.len;
	struct spdk_mempool *pool;
	need_buf_tailq_t *tailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *ch;

	assert(cb != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.read.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel);

	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		tailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		tailq = &ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		TAILQ_INSERT_TAIL(tailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	TAILQ_INIT(&ch->need_buf_small);
	TAILQ_INIT(&ch->need_buf_large);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->need_buf_small) || !TAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n");
	}
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_cb_fn;
	void *cb_arg = g_cb_arg;

	g_bdev_mgr.init_complete = true;
	g_cb_fn = NULL;
	g_cb_arg = NULL;

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_init_complete(int rc)
{
	struct spdk_bdev_module_if *m;

	g_bdev_mgr.module_init_complete = true;

	if (rc != 0) {
		spdk_bdev_init_complete(rc);
		/* Return here - init_complete already cleared the callback, so
		 * falling through would invoke it a second time through a NULL
		 * function pointer. */
		return;
	}

	/*
	 * Check all bdev modules for examinations in progress.  If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->examine_in_progress > 0) {
			return;
		}
	}

	spdk_bdev_init_complete(0);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module_if *module;
	int rc;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

void
spdk_bdev_poller_start(struct spdk_bdev_poller **ppoller,
		       spdk_bdev_poller_fn fn,
		       void *arg,
		       uint32_t lcore,
		       uint64_t period_microseconds)
{
	g_bdev_mgr.start_poller_fn(ppoller, fn, arg, lcore, period_microseconds);
}

void
spdk_bdev_poller_stop(struct spdk_bdev_poller **ppoller)
{
	g_bdev_mgr.stop_poller_fn(ppoller);
}

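/*
 * Initialize the bdev layer: allocate the global bdev_io pool (sized to fit
 *  the largest per-I/O context any registered module requires) plus the small
 *  and large data buffer pools, register the management io_device, then run
 *  each module's module_init().  Completion is reported asynchronously via
 *  cb_fn once all module examinations have also finished.
 */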
void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg,
		     spdk_bdev_poller_start_cb start_poller_fn,
		     spdk_bdev_poller_stop_cb stop_poller_fn)
{
	int cache_size;
	int rc = 0;

	assert(cb_fn != NULL);

	g_cb_fn = cb_fn;
	g_cb_arg = cb_arg;

	g_bdev_mgr.start_poller_fn = start_poller_fn;
	g_bdev_mgr.stop_poller_fn = stop_poller_fn;

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create("bdev_io",
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  64,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 *   using spdk_env_get_core_count() to determine how many local caches we need
	 *   to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	g_bdev_mgr.buf_small_pool = spdk_mempool_create("buf_small_pool",
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	g_bdev_mgr.buf_large_pool = spdk_mempool_create("buf_large_pool",
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_module_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	spdk_bdev_module_init_complete(rc);
}

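/*
 * Tear down the bdev layer.  The pool counts are checked against their
 *  original sizes so that leaked bdev_ios or data buffers are reported
 *  before the pools are freed.
 */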
int
spdk_bdev_finish(void)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);

	spdk_io_device_unregister(&g_bdev_mgr, NULL);

	return 0;
}

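/*
 * Allocate a zeroed spdk_bdev_io from the global pool.  Pool exhaustion is
 *  treated as fatal here; callers still check for NULL defensively.
 */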
struct spdk_bdev_io *
spdk_bdev_get_io(void)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	if (!bdev_io) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		abort();
	}

	memset(bdev_io, 0, sizeof(*bdev_io));

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		return;
	}

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
}

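/*
 * Pass an I/O down to the bdev module.  io_outstanding is incremented first,
 *  and in_submit_request is set so that a completion occurring synchronously
 *  inside submit_request() can be detected and deferred by
 *  spdk_bdev_io_complete().
 */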
static void
__submit_request(struct spdk_bdev *bdev, struct spdk_bdev_io *bdev_io)
{
	struct spdk_io_channel *ch;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->ch->channel;

	bdev_io->ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static int
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	__submit_request(bdev, bdev_io);
	return 0;
}

void
spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev_desc *new_bdev_desc)
{
	struct spdk_bdev *new_bdev = new_bdev_desc->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
	bdev_io->bdev = new_bdev;

	/*
	 * These fields are normally set during spdk_bdev_io_init(), but since bdev is
	 * being switched, they need to be reinitialized.
	 */
	bdev_io->gencnt = new_bdev->gencnt;

	/*
	 * This bdev_io was already submitted so decrement io_outstanding to ensure it
	 *  does not get double-counted.
	 */
	assert(bdev_io->ch->io_outstanding > 0);
	bdev_io->ch->io_outstanding--;
	__submit_request(new_bdev, bdev_io);
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->gencnt = bdev->gencnt;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_config_json) {
		return bdev->fn_table->dump_config_json(bdev->ctxt, w);
	}

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev		*bdev = io_device;
	struct spdk_bdev_channel	*ch = ctx_buf;

	ch->bdev = io_device;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	ch->mgmt_channel = spdk_get_io_channel(&g_bdev_mgr);
	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	return 0;
}

static void
_spdk_bdev_abort_io(need_buf_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, buf_link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, buf_link);
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;

	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

	_spdk_bdev_abort_io(&mgmt_channel->need_buf_small, ch);
	_spdk_bdev_abort_io(&mgmt_channel->need_buf_large, ch);

	spdk_put_io_channel(ch->channel);
	spdk_put_io_channel(ch->mgmt_channel);
	assert(ch->io_outstanding == 0);
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(desc->bdev);
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

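/*
 * Validate an I/O range: the length must be a multiple of the block size,
 *  offset + nbytes must not wrap around, and the range must fall entirely
 *  within the device.
 */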
static int
spdk_bdev_io_valid(struct spdk_bdev *bdev, uint64_t offset, uint64_t nbytes)
{
	/* Return failure if nbytes is not a multiple of bdev->blocklen */
	if (nbytes % bdev->blocklen) {
		return -1;
	}

	/* Return failure if offset + nbytes is less than offset; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset + nbytes < offset) {
		return -1;
	}

	/* Return failure if offset + nbytes exceeds the size of the bdev */
	if (offset + nbytes > bdev->blockcnt * bdev->blocklen) {
		return -1;
	}

	return 0;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iov.iov_base = buf;
	bdev_io->u.read.iov.iov_len = nbytes;
	bdev_io->u.read.iovs = &bdev_io->u.read.iov;
	bdev_io->u.read.iovcnt = 1;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during readv\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iovs = iov;
	bdev_io->u.read.iovcnt = iovcnt;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iov.iov_base = buf;
	bdev_io->u.write.iov.iov_len = nbytes;
	bdev_io->u.write.iovs = &bdev_io->u.write.iov;
	bdev_io->u.write.iovcnt = 1;
	bdev_io->u.write.len = nbytes;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (spdk_bdev_io_valid(bdev, offset, len) != 0) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iovs = iov;
	bdev_io->u.write.iovcnt = iovcnt;
	bdev_io->u.write.len = len;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return -EINVAL;
	}

	if (nbytes == 0) {
		SPDK_ERRLOG("Can't unmap 0 bytes\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.unmap.offset = offset;
	bdev_io->u.unmap.len = nbytes;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.flush.offset = offset;
	bdev_io->u.flush.length = length;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

static void
_spdk_bdev_reset_dev(void *io_device, void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	int rc;

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		SPDK_ERRLOG("reset failed\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
_spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,
			       void *ctx)
{
	struct spdk_bdev_channel	*channel;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;

	channel = spdk_io_channel_get_ctx(ch);
	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);

	_spdk_bdev_abort_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_io(&mgmt_channel->need_buf_large, channel);
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	spdk_for_each_channel(bdev_io->bdev, _spdk_bdev_reset_abort_channel,
			      bdev_io, _spdk_bdev_reset_dev);
}

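/*
 * Resets are serialized: only one may be outstanding per bdev.  Pop the next
 *  queued reset, mark the bdev as resetting, and send the I/O to the thread
 *  that owns its channel for submission.
 */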
static void
_spdk_bdev_start_next_reset(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->mutex);

	if (bdev->reset_in_progress || TAILQ_EMPTY(&bdev->queued_resets)) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	} else {
		bdev_io = TAILQ_FIRST(&bdev->queued_resets);
		TAILQ_REMOVE(&bdev->queued_resets, bdev_io, link);
		bdev->reset_in_progress = true;
		thread = spdk_io_channel_get_thread(bdev_io->ch->channel);
		spdk_thread_send_msg(thread, _spdk_bdev_start_reset, bdev_io);
	}

	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&bdev->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_start_next_reset(bdev);

	return 0;
}

void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
#ifdef SPDK_CONFIG_VTUNE
	SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n");
	memset(stat, 0, sizeof(*stat));
	return;
#endif

	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	*stat = channel->stat;
	memset(&channel->stat, 0, sizeof(channel->stat));
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 *  to easily determine if the command is a read or write, but for now just
		 *  do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return rc;
	}

	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->cb != NULL);
	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, bdev_io->caller_ctx);
}

void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	bdev_io->status = status;

	assert(bdev_io->ch->io_outstanding > 0);
	bdev_io->ch->io_outstanding--;
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* Successful reset */
		if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			/* Increase the bdev generation */
			bdev_io->bdev->gencnt++;
		}
		bdev_io->bdev->reset_in_progress = false;
		_spdk_bdev_start_next_reset(bdev_io->bdev);
	} else {
		/*
		 * Check the gencnt, to see if this I/O was issued before the most
		 * recent reset. If the gencnt is not equal, then just free the I/O
		 * without calling the callback, since the caller will have already
		 * freed its context for this I/O.
		 */
		if (bdev_io->bdev->gencnt != bdev_io->gencnt) {
			spdk_bdev_put_io(bdev_io);
			return;
		}
	}

	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_io->ch->stat.bytes_read += bdev_io->u.read.len;
			bdev_io->ch->stat.num_read_ops++;
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_io->ch->stat.bytes_written += bdev_io->u.write.len;
			bdev_io->ch->stat.num_write_ops++;
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_io->ch->stat.num_read_ops;
		data[1] = bdev_io->ch->stat.bytes_read;
		data[2] = bdev_io->ch->stat.num_write_ops;
		data[3] = bdev_io->ch->stat.bytes_written;
		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0;

		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle,
				   __itt_metadata_u64, 5, data);

		memset(&bdev_io->ch->stat, 0, sizeof(bdev_io->ch->stat));
		bdev_io->ch->start_tsc = now_tsc;
	}
#endif

	if (bdev_io->in_submit_request || bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
	} else {
		_spdk_bdev_io_complete(bdev_io);
	}
}

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

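/*
 * Common registration path for bdevs and vbdevs: initialize bookkeeping,
 *  register the bdev as an io_device, add it to the global list, and give
 *  every module with an examine callback a chance to inspect the new bdev.
 */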
static void
_spdk_bdev_register(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module_if *module;

	assert(bdev->module != NULL);

	bdev->status = SPDK_BDEV_STATUS_READY;

	/* initialize the reset generation value to zero */
	bdev->gencnt = 0;
	TAILQ_INIT(&bdev->open_descs);
	bdev->bdev_opened_for_write = false;

	TAILQ_INIT(&bdev->vbdevs);
	TAILQ_INIT(&bdev->base_bdevs);

	bdev->reset_in_progress = false;
	TAILQ_INIT(&bdev->queued_resets);

	spdk_io_device_register(bdev, spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->examine_in_progress++;
			module->examine(bdev);
		}
	}
}

void
spdk_bdev_register(struct spdk_bdev *bdev)
{
	_spdk_bdev_register(bdev);
}

void
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int i;

	_spdk_bdev_register(vbdev);
	for (i = 0; i < base_bdev_count; i++) {
		assert(base_bdevs[i] != NULL);
		TAILQ_INSERT_TAIL(&vbdev->base_bdevs, base_bdevs[i], base_bdev_link);
		TAILQ_INSERT_TAIL(&base_bdevs[i]->vbdevs, vbdev, vbdev_link);
	}
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc	*desc, *tmp;
	int			rc;
	bool			do_destruct = true;

	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);

	bdev->status = SPDK_BDEV_STATUS_REMOVING;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			pthread_mutex_unlock(&bdev->mutex);
			do_destruct = false;
			desc->remove_cb(desc->remove_ctx);
			pthread_mutex_lock(&bdev->mutex);
		}
	}

	if (!do_destruct) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	}

	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(bdev, NULL);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
}

void
spdk_vbdev_unregister(struct spdk_bdev *vbdev)
{
	struct spdk_bdev *base_bdev;

	assert(!TAILQ_EMPTY(&vbdev->base_bdevs));
	TAILQ_FOREACH(base_bdev, &vbdev->base_bdevs, base_bdev_link) {
		TAILQ_REMOVE(&base_bdev->vbdevs, vbdev, vbdev_link);
	}
	spdk_bdev_unregister(vbdev);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module_if *module)
{
	struct spdk_bdev_module_if *m;

	assert(module->examine_in_progress > 0);
	module->examine_in_progress--;

	/*
	 * Check all bdev modules for examinations in progress.  If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->examine_in_progress > 0) {
			return;
		}
	}

	if (g_bdev_mgr.module_init_complete && !g_bdev_mgr.init_complete) {
		/*
		 * Modules have already finished initialization - now that all
		 * the bdev modules have finished their asynchronous I/O
		 * processing, the entire bdev layer can be marked as complete.
		 */
		spdk_bdev_init_complete(0);
	}
}

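/*
 * Open a descriptor on a bdev.  Only one descriptor may have write access
 *  at a time, and a bdev claimed by a module cannot be opened for write.
 *  remove_cb is invoked if the bdev is hot-removed while the descriptor is
 *  still open.
 */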
int
spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
	       void *remove_ctx, struct spdk_bdev_desc **_desc)
{
	struct spdk_bdev_desc *desc;

	desc = calloc(1, sizeof(*desc));
	if (desc == NULL) {
		return -ENOMEM;
	}

	pthread_mutex_lock(&bdev->mutex);

	if (write && (bdev->bdev_opened_for_write || bdev->claim_module)) {
		SPDK_ERRLOG("failed, %s already opened for write or claimed\n", bdev->name);
		free(desc);
		pthread_mutex_unlock(&bdev->mutex);
		return -EPERM;
	}

	TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link);

	if (write) {
		bdev->bdev_opened_for_write = true;
	}

	desc->bdev = bdev;
	desc->remove_cb = remove_cb;
	desc->remove_ctx = remove_ctx;
	desc->write = write;
	*_desc = desc;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);

	if (desc->write) {
		assert(bdev->bdev_opened_for_write);
		bdev->bdev_opened_for_write = false;
	}

	TAILQ_REMOVE(&bdev->open_descs, desc, link);
	free(desc);

	if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module_if *module)
{
	if (bdev->claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->claim_module->name);
		return -EPERM;
	}

	if ((!desc || !desc->write) && bdev->bdev_opened_for_write) {
		SPDK_ERRLOG("bdev %s already opened with write access\n", bdev->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		bdev->bdev_opened_for_write = true;
		desc->write = true;
	}

	bdev->claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->claim_module != NULL);
	bdev->claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.read.iovs;
		iovcnt = bdev_io->u.read.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.write.iovs;
		iovcnt = bdev_io->u.write.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

void
spdk_bdev_module_list_add(struct spdk_bdev_module_if *bdev_module)
{
	/*
	 * Modules with examine callbacks must be initialized first, so they are
	 *  ready to handle examine callbacks from later modules that will
	 *  register physical bdevs.
	 */
	if (bdev_module->examine != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	}
}
1654