/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include <rte_config.h>
#include <rte_lcore.h>
#include "spdk/env.h"
#include "spdk/io_channel.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/event.h"
#include "spdk_internal/log.h"

#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
#define BUF_SMALL_POOL_SIZE	8192
#define BUF_LARGE_POOL_SIZE	1024

static struct spdk_mempool *spdk_bdev_g_io_pool = NULL;
static struct spdk_mempool *g_buf_small_pool = NULL;
static struct spdk_mempool *g_buf_large_pool = NULL;

typedef TAILQ_HEAD(, spdk_bdev_io) need_buf_tailq_t;
static need_buf_tailq_t g_need_buf_small[RTE_MAX_LCORE];
static need_buf_tailq_t g_need_buf_large[RTE_MAX_LCORE];

static TAILQ_HEAD(, spdk_bdev_module_if) spdk_bdev_module_list =
	TAILQ_HEAD_INITIALIZER(spdk_bdev_module_list);
static TAILQ_HEAD(, spdk_bdev_module_if) spdk_vbdev_module_list =
	TAILQ_HEAD_INITIALIZER(spdk_vbdev_module_list);

static TAILQ_HEAD(, spdk_bdev) spdk_bdev_list =
	TAILQ_HEAD_INITIALIZER(spdk_bdev_list);

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;
};

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&spdk_bdev_list);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strncmp(bdev_name, bdev->name, sizeof(bdev->name)) == 0) {
			return bdev;
		}
		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}
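
/*
 * Example usage (illustrative sketch, not part of this file): iterating the
 * global bdev list and looking a device up by name.  "Malloc0" is only a
 * placeholder name.
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
 *		printf("found bdev %s\n", bdev->name);
 *	}
 *
 *	bdev = spdk_bdev_get_by_name("Malloc0");
 *	if (bdev == NULL) {
 *		// handle missing device
 *	}
 */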

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	bdev_io->u.read.buf_unaligned = buf;
	/* Buffers are allocated with 512 bytes of slack (see
	 *  spdk_initialize_buf_pool()), so the data pointer can be rounded up
	 *  to the next 512-byte boundary here. */
	bdev_io->u.read.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.read.iovs[0].iov_len = bdev_io->u.read.len;
	bdev_io->u.read.put_buf = true;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	need_buf_tailq_t *tailq;
	uint64_t length;

	assert(bdev_io->u.read.iovcnt == 1);

	length = bdev_io->u.read.len;
	buf = bdev_io->u.read.buf_unaligned;

	if (length <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_buf_small_pool;
		tailq = &g_need_buf_small[rte_lcore_id()];
	} else {
		pool = g_buf_large_pool;
		tailq = &g_need_buf_large[rte_lcore_id()];
	}

	if (TAILQ_EMPTY(tailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		/* Hand the buffer directly to the oldest I/O still waiting for one. */
		tmp = TAILQ_FIRST(tailq);
		TAILQ_REMOVE(tailq, tmp, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

static int
spdk_initialize_buf_pool(void)
{
	int cache_size;

	/**
	 * Ensure no more than half of the total buffers end up in per-core local
	 *   caches, using spdk_env_get_core_count() to determine how many local
	 *   caches we need to account for.
	 */
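	/*
	 * For example, with BUF_SMALL_POOL_SIZE of 8192 and 4 cores, cache_size
	 * is 8192 / (2 * 4) = 1024, so the four local caches together hold at
	 * most 4096 buffers, half of the pool.
	 */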
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	g_buf_small_pool = spdk_mempool_create("buf_small_pool",
					       BUF_SMALL_POOL_SIZE,
					       SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
					       cache_size,
					       SPDK_ENV_SOCKET_ID_ANY);
	if (!g_buf_small_pool) {
		SPDK_ERRLOG("create buf small pool failed\n");
		return -1;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	g_buf_large_pool = spdk_mempool_create("buf_large_pool",
					       BUF_LARGE_POOL_SIZE,
					       SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
					       cache_size,
					       SPDK_ENV_SOCKET_ID_ANY);
	if (!g_buf_large_pool) {
		SPDK_ERRLOG("create buf large pool failed\n");
		return -1;
	}

	return 0;
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static int
spdk_bdev_module_initialize(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int rc = 0;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		rc = bdev_module->module_init();
		if (rc) {
			return rc;
		}
	}
	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		rc = bdev_module->module_init();
		if (rc) {
			return rc;
		}
	}
	return rc;
}

static void
spdk_bdev_module_finish(void)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}
}

static void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static int
spdk_bdev_initialize(void)
{
	int i;

	if (spdk_bdev_module_initialize()) {
		SPDK_ERRLOG("bdev module initialize failed\n");
		return -1;
	}

	spdk_bdev_g_io_pool = spdk_mempool_create("blockdev_io",
			      SPDK_BDEV_IO_POOL_SIZE,
			      sizeof(struct spdk_bdev_io) +
			      spdk_bdev_module_get_max_ctx_size(),
			      64,
			      SPDK_ENV_SOCKET_ID_ANY);

	if (spdk_bdev_g_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		return -1;
	}

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		TAILQ_INIT(&g_need_buf_small[i]);
		TAILQ_INIT(&g_need_buf_large[i]);
	}

	return spdk_initialize_buf_pool();
}

static int
spdk_bdev_check_pool(struct spdk_mempool *pool, uint32_t count)
{
	if (spdk_mempool_count(pool) != count) {
		SPDK_ERRLOG("spdk_mempool_count(%p) == %zu, should be %u\n",
			    pool, spdk_mempool_count(pool), count);
		return -1;
	} else {
		return 0;
	}
}

static int
spdk_bdev_finish(void)
{
	int rc = 0;

	spdk_bdev_module_finish();

	rc += spdk_bdev_check_pool(g_buf_small_pool, BUF_SMALL_POOL_SIZE);
	rc += spdk_bdev_check_pool(g_buf_large_pool, BUF_LARGE_POOL_SIZE);

	return (rc != 0);
}

struct spdk_bdev_io *
spdk_bdev_get_io(void)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = spdk_mempool_get(spdk_bdev_g_io_pool);
	if (!bdev_io) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		abort();
	}

	memset(bdev_io, 0, sizeof(*bdev_io));

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		return;
	}

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && bdev_io->u.read.put_buf) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	spdk_mempool_put(spdk_bdev_g_io_pool, (void *)bdev_io);
}

static void
_spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io)
{
	uint64_t len = bdev_io->u.read.len;
	struct spdk_mempool *pool;
	need_buf_tailq_t *tailq;
	void *buf = NULL;

	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_buf_small_pool;
		tailq = &g_need_buf_small[rte_lcore_id()];
	} else {
		pool = g_buf_large_pool;
		tailq = &g_need_buf_large[rte_lcore_id()];
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		/* The pool is empty; queue this I/O until another I/O returns
		 *  a buffer in spdk_bdev_io_put_buf(). */
		TAILQ_INSERT_TAIL(tailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}

static void
spdk_bdev_cleanup_pending_buf_io(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, &g_need_buf_small[rte_lcore_id()], buf_link, tmp) {
		if (bdev_io->bdev == bdev) {
			TAILQ_REMOVE(&g_need_buf_small[rte_lcore_id()], bdev_io, buf_link);
			bdev_io->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	TAILQ_FOREACH_SAFE(bdev_io, &g_need_buf_large[rte_lcore_id()], buf_link, tmp) {
		if (bdev_io->bdev == bdev) {
			TAILQ_REMOVE(&g_need_buf_large[rte_lcore_id()], bdev_io, buf_link);
			bdev_io->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}
}

static void
__submit_request(struct spdk_bdev *bdev, struct spdk_bdev_io *bdev_io)
{
	struct spdk_io_channel *ch;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		spdk_bdev_cleanup_pending_buf_io(bdev);
		ch = NULL;
	} else {
		ch = bdev_io->ch->channel;
	}

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static int
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	__submit_request(bdev, bdev_io);
	return 0;
}

void
spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev *new_bdev)
{
	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
	bdev_io->bdev = new_bdev;

	/*
	 * These fields are normally set during spdk_bdev_io_init(), but since bdev is
	 * being switched, they need to be reinitialized.
	 */
	bdev_io->gencnt = new_bdev->gencnt;

	__submit_request(new_bdev, bdev_io);
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->gencnt = bdev->gencnt;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	TAILQ_INIT(&bdev_io->child_io);
}

struct spdk_bdev_io *
spdk_bdev_get_child_io(struct spdk_bdev_io *parent,
		       struct spdk_bdev *bdev,
		       spdk_bdev_io_completion_cb cb,
		       void *cb_arg)
{
	struct spdk_bdev_io *child;

	child = spdk_bdev_get_io();
	if (!child) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		return NULL;
	}

	if (cb_arg == NULL) {
		cb_arg = child;
	}

	spdk_bdev_io_init(child, bdev, cb_arg, cb);

	child->type = parent->type;
	memcpy(&child->u, &parent->u, sizeof(child->u));
	if (child->type == SPDK_BDEV_IO_TYPE_READ) {
		child->u.read.put_buf = false;
	}
	child->get_buf_cb = NULL;
	child->parent = parent;

	TAILQ_INSERT_TAIL(&parent->child_io, child, link);

	return child;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_config_json) {
		return bdev->fn_table->dump_config_json(bdev->ctxt, w);
	}

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, uint32_t priority, void *ctx_buf,
			 void *unique_ctx)
{
	struct spdk_bdev		*bdev = io_device;
	struct spdk_bdev_channel	*ch = ctx_buf;

	ch->bdev = io_device;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt, priority);

	return 0;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;

	spdk_put_io_channel(ch->channel);
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev *bdev, uint32_t priority)
{
	return spdk_get_io_channel(bdev, priority, false, NULL);
}

static int
spdk_bdev_io_valid(struct spdk_bdev *bdev, uint64_t offset, uint64_t nbytes)
{
	/* Return failure if nbytes is not a multiple of bdev->blocklen */
	if (nbytes % bdev->blocklen) {
		return -1;
	}

	/* Return failure if offset + nbytes is less than offset; this indicates
	 * the addition overflowed and the value wrapped around */
	if (offset + nbytes < offset) {
		return -1;
	}

	/* Return failure if offset + nbytes exceeds the size of the blockdev */
	if (offset + nbytes > bdev->blockcnt * bdev->blocklen) {
		return -1;
	}

	return 0;
}
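
/*
 * For example, with offset = UINT64_MAX - 511 and nbytes = 1024, the sum
 * offset + nbytes wraps around to 512, which is less than offset, so the
 * overflow check above rejects the request.
 */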

struct spdk_bdev_io *
spdk_bdev_read(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iov.iov_base = buf;
	bdev_io->u.read.iov.iov_len = nbytes;
	bdev_io->u.read.iovs = &bdev_io->u.read.iov;
	bdev_io->u.read.iovcnt = 1;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	bdev_io->u.read.put_buf = false;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}
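
/*
 * Example usage (illustrative sketch, not part of this file): issuing a read
 * and releasing the I/O from the completion callback. read_done, my_buf, ch
 * and bdev are caller-defined placeholders.
 *
 *	static void
 *	read_done(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
 *		  void *cb_arg)
 *	{
 *		if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
 *			// handle the error
 *		}
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	...
 *	if (spdk_bdev_read(bdev, ch, my_buf, 0, 4096, read_done, NULL) == NULL) {
 *		// submission failed
 *	}
 */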

struct spdk_bdev_io *
spdk_bdev_readv(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during readv\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iovs = iov;
	bdev_io->u.read.iovcnt = iovcnt;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	bdev_io->u.read.put_buf = false;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_write(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iov.iov_base = buf;
	bdev_io->u.write.iov.iov_len = nbytes;
	bdev_io->u.write.iovs = &bdev_io->u.write.iov;
	bdev_io->u.write.iovcnt = 1;
	bdev_io->u.write.len = nbytes;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_writev(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, len) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iovs = iov;
	bdev_io->u.write.iovcnt = iovcnt;
	bdev_io->u.write.len = len;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}
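
/*
 * Example usage (illustrative sketch): writing a two-element scatter list as
 * a single I/O. buf_a, buf_b and write_done are caller-defined placeholders;
 * note that len (4096 here) must equal the sum of the iov_len fields and be
 * a multiple of the block size.
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = buf_a, .iov_len = 2048 },
 *		{ .iov_base = buf_b, .iov_len = 2048 },
 *	};
 *
 *	if (spdk_bdev_writev(bdev, ch, iov, 2, 0, 4096, write_done, NULL) == NULL) {
 *		// submission failed
 *	}
 */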

struct spdk_bdev_io *
spdk_bdev_unmap(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		struct spdk_scsi_unmap_bdesc *unmap_d,
		uint16_t bdesc_count,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (bdesc_count == 0) {
		SPDK_ERRLOG("Invalid bdesc_count 0\n");
		return NULL;
	}

	if (bdesc_count > bdev->max_unmap_bdesc_count) {
		SPDK_ERRLOG("Invalid bdesc_count %u > max_unmap_bdesc_count %u\n",
			    bdesc_count, bdev->max_unmap_bdesc_count);
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.unmap.unmap_bdesc = unmap_d;
	bdev_io->u.unmap.bdesc_count = bdesc_count;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_flush(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return NULL;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.flush.offset = offset;
	bdev_io->u.flush.length = length;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

int
spdk_bdev_reset(struct spdk_bdev *bdev, enum spdk_bdev_reset_type reset_type,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -1;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.type = reset_type;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		SPDK_ERRLOG("reset failed\n");
	}

	return rc;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_io *child_io, *tmp;

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	TAILQ_FOREACH_SAFE(child_io, &bdev_io->child_io, link, tmp) {
		/*
		 * Make sure no references to the parent I/O remain, since it is being
		 * returned to the free pool.
		 */
		child_io->parent = NULL;
		TAILQ_REMOVE(&bdev_io->child_io, child_io, link);

		/*
		 * Child I/O may have a buf that needs to be returned to a pool
		 *  on a different core, so free it through the request submission
		 *  process rather than calling put_io directly here.
		 */
		spdk_bdev_free_io(child_io);
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
bdev_io_deferred_completion(void *arg1, void *arg2)
{
	struct spdk_bdev_io *bdev_io = arg1;
	enum spdk_bdev_io_status status = (enum spdk_bdev_io_status)arg2;

	assert(bdev_io->in_submit_request == false);

	spdk_bdev_io_complete(bdev_io, status);
}

void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	if (bdev_io->in_submit_request) {
		/*
		 * Defer completion via an event to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_event_call(spdk_event_allocate(spdk_env_get_current_core(),
						    bdev_io_deferred_completion,
						    bdev_io,
						    (void *)status));
		return;
	}

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* Successful reset */
		if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			/* Increase the blockdev generation if it is a hard reset */
			if (bdev_io->u.reset.type == SPDK_BDEV_RESET_HARD) {
				bdev_io->bdev->gencnt++;
			}
		}
	} else {
		/*
		 * Check the gencnt, to see if this I/O was issued before the most
		 * recent reset. If the gencnt is not equal, then just free the I/O
		 * without calling the callback, since the caller will have already
		 * freed its context for this I/O.
		 */
		if (bdev_io->bdev->gencnt != bdev_io->gencnt) {
			spdk_bdev_put_io(bdev_io);
			return;
		}
	}

	bdev_io->status = status;

	assert(bdev_io->cb != NULL);
	bdev_io->cb(bdev_io, status, bdev_io->caller_ctx);
}
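
/*
 * Example (illustrative sketch, not part of this file): a backend's
 * submit_request handler failing an unsupported I/O type. Because
 * __submit_request() sets in_submit_request around the fn_table call, a
 * completion made from this context is automatically deferred via an event
 * instead of recursing into the caller's completion callback.
 *
 *	static void
 *	my_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		switch (bdev_io->type) {
 *		...
 *		default:
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			break;
 *		}
 *	}
 */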

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

void
spdk_bdev_register(struct spdk_bdev *bdev)
{
	/* initialize the reset generation value to zero */
	bdev->gencnt = 0;

	spdk_io_device_register(bdev, spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&spdk_bdev_list, bdev, link);
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev)
{
	int			rc;

	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);
	assert(bdev->status == SPDK_BDEV_STATUS_CLAIMED || bdev->status == SPDK_BDEV_STATUS_UNCLAIMED);
	if (bdev->status == SPDK_BDEV_STATUS_CLAIMED) {
		if (bdev->remove_cb) {
			bdev->status = SPDK_BDEV_STATUS_REMOVING;
			pthread_mutex_unlock(&bdev->mutex);
			bdev->remove_cb(bdev->remove_ctx);
			return;
		} else {
			bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
		}
	}

	TAILQ_REMOVE(&spdk_bdev_list, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(bdev);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
}

bool
spdk_bdev_claim(struct spdk_bdev *bdev, spdk_bdev_remove_cb_t remove_cb,
		void *remove_ctx)
{
	bool success;

	pthread_mutex_lock(&bdev->mutex);

	if (bdev->status != SPDK_BDEV_STATUS_CLAIMED) {
		/* Take ownership of bdev. */
		bdev->remove_cb = remove_cb;
		bdev->remove_ctx = remove_ctx;
		bdev->status = SPDK_BDEV_STATUS_CLAIMED;
		success = true;
	} else {
		/* bdev is already claimed. */
		success = false;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return success;
}

void
spdk_bdev_unclaim(struct spdk_bdev *bdev)
{
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);
	assert(bdev->status == SPDK_BDEV_STATUS_CLAIMED || bdev->status == SPDK_BDEV_STATUS_REMOVING);
	if (bdev->status == SPDK_BDEV_STATUS_REMOVING) {
		do_unregister = true;
	}
	bdev->remove_cb = NULL;
	bdev->remove_ctx = NULL;
	bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev);
	}
}
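
/*
 * Example usage (illustrative sketch): claiming a bdev before submitting I/O
 * to it and releasing it afterwards. hot_remove_cb and my_ctx are
 * caller-defined placeholders; the callback runs if the device is
 * unregistered while still claimed.
 *
 *	if (!spdk_bdev_claim(bdev, hot_remove_cb, my_ctx)) {
 *		// bdev is already owned by another consumer
 *	}
 *	...
 *	spdk_bdev_unclaim(bdev);
 */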

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb)
{
	assert(cb != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	if (bdev_io->u.read.iovs[0].iov_base == NULL) {
		bdev_io->get_buf_cb = cb;
		_spdk_bdev_io_get_buf(bdev_io);
	} else {
		cb(bdev_io->ch->channel, bdev_io);
	}
}
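
/*
 * Example (illustrative sketch): a backend that needs a data buffer before it
 * can service a read calls spdk_bdev_io_get_buf() from its submit_request
 * handler. my_start_read is a placeholder continuation with the
 * spdk_bdev_io_get_buf_cb signature; it receives the channel and the bdev_io
 * once iovs[0] points at a valid buffer.
 *
 *	case SPDK_BDEV_IO_TYPE_READ:
 *		spdk_bdev_io_get_buf(bdev_io, my_start_read);
 *		break;
 */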

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.read.iovs;
		iovcnt = bdev_io->u.read.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.write.iovs;
		iovcnt = bdev_io->u.write.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

void
spdk_bdev_module_list_add(struct spdk_bdev_module_if *bdev_module)
{
	TAILQ_INSERT_TAIL(&spdk_bdev_module_list, bdev_module, tailq);
}

void
spdk_vbdev_module_list_add(struct spdk_bdev_module_if *vbdev_module)
{
	TAILQ_INSERT_TAIL(&spdk_vbdev_module_list, vbdev_module, tailq);
}
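
/*
 * Example (illustrative sketch): the general shape of a backend module
 * registration. The struct initializer is abbreviated; real modules fill in
 * the spdk_bdev_module_if callbacks used above (module_init, module_fini,
 * config_text, get_ctx_size) and add themselves from a constructor so they
 * are on the list before spdk_bdev_initialize() runs.
 *
 *	static struct spdk_bdev_module_if my_bdev_module = {
 *		.module_init	= my_module_init,
 *		.module_fini	= my_module_fini,
 *		.config_text	= my_config_text,
 *		.get_ctx_size	= my_get_ctx_size,
 *	};
 *
 *	__attribute__((constructor)) static void
 *	my_module_register(void)
 *	{
 *		spdk_bdev_module_list_add(&my_bdev_module);
 *	}
 */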

SPDK_SUBSYSTEM_REGISTER(bdev, spdk_bdev_initialize, spdk_bdev_finish, spdk_bdev_config_text)
SPDK_SUBSYSTEM_DEPEND(bdev, copy)