xref: /spdk/lib/bdev/bdev.c (revision b961d9cc12de49251d135307eaa05ec0fc9dd2fa)
/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include <rte_config.h>
#include <rte_mempool.h>
#include <rte_version.h>

#include "spdk/env.h"
#include "spdk/io_channel.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/event.h"
#include "spdk_internal/log.h"

#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
#define RBUF_SMALL_POOL_SIZE	8192
#define RBUF_LARGE_POOL_SIZE	1024

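/*
 * Read buffer ("rbuf") pools: read I/Os submitted without a data buffer are
 * assigned one from these pools in spdk_bdev_io_get_rbuf(). If the pool for
 * the request's size class is empty, the I/O is parked on the current
 * lcore's wait queue and resumed when another I/O returns its buffer.
 */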
static struct rte_mempool *spdk_bdev_g_io_pool = NULL;
static struct rte_mempool *g_rbuf_small_pool = NULL;
static struct rte_mempool *g_rbuf_large_pool = NULL;

typedef TAILQ_HEAD(, spdk_bdev_io) need_rbuf_tailq_t;
static need_rbuf_tailq_t g_need_rbuf_small[RTE_MAX_LCORE];
static need_rbuf_tailq_t g_need_rbuf_large[RTE_MAX_LCORE];

static TAILQ_HEAD(, spdk_bdev_module_if) spdk_bdev_module_list =
	TAILQ_HEAD_INITIALIZER(spdk_bdev_module_list);
static TAILQ_HEAD(, spdk_bdev_module_if) spdk_vbdev_module_list =
	TAILQ_HEAD_INITIALIZER(spdk_vbdev_module_list);

static TAILQ_HEAD(, spdk_bdev) spdk_bdev_list =
	TAILQ_HEAD_INITIALIZER(spdk_bdev_list);

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;
};

struct spdk_bdev *spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&spdk_bdev_list);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strncmp(bdev_name, bdev->name, sizeof(bdev->name)) == 0) {
			return bdev;
		}
		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_rbuf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_rbuf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	bdev_io->u.read.buf_unaligned = buf;
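	/*
	 * Pool elements are sized SPDK_BDEV_*_RBUF_MAX_SIZE + 512 bytes, so
	 * rounding (buf + 512) down to a 512-byte boundary always yields a
	 * 512-byte aligned pointer with room for the full transfer length.
	 */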
	bdev_io->u.read.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.read.iovs[0].iov_len = bdev_io->u.read.len;
	bdev_io->u.read.put_rbuf = true;
	bdev_io->get_rbuf_cb(bdev_io);
}

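/*
 * Return an rbuf to its pool, or hand it directly to the first I/O waiting
 * on this lcore for a buffer of the same size class.
 */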
static void
spdk_bdev_io_put_rbuf(struct spdk_bdev_io *bdev_io)
{
	struct rte_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	need_rbuf_tailq_t *tailq;
	uint64_t length;

	assert(bdev_io->u.read.iovcnt == 1);

	length = bdev_io->u.read.len;
	buf = bdev_io->u.read.buf_unaligned;

	if (length <= SPDK_BDEV_SMALL_RBUF_MAX_SIZE) {
		pool = g_rbuf_small_pool;
		tailq = &g_need_rbuf_small[rte_lcore_id()];
	} else {
		pool = g_rbuf_large_pool;
		tailq = &g_need_rbuf_large[rte_lcore_id()];
	}

	if (TAILQ_EMPTY(tailq)) {
		rte_mempool_put(pool, buf);
	} else {
		tmp = TAILQ_FIRST(tailq);
		TAILQ_REMOVE(tailq, tmp, rbuf_link);
		spdk_bdev_io_set_rbuf(tmp, buf);
	}
}

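/*
 * Create the small and large rbuf pools. Each pool gets a per-lcore cache
 * sized so that the caches together hold at most half of the pool; e.g.
 * with 4 cores, the small pool caches are 8192 / (2 * 4) = 1024 buffers.
 */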
static int spdk_initialize_rbuf_pool(void)
{
	int cache_size;

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 *   using spdk_env_get_core_count() to determine how many local caches we need
	 *   to account for.
	 */
	cache_size = RBUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
		cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
	}
	g_rbuf_small_pool = rte_mempool_create("rbuf_small_pool",
					       RBUF_SMALL_POOL_SIZE,
					       SPDK_BDEV_SMALL_RBUF_MAX_SIZE + 512,
					       cache_size, 0, NULL, NULL, NULL, NULL,
					       SOCKET_ID_ANY, 0);
	if (!g_rbuf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		return -1;
	}

	cache_size = RBUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
		cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
	}
	g_rbuf_large_pool = rte_mempool_create("rbuf_large_pool",
					       RBUF_LARGE_POOL_SIZE,
					       SPDK_BDEV_LARGE_RBUF_MAX_SIZE + 512,
					       cache_size, 0, NULL, NULL, NULL, NULL,
					       SOCKET_ID_ANY, 0);
	if (!g_rbuf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		return -1;
	}

	return 0;
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static int
spdk_bdev_module_initialize(void)
{
	struct spdk_bdev_module_if *bdev_module;
	int rc = 0;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		rc = bdev_module->module_init();
		if (rc) {
			return rc;
		}
	}
	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		rc = bdev_module->module_init();
		if (rc) {
			return rc;
		}
	}
	return rc;
}

static void
spdk_bdev_module_finish(void)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}
	}
}

static void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module_if *bdev_module;

	TAILQ_FOREACH(bdev_module, &spdk_bdev_module_list, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
	TAILQ_FOREACH(bdev_module, &spdk_vbdev_module_list, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static int
spdk_bdev_initialize(void)
{
	int i;

	if (spdk_bdev_module_initialize()) {
		SPDK_ERRLOG("bdev module initialize failed\n");
		return -1;
	}

	spdk_bdev_g_io_pool = rte_mempool_create("blockdev_io",
			      SPDK_BDEV_IO_POOL_SIZE,
			      sizeof(struct spdk_bdev_io) +
			      spdk_bdev_module_get_max_ctx_size(),
			      64, 0,
			      NULL, NULL, NULL, NULL,
			      SOCKET_ID_ANY, 0);

	if (spdk_bdev_g_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		return -1;
	}

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		TAILQ_INIT(&g_need_rbuf_small[i]);
		TAILQ_INIT(&g_need_rbuf_large[i]);
	}

	return spdk_initialize_rbuf_pool();
}

/*
 * Wrapper to provide rte_mempool_avail_count() on older DPDK versions.
 * Drop this if the minimum DPDK version is raised to at least 16.07.
 */
#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1)
static unsigned rte_mempool_avail_count(const struct rte_mempool *pool)
{
	return rte_mempool_count(pool);
}
#endif

static int
spdk_bdev_check_pool(struct rte_mempool *pool, uint32_t count)
{
	if (rte_mempool_avail_count(pool) != count) {
		SPDK_ERRLOG("rte_mempool_avail_count(%s) == %u, should be %u\n",
			    pool->name, rte_mempool_avail_count(pool), count);
		return -1;
	} else {
		return 0;
	}
}

static int
spdk_bdev_finish(void)
{
	int rc = 0;

	spdk_bdev_module_finish();

	rc += spdk_bdev_check_pool(g_rbuf_small_pool, RBUF_SMALL_POOL_SIZE);
	rc += spdk_bdev_check_pool(g_rbuf_large_pool, RBUF_LARGE_POOL_SIZE);

	return (rc != 0);
}

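/*
 * Allocate an spdk_bdev_io from the global pool. Pool exhaustion is treated
 * as fatal here (abort), so a NULL return should never be seen by callers.
 */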
struct spdk_bdev_io *spdk_bdev_get_io(void)
{
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = rte_mempool_get(spdk_bdev_g_io_pool, (void **)&bdev_io);
	if (rc < 0 || !bdev_io) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		abort();
	}

	memset(bdev_io, 0, sizeof(*bdev_io));

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		return;
	}

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && bdev_io->u.read.put_rbuf) {
		spdk_bdev_io_put_rbuf(bdev_io);
	}

	rte_mempool_put(spdk_bdev_g_io_pool, bdev_io);
}

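/*
 * Try to allocate an rbuf for a read that was submitted without a buffer.
 * If the pool is empty, queue the I/O on this lcore's wait list;
 * spdk_bdev_io_put_rbuf() will hand it a buffer later.
 */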
static void
_spdk_bdev_io_get_rbuf(struct spdk_bdev_io *bdev_io)
{
	uint64_t len = bdev_io->u.read.len;
	struct rte_mempool *pool;
	need_rbuf_tailq_t *tailq;
	int rc;
	void *buf = NULL;

	if (len <= SPDK_BDEV_SMALL_RBUF_MAX_SIZE) {
		pool = g_rbuf_small_pool;
		tailq = &g_need_rbuf_small[rte_lcore_id()];
	} else {
		pool = g_rbuf_large_pool;
		tailq = &g_need_rbuf_large[rte_lcore_id()];
	}

	rc = rte_mempool_get(pool, (void **)&buf);
	if (rc < 0 || !buf) {
		TAILQ_INSERT_TAIL(tailq, bdev_io, rbuf_link);
	} else {
		spdk_bdev_io_set_rbuf(bdev_io, buf);
	}
}

static void
spdk_bdev_cleanup_pending_rbuf_io(struct spdk_bdev *bdev)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, &g_need_rbuf_small[rte_lcore_id()], rbuf_link, tmp) {
		if (bdev_io->bdev == bdev) {
			TAILQ_REMOVE(&g_need_rbuf_small[rte_lcore_id()], bdev_io, rbuf_link);
			bdev_io->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	TAILQ_FOREACH_SAFE(bdev_io, &g_need_rbuf_large[rte_lcore_id()], rbuf_link, tmp) {
		if (bdev_io->bdev == bdev) {
			TAILQ_REMOVE(&g_need_rbuf_large[rte_lcore_id()], bdev_io, rbuf_link);
			bdev_io->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}
}

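/*
 * in_submit_request is set around the driver's submit_request() call so that
 * spdk_bdev_io_complete() can detect completions that happen synchronously
 * during submission and defer them via an event (see
 * bdev_io_deferred_completion()), avoiding unbounded recursion.
 */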
static void
__submit_request(struct spdk_bdev *bdev, struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		spdk_bdev_cleanup_pending_rbuf_io(bdev);
	}
	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(bdev_io);
	bdev_io->in_submit_request = false;
}

static int
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	__submit_request(bdev, bdev_io);
	return 0;
}

void
spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev *new_bdev)
{
	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
	bdev_io->bdev = new_bdev;

	/*
	 * These fields are normally set during spdk_bdev_io_init(), but since the
	 * bdev is being switched, they need to be reinitialized.
	 */
	bdev_io->gencnt = new_bdev->gencnt;
	bdev_io->ctx = new_bdev->ctxt;

	__submit_request(new_bdev, bdev_io);
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->ctx = bdev->ctxt;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->gencnt = bdev->gencnt;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	TAILQ_INIT(&bdev_io->child_io);
}

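/*
 * Allocate a child I/O that mirrors the parent's type and payload. For reads,
 * put_rbuf is cleared on the child so that the parent retains ownership of
 * any rbuf and returns it exactly once.
 */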
struct spdk_bdev_io *
spdk_bdev_get_child_io(struct spdk_bdev_io *parent,
		       struct spdk_bdev *bdev,
		       spdk_bdev_io_completion_cb cb,
		       void *cb_arg)
{
	struct spdk_bdev_io *child;

	child = spdk_bdev_get_io();
	if (!child) {
		SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
		return NULL;
	}

	if (cb_arg == NULL) {
		cb_arg = child;
	}

	spdk_bdev_io_init(child, bdev, cb_arg, cb);

	child->type = parent->type;
	memcpy(&child->u, &parent->u, sizeof(child->u));
	if (child->type == SPDK_BDEV_IO_TYPE_READ) {
		child->u.read.put_rbuf = false;
	}
	child->get_rbuf_cb = NULL;
	child->parent = parent;

	TAILQ_INSERT_TAIL(&parent->child_io, child, link);

	return child;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_config_json) {
		return bdev->fn_table->dump_config_json(bdev->ctxt, w);
	}

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, uint32_t priority, void *ctx_buf,
			 void *unique_ctx)
{
	struct spdk_bdev		*bdev = io_device;
	struct spdk_bdev_channel	*ch = ctx_buf;

	ch->bdev = io_device;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt, priority);

	return 0;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;

	spdk_put_io_channel(ch->channel);
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev *bdev, uint32_t priority)
{
	return spdk_get_io_channel(bdev, priority, false, NULL);
}

static int
spdk_bdev_io_valid(struct spdk_bdev *bdev, uint64_t offset, uint64_t nbytes)
{
	/* Return failure if nbytes is not a multiple of bdev->blocklen */
	if (nbytes % bdev->blocklen) {
		return -1;
	}

	/* Return failure if offset + nbytes is less than offset; this indicates
	 * that the sum has overflowed and wrapped around. */
	if (offset + nbytes < offset) {
		return -1;
	}

	/* Return failure if offset + nbytes exceeds the size of the blockdev */
	if (offset + nbytes > bdev->blockcnt * bdev->blocklen) {
		return -1;
	}

	return 0;
}

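/*
 * Submit a single-buffer read. A minimal caller sketch (hypothetical names,
 * error handling elided; the completion callback should eventually call
 * spdk_bdev_free_io() on the returned I/O):
 *
 *	struct spdk_io_channel *ch = spdk_bdev_get_io_channel(bdev, priority);
 *	struct spdk_bdev_io *io = spdk_bdev_read(bdev, ch, buf, 0, 4096,
 *						 read_done_cb, cb_ctx);
 */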
struct spdk_bdev_io *
spdk_bdev_read(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iov.iov_base = buf;
	bdev_io->u.read.iov.iov_len = nbytes;
	bdev_io->u.read.iovs = &bdev_io->u.read.iov;
	bdev_io->u.read.iovcnt = 1;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	bdev_io->u.read.put_rbuf = false;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_readv(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during readv\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.read.iovs = iov;
	bdev_io->u.read.iovcnt = iovcnt;
	bdev_io->u.read.len = nbytes;
	bdev_io->u.read.offset = offset;
	bdev_io->u.read.put_rbuf = false;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_write(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, nbytes) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iov.iov_base = buf;
	bdev_io->u.write.iov.iov_len = nbytes;
	bdev_io->u.write.iovs = &bdev_io->u.write.iov;
	bdev_io->u.write.iovcnt = 1;
	bdev_io->u.write.len = nbytes;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_writev(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (spdk_bdev_io_valid(bdev, offset, len) != 0) {
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.write.iovs = iov;
	bdev_io->u.write.iovcnt = iovcnt;
	bdev_io->u.write.len = len;
	bdev_io->u.write.offset = offset;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_unmap(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		struct spdk_scsi_unmap_bdesc *unmap_d,
		uint16_t bdesc_count,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	if (bdesc_count == 0) {
		SPDK_ERRLOG("Invalid bdesc_count 0\n");
		return NULL;
	}

	if (bdesc_count > bdev->max_unmap_bdesc_count) {
		SPDK_ERRLOG("Invalid bdesc_count %u > max_unmap_bdesc_count %u\n",
			    bdesc_count, bdev->max_unmap_bdesc_count);
		return NULL;
	}

	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.unmap.unmap_bdesc = unmap_d;
	bdev_io->u.unmap.bdesc_count = bdesc_count;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

struct spdk_bdev_io *
spdk_bdev_flush(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return NULL;
	}

	bdev_io->ch = channel->channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.flush.offset = offset;
	bdev_io->u.flush.length = length;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		return NULL;
	}

	return bdev_io;
}

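/*
 * Submit a reset. On successful completion of a hard reset, bdev->gencnt is
 * incremented (see spdk_bdev_io_complete()), so completions of I/O that were
 * issued before the reset are dropped instead of invoking stale callbacks.
 */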
int
spdk_bdev_reset(struct spdk_bdev *bdev, enum spdk_bdev_reset_type reset_type,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev_io *bdev_io;
	int rc;

	assert(bdev->status != SPDK_BDEV_STATUS_UNCLAIMED);
	bdev_io = spdk_bdev_get_io();
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -1;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.type = reset_type;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	rc = spdk_bdev_io_submit(bdev_io);
	if (rc < 0) {
		spdk_bdev_put_io(bdev_io);
		SPDK_ERRLOG("reset failed\n");
	}

	return rc;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_io *child_io, *tmp;

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	TAILQ_FOREACH_SAFE(child_io, &bdev_io->child_io, link, tmp) {
		/*
		 * Make sure no references to the parent I/O remain, since it is being
		 * returned to the free pool.
		 */
		child_io->parent = NULL;
		TAILQ_REMOVE(&bdev_io->child_io, child_io, link);

		/*
		 * A child I/O may have an rbuf that needs to be returned to a pool
		 * on a different core, so free it recursively via spdk_bdev_free_io()
		 * rather than calling spdk_bdev_put_io() directly here.
		 */
		spdk_bdev_free_io(child_io);
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
bdev_io_deferred_completion(void *arg1, void *arg2)
{
	struct spdk_bdev_io *bdev_io = arg1;
	enum spdk_bdev_io_status status = (enum spdk_bdev_io_status)(uintptr_t)arg2;

	assert(bdev_io->in_submit_request == false);

	spdk_bdev_io_complete(bdev_io, status);
}

void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	if (bdev_io->in_submit_request) {
		/*
		 * Defer completion via an event to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_event_call(spdk_event_allocate(spdk_env_get_current_core(),
						    bdev_io_deferred_completion,
						    bdev_io,
						    (void *)(uintptr_t)status));
		return;
	}

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* Successful reset */
		if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			/* Increase the blockdev generation count if it is a hard reset */
			if (bdev_io->u.reset.type == SPDK_BDEV_RESET_HARD) {
				bdev_io->bdev->gencnt++;
			}
		}
	} else {
		/*
		 * Check the gencnt to see if this I/O was issued before the most
		 * recent reset. If the gencnt does not match, free the I/O without
		 * calling the callback, since the caller will have already freed
		 * its context for this I/O.
		 */
		if (bdev_io->bdev->gencnt != bdev_io->gencnt) {
			spdk_bdev_put_io(bdev_io);
			return;
		}
	}

	bdev_io->status = status;

	assert(bdev_io->cb != NULL);
	bdev_io->cb(bdev_io, status, bdev_io->caller_ctx);
}

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

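/*
 * Translate an I/O's completion status into SCSI status/sense codes. NVMe
 * errors go through spdk_scsi_nvme_translate(); any other failure maps to
 * CHECK CONDITION with ABORTED COMMAND sense.
 */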
void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

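/*
 * Register a bdev with the bdev layer: register it as an io_device so
 * per-thread channels can be created, and append it to the global list
 * walked by spdk_bdev_first()/spdk_bdev_next().
 */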
void
spdk_bdev_register(struct spdk_bdev *bdev)
{
	/* Initialize the reset generation value to zero */
	bdev->gencnt = 0;

	spdk_io_device_register(bdev, spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&spdk_bdev_list, bdev, link);
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev)
{
	int			rc;

	SPDK_TRACELOG(SPDK_TRACE_DEBUG, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);
	assert(bdev->status == SPDK_BDEV_STATUS_CLAIMED || bdev->status == SPDK_BDEV_STATUS_UNCLAIMED);
	if (bdev->status == SPDK_BDEV_STATUS_CLAIMED) {
		if (bdev->remove_cb) {
			bdev->status = SPDK_BDEV_STATUS_REMOVING;
			pthread_mutex_unlock(&bdev->mutex);
			bdev->remove_cb(bdev->remove_ctx);
			return;
		} else {
			bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
		}
	}

	TAILQ_REMOVE(&spdk_bdev_list, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(bdev);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
}

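/*
 * Attempt to take exclusive ownership of the bdev, registering a callback to
 * be invoked if the underlying device is removed while claimed. Returns false
 * if the bdev is already claimed by another consumer.
 */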
bool
spdk_bdev_claim(struct spdk_bdev *bdev, spdk_bdev_remove_cb_t remove_cb,
		void *remove_ctx)
{
	bool success;

	pthread_mutex_lock(&bdev->mutex);

	if (bdev->status != SPDK_BDEV_STATUS_CLAIMED) {
		/* Take ownership of bdev. */
		bdev->remove_cb = remove_cb;
		bdev->remove_ctx = remove_ctx;
		bdev->status = SPDK_BDEV_STATUS_CLAIMED;
		success = true;
	} else {
		/* bdev is already claimed. */
		success = false;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return success;
}

void
spdk_bdev_unclaim(struct spdk_bdev *bdev)
{
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);
	assert(bdev->status == SPDK_BDEV_STATUS_CLAIMED || bdev->status == SPDK_BDEV_STATUS_REMOVING);
	if (bdev->status == SPDK_BDEV_STATUS_REMOVING) {
		do_unregister = true;
	}
	bdev->remove_cb = NULL;
	bdev->remove_ctx = NULL;
	bdev->status = SPDK_BDEV_STATUS_UNCLAIMED;
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev);
	}
}

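/*
 * Ensure a read I/O has a data buffer before the driver needs it. If the
 * caller supplied a buffer, cb is invoked immediately; otherwise an rbuf is
 * allocated (or the I/O waits on this lcore's queue) and cb runs once the
 * buffer is attached.
 */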
void
spdk_bdev_io_get_rbuf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_rbuf_cb cb)
{
	assert(cb != NULL);
	assert(bdev_io->u.read.iovs != NULL);

	if (bdev_io->u.read.iovs[0].iov_base == NULL) {
		bdev_io->get_rbuf_cb = cb;
		_spdk_bdev_io_get_rbuf(bdev_io);
	} else {
		cb(bdev_io);
	}
}

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.read.iovs;
		iovcnt = bdev_io->u.read.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.write.iovs;
		iovcnt = bdev_io->u.write.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

void spdk_bdev_module_list_add(struct spdk_bdev_module_if *bdev_module)
{
	TAILQ_INSERT_TAIL(&spdk_bdev_module_list, bdev_module, tailq);
}

void spdk_vbdev_module_list_add(struct spdk_bdev_module_if *vbdev_module)
{
	TAILQ_INSERT_TAIL(&spdk_vbdev_module_list, vbdev_module, tailq);
}

SPDK_SUBSYSTEM_REGISTER(bdev, spdk_bdev_initialize, spdk_bdev_finish, spdk_bdev_config_text)
SPDK_SUBSYSTEM_DEPEND(bdev, copy)