/* xref: /spdk/module/bdev/malloc/bdev_malloc.c (revision 7506a7aa53d239f533af3bc768f0d2af55e735fe) */
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_malloc.h"
#include "spdk/bdev.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/accel_engine.h"
#include "spdk/json.h"
#include "spdk/thread.h"
#include "spdk/queue.h"
#include "spdk/string.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

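/* Per-bdev context: the exported bdev plus the pinned DMA buffer that backs it. */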
struct malloc_disk {
	struct spdk_bdev		disk;
	void				*malloc_buf;
	TAILQ_ENTRY(malloc_disk)	link;
};

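/*
 * Per-I/O context, carved out of the bdev_io's driver_ctx area (see
 * bdev_malloc_get_ctx_size()). num_outstanding counts the accel operations
 * still in flight for this I/O.
 */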
struct malloc_task {
	int				num_outstanding;
	enum spdk_bdev_io_status	status;
	TAILQ_ENTRY(malloc_task)	tailq;
};

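/*
 * Per-channel context: an accel framework channel used to submit copy/fill
 * operations, plus a poller that drains completed_tasks for I/O that did not
 * go through accel.
 */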
struct malloc_channel {
	struct spdk_io_channel		*accel_channel;
	struct spdk_poller		*completion_poller;
	TAILQ_HEAD(, malloc_task)	completed_tasks;
};

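/*
 * Completion callback for accel copy/fill operations. A failure on any
 * sub-operation marks the whole bdev_io as failed (or NOMEM); the bdev_io is
 * completed once the last outstanding sub-operation finishes.
 */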
static void
malloc_done(void *ref, int status)
{
	struct malloc_task *task = (struct malloc_task *)ref;

	if (status != 0) {
		if (status == -ENOMEM) {
			task->status = SPDK_BDEV_IO_STATUS_NOMEM;
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
	}
}

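/*
 * Complete a task that did not go through the accel framework: queue it on the
 * channel's completed_tasks list so the completion poller finishes it outside
 * of the submission path.
 */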
static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}

static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);

static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}

static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)

static void
malloc_disk_free(struct malloc_disk *malloc_disk)
{
	if (!malloc_disk) {
		return;
	}

	free(malloc_disk->disk.name);
	spdk_free(malloc_disk->malloc_buf);
	free(malloc_disk);
}

static int
bdev_malloc_destruct(void *ctx)
{
	struct malloc_disk *malloc_disk = ctx;

	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
	malloc_disk_free(malloc_disk);
	return 0;
}

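/*
 * Returns nonzero if the iovec array is too small to hold nbytes; an iovec
 * larger than the remaining byte count is accepted.
 */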
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (nbytes < iovs[i].iov_len) {
			return 0;
		}

		nbytes -= iovs[i].iov_len;
	}

	return nbytes != 0;
}

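/*
 * Read: submit one accel copy per iovec element, copying out of the malloc
 * buffer. malloc_done() completes the bdev_io once all copies have finished.
 */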
static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	int64_t res = 0;
	void *src = mdisk->malloc_buf + offset;
	int i;

	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, iovcnt);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;

	for (i = 0; i < iovcnt; i++) {
		task->num_outstanding++;
		res = spdk_accel_submit_copy(ch, iov[i].iov_base,
					     src, iov[i].iov_len, 0, malloc_done, task);

		if (res != 0) {
			malloc_done(task, res);
			break;
		}

		src += iov[i].iov_len;
		len -= iov[i].iov_len;
	}
}

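/*
 * Write: mirror image of bdev_malloc_readv(), copying each iovec element into
 * the malloc buffer.
 */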
static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		   struct malloc_task *task,
		   struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	int64_t res = 0;
	void *dst = mdisk->malloc_buf + offset;
	int i;

	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, iovcnt);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;

	for (i = 0; i < iovcnt; i++) {
		task->num_outstanding++;
		res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
					     iov[i].iov_len, 0, malloc_done, task);

		if (res != 0) {
			malloc_done(task, res);
			break;
		}

		dst += iov[i].iov_len;
	}
}

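/*
 * Unmap is implemented as a zero-fill of the byte range, which also lets it
 * back SPDK_BDEV_IO_TYPE_WRITE_ZEROES.
 */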
static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, 0, malloc_done, task);
}

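/*
 * Dispatch a bdev_io. Returns 0 if the request was handled or queued for
 * completion; a nonzero return makes bdev_malloc_submit_request() fail the I/O.
 */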
static int _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	uint32_t block_size = bdev_io->bdev->blocklen;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			assert(bdev_io->u.bdev.iovcnt == 1);
			bdev_io->u.bdev.iovs[0].iov_base =
				((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
				bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
					     SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt,
				  mch->accel_channel,
				  (struct malloc_task *)bdev_io->driver_ctx,
				  bdev_io->u.bdev.iovs,
				  bdev_io->u.bdev.iovcnt,
				  bdev_io->u.bdev.num_blocks * block_size,
				  bdev_io->u.bdev.offset_blocks * block_size);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt,
				   mch->accel_channel,
				   (struct malloc_task *)bdev_io->driver_ctx,
				   bdev_io->u.bdev.iovs,
				   bdev_io->u.bdev.iovcnt,
				   bdev_io->u.bdev.num_blocks * block_size,
				   bdev_io->u.bdev.offset_blocks * block_size);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
					 mch->accel_channel,
					 (struct malloc_task *)bdev_io->driver_ctx,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* bdev_malloc_unmap() zero-fills the range via spdk_accel_submit_fill(),
		 * so it can service WRITE_ZEROES requests as well. */
		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
					 mch->accel_channel,
					 (struct malloc_task *)bdev_io->driver_ctx,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
			      bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);
		}
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_IO_TYPE_ABORT:
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
		return 0;
	default:
		return -1;
	}
	return 0;
}

static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);

	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	default:
		return false;
	}
}

static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}

static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	char uuid_str[SPDK_UUID_STRING_LEN];

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
	spdk_json_write_named_string(w, "uuid", uuid_str);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct		= bdev_malloc_destruct,
	.submit_request		= bdev_malloc_submit_request,
	.io_type_supported	= bdev_malloc_io_type_supported,
	.get_io_channel		= bdev_malloc_get_io_channel,
	.write_config_json	= bdev_malloc_write_json_config,
};

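/*
 * Create a malloc disk and register it with the bdev layer. A NULL name
 * auto-generates "Malloc<N>", a NULL uuid generates a random UUID, and an
 * optimal_io_boundary of 0 disables splitting on an optimal I/O boundary.
 *
 * Illustrative usage only (the caller context and values below are
 * hypothetical, e.g. an RPC handler creating a 16 MiB disk):
 *
 *	struct spdk_bdev *bdev = NULL;
 *	int rc;
 *
 *	rc = create_malloc_disk(&bdev, "Malloc0", NULL, 32768, 512, 0);
 *	if (rc != 0) {
 *		SPDK_ERRLOG("Failed to create malloc disk: %s\n", spdk_strerror(-rc));
 *	}
 */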
int
create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid,
		   uint64_t num_blocks, uint32_t block_size, uint32_t optimal_io_boundary)
{
	struct malloc_disk	*mdisk;
	int rc;

	if (num_blocks == 0) {
		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
		return -EINVAL;
	}

	if (block_size % 512) {
		SPDK_ERRLOG("Block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	mdisk = calloc(1, sizeof(*mdisk));
	if (!mdisk) {
		SPDK_ERRLOG("mdisk calloc() failed\n");
		return -ENOMEM;
	}

	/*
	 * Allocate the large backend memory buffer from pinned memory.
	 *
	 * TODO: need to pass a hint so we know which socket to allocate
	 *  from on multi-socket systems.
	 */
	mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!mdisk->malloc_buf) {
		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}

	if (name) {
		mdisk->disk.name = strdup(name);
	} else {
		/* Auto-generate a name */
		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
		malloc_disk_count++;
	}
	if (!mdisk->disk.name) {
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}
	mdisk->disk.product_name = "Malloc disk";

	mdisk->disk.write_cache = 1;
	mdisk->disk.blocklen = block_size;
	mdisk->disk.blockcnt = num_blocks;
	if (optimal_io_boundary) {
		mdisk->disk.optimal_io_boundary = optimal_io_boundary;
		mdisk->disk.split_on_optimal_io_boundary = true;
	}
	if (uuid) {
		mdisk->disk.uuid = *uuid;
	} else {
		spdk_uuid_generate(&mdisk->disk.uuid);
	}

	mdisk->disk.ctxt = mdisk;
	mdisk->disk.fn_table = &malloc_fn_table;
	mdisk->disk.module = &malloc_if;

	rc = spdk_bdev_register(&mdisk->disk);
	if (rc) {
		malloc_disk_free(mdisk);
		return rc;
	}

	*bdev = &(mdisk->disk);

	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);

	return rc;
}

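/*
 * Unregister the named malloc disk. cb_fn is always invoked, either by the
 * bdev layer when unregistration completes or directly here on lookup failure.
 */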
void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

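/*
 * Channel poller: swap out the completed_tasks list and complete each queued
 * task back to the bdev layer.
 */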
static int
malloc_completion_poller(void *ctx)
{
	struct malloc_channel *ch = ctx;
	struct malloc_task *task;
	TAILQ_HEAD(, malloc_task) completed_tasks;
	uint32_t num_completions = 0;

	TAILQ_INIT(&completed_tasks);
	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);

	while (!TAILQ_EMPTY(&completed_tasks)) {
		task = TAILQ_FIRST(&completed_tasks);
		TAILQ_REMOVE(&completed_tasks, task, tailq);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
		num_completions++;
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

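/*
 * I/O channel create/destroy callbacks for the g_malloc_disks io_device. Each
 * channel gets its own accel channel and completion poller.
 */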
static int
malloc_create_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	ch->accel_channel = spdk_accel_engine_get_io_channel();
	if (!ch->accel_channel) {
		SPDK_ERRLOG("Failed to get accel engine's IO channel\n");
		return -ENOMEM;
	}

	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
	if (!ch->completion_poller) {
		SPDK_ERRLOG("Failed to register malloc completion poller\n");
		spdk_put_io_channel(ch->accel_channel);
		return -ENOMEM;
	}

	TAILQ_INIT(&ch->completed_tasks);

	return 0;
}

static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}

static int bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter. */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}

static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)
557