xref: /spdk/module/bdev/malloc/bdev_malloc.c (revision 488570ebd418ba07c9e69e65106dcc964f3bb41b)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "bdev_malloc.h"
10 #include "spdk/bdev.h"
11 #include "spdk/endian.h"
12 #include "spdk/env.h"
13 #include "spdk/accel_engine.h"
14 #include "spdk/json.h"
15 #include "spdk/thread.h"
16 #include "spdk/queue.h"
17 #include "spdk/string.h"
18 
19 #include "spdk/bdev_module.h"
20 #include "spdk/log.h"
21 
/* State for one malloc bdev: the registered bdev plus its backing RAM buffer. */
struct malloc_disk {
	struct spdk_bdev		disk;		/* Base bdev object registered with the bdev layer. */
	void				*malloc_buf;	/* Pinned DMA-able buffer backing the entire disk. */
	TAILQ_ENTRY(malloc_disk)	link;		/* Entry in the global g_malloc_disks list. */
};
27 
/* Per-I/O context, carved out of the spdk_bdev_io driver_ctx area
 * (its size is advertised via bdev_malloc_get_ctx_size()).
 */
struct malloc_task {
	int				num_outstanding;	/* Accel operations still in flight for this I/O. */
	enum spdk_bdev_io_status	status;			/* Aggregate status reported when the last op finishes. */
	TAILQ_ENTRY(malloc_task)	tailq;			/* Entry in a channel's completed_tasks list. */
};
33 
/* Per-thread channel state for the malloc io_device. */
struct malloc_channel {
	struct spdk_io_channel		*accel_channel;		/* Accel engine channel used for copy/fill submissions. */
	struct spdk_poller		*completion_poller;	/* Drains completed_tasks (malloc_completion_poller()). */
	TAILQ_HEAD(, malloc_task)	completed_tasks;	/* Tasks completed inline, reported from poller context. */
};
39 
40 static void
41 malloc_done(void *ref, int status)
42 {
43 	struct malloc_task *task = (struct malloc_task *)ref;
44 
45 	if (status != 0) {
46 		if (status == -ENOMEM) {
47 			task->status = SPDK_BDEV_IO_STATUS_NOMEM;
48 		} else {
49 			task->status = SPDK_BDEV_IO_STATUS_FAILED;
50 		}
51 	}
52 
53 	if (--task->num_outstanding == 0) {
54 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
55 	}
56 }
57 
/* Mark a task complete with the given status and queue it on the channel's
 * completed list; malloc_completion_poller() reports it to the bdev layer on
 * the next poll, keeping completion out of the submit call stack.
 */
static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}
65 
/* All registered malloc disks; its address also serves as the io_device handle. */
static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

/* Counter for auto-generated "MallocN" names; reset in bdev_malloc_initialize(). */
int malloc_disk_count = 0;
69 
70 static int bdev_malloc_initialize(void);
71 static void bdev_malloc_deinitialize(void);
72 
/* Tell the bdev layer how much per-I/O driver_ctx space to reserve for malloc_task. */
static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}
78 
/* Module descriptor registered with the bdev layer. */
static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,

};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)
88 
89 static void
90 malloc_disk_free(struct malloc_disk *malloc_disk)
91 {
92 	if (!malloc_disk) {
93 		return;
94 	}
95 
96 	free(malloc_disk->disk.name);
97 	spdk_free(malloc_disk->malloc_buf);
98 	free(malloc_disk);
99 }
100 
101 static int
102 bdev_malloc_destruct(void *ctx)
103 {
104 	struct malloc_disk *malloc_disk = ctx;
105 
106 	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
107 	malloc_disk_free(malloc_disk);
108 	return 0;
109 }
110 
/* Validate that the iovec array can hold nbytes of data.
 * Returns 0 when the iovecs provide at least nbytes of capacity,
 * non-zero when their combined length falls short.
 */
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	size_t remaining = nbytes;
	int idx;

	for (idx = 0; idx < iovcnt; idx++) {
		size_t seg = iovs[idx].iov_len;

		if (remaining < seg) {
			/* This segment alone covers what's left: capacity is sufficient. */
			return 0;
		}

		remaining -= seg;
	}

	/* Non-zero remainder means the iovecs cannot hold all nbytes. */
	return remaining != 0;
}
126 
127 static void
128 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
129 		  struct malloc_task *task,
130 		  struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
131 {
132 	int64_t res = 0;
133 	void *src = mdisk->malloc_buf + offset;
134 	int i;
135 
136 	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
137 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
138 				      SPDK_BDEV_IO_STATUS_FAILED);
139 		return;
140 	}
141 
142 	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
143 		      len, offset, iovcnt);
144 
145 	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
146 	task->num_outstanding = 0;
147 
148 	for (i = 0; i < iovcnt; i++) {
149 		task->num_outstanding++;
150 		res = spdk_accel_submit_copy(ch, iov[i].iov_base,
151 					     src, iov[i].iov_len, 0, malloc_done, task);
152 
153 		if (res != 0) {
154 			malloc_done(task, res);
155 			break;
156 		}
157 
158 		src += iov[i].iov_len;
159 		len -= iov[i].iov_len;
160 	}
161 }
162 
163 static void
164 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
165 		   struct malloc_task *task,
166 		   struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
167 {
168 	int64_t res = 0;
169 	void *dst = mdisk->malloc_buf + offset;
170 	int i;
171 
172 	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
173 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
174 				      SPDK_BDEV_IO_STATUS_FAILED);
175 		return;
176 	}
177 
178 	SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
179 		      len, offset, iovcnt);
180 
181 	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
182 	task->num_outstanding = 0;
183 
184 	for (i = 0; i < iovcnt; i++) {
185 		task->num_outstanding++;
186 		res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
187 					     iov[i].iov_len, 0, malloc_done, task);
188 
189 		if (res != 0) {
190 			malloc_done(task, res);
191 			break;
192 		}
193 
194 		dst += iov[i].iov_len;
195 	}
196 }
197 
/* Zero-fill byte_count bytes at byte offset via a single accel fill;
 * malloc_done() completes the bdev_io when the fill finishes.
 * Returns the accel submission result (0 on successful submit).
 */
static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, 0, malloc_done, task);
}
211 
212 static int _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
213 {
214 	uint32_t block_size = bdev_io->bdev->blocklen;
215 
216 	switch (bdev_io->type) {
217 	case SPDK_BDEV_IO_TYPE_READ:
218 		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
219 			assert(bdev_io->u.bdev.iovcnt == 1);
220 			bdev_io->u.bdev.iovs[0].iov_base =
221 				((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
222 				bdev_io->u.bdev.offset_blocks * block_size;
223 			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
224 			malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
225 					     SPDK_BDEV_IO_STATUS_SUCCESS);
226 			return 0;
227 		}
228 
229 		bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt,
230 				  mch->accel_channel,
231 				  (struct malloc_task *)bdev_io->driver_ctx,
232 				  bdev_io->u.bdev.iovs,
233 				  bdev_io->u.bdev.iovcnt,
234 				  bdev_io->u.bdev.num_blocks * block_size,
235 				  bdev_io->u.bdev.offset_blocks * block_size);
236 		return 0;
237 
238 	case SPDK_BDEV_IO_TYPE_WRITE:
239 		bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt,
240 				   mch->accel_channel,
241 				   (struct malloc_task *)bdev_io->driver_ctx,
242 				   bdev_io->u.bdev.iovs,
243 				   bdev_io->u.bdev.iovcnt,
244 				   bdev_io->u.bdev.num_blocks * block_size,
245 				   bdev_io->u.bdev.offset_blocks * block_size);
246 		return 0;
247 
248 	case SPDK_BDEV_IO_TYPE_RESET:
249 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
250 				     SPDK_BDEV_IO_STATUS_SUCCESS);
251 		return 0;
252 
253 	case SPDK_BDEV_IO_TYPE_FLUSH:
254 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
255 				     SPDK_BDEV_IO_STATUS_SUCCESS);
256 		return 0;
257 
258 	case SPDK_BDEV_IO_TYPE_UNMAP:
259 		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
260 					 mch->accel_channel,
261 					 (struct malloc_task *)bdev_io->driver_ctx,
262 					 bdev_io->u.bdev.offset_blocks * block_size,
263 					 bdev_io->u.bdev.num_blocks * block_size);
264 
265 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
266 		/* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */
267 		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
268 					 mch->accel_channel,
269 					 (struct malloc_task *)bdev_io->driver_ctx,
270 					 bdev_io->u.bdev.offset_blocks * block_size,
271 					 bdev_io->u.bdev.num_blocks * block_size);
272 
273 	case SPDK_BDEV_IO_TYPE_ZCOPY:
274 		if (bdev_io->u.bdev.zcopy.start) {
275 			void *buf;
276 			size_t len;
277 
278 			buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
279 			      bdev_io->u.bdev.offset_blocks * block_size;
280 			len = bdev_io->u.bdev.num_blocks * block_size;
281 			spdk_bdev_io_set_buf(bdev_io, buf, len);
282 
283 		}
284 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
285 				     SPDK_BDEV_IO_STATUS_SUCCESS);
286 		return 0;
287 	case SPDK_BDEV_IO_TYPE_ABORT:
288 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
289 				     SPDK_BDEV_IO_STATUS_FAILED);
290 		return 0;
291 	default:
292 		return -1;
293 	}
294 	return 0;
295 }
296 
297 static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
298 {
299 	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);
300 
301 	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
302 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
303 				     SPDK_BDEV_IO_STATUS_FAILED);
304 	}
305 }
306 
/* fn_table callback: report which I/O types this module handles
 * (must stay in sync with the switch in _bdev_malloc_submit_request()).
 */
static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	default:
		return false;
	}
}
325 
/* fn_table callback: return this thread's channel for the shared malloc
 * io_device (keyed by the address of g_malloc_disks).
 */
static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}
331 
/* fn_table callback: emit the bdev_malloc_create RPC call that would
 * recreate this bdev, for "save_config"-style JSON output.
 */
static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	char uuid_str[SPDK_UUID_STRING_LEN];

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	/* Parameters mirror the create_malloc_disk() arguments. */
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
	spdk_json_write_named_string(w, "uuid", uuid_str);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
353 
/* Function table handed to the bdev layer for every malloc disk. */
static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct		= bdev_malloc_destruct,
	.submit_request		= bdev_malloc_submit_request,
	.io_type_supported	= bdev_malloc_io_type_supported,
	.get_io_channel		= bdev_malloc_get_io_channel,
	.write_config_json	= bdev_malloc_write_json_config,
};
361 
362 int
363 create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid,
364 		   uint64_t num_blocks, uint32_t block_size, uint32_t optimal_io_boundary)
365 {
366 	struct malloc_disk	*mdisk;
367 	int rc;
368 
369 	if (num_blocks == 0) {
370 		SPDK_ERRLOG("Disk num_blocks must be greater than 0");
371 		return -EINVAL;
372 	}
373 
374 	if (block_size % 512) {
375 		SPDK_ERRLOG("block size must be 512 bytes aligned\n");
376 		return -EINVAL;
377 	}
378 
379 	mdisk = calloc(1, sizeof(*mdisk));
380 	if (!mdisk) {
381 		SPDK_ERRLOG("mdisk calloc() failed\n");
382 		return -ENOMEM;
383 	}
384 
385 	/*
386 	 * Allocate the large backend memory buffer from pinned memory.
387 	 *
388 	 * TODO: need to pass a hint so we know which socket to allocate
389 	 *  from on multi-socket systems.
390 	 */
391 	mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL,
392 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
393 	if (!mdisk->malloc_buf) {
394 		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
395 		malloc_disk_free(mdisk);
396 		return -ENOMEM;
397 	}
398 
399 	if (name) {
400 		mdisk->disk.name = strdup(name);
401 	} else {
402 		/* Auto-generate a name */
403 		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
404 		malloc_disk_count++;
405 	}
406 	if (!mdisk->disk.name) {
407 		malloc_disk_free(mdisk);
408 		return -ENOMEM;
409 	}
410 	mdisk->disk.product_name = "Malloc disk";
411 
412 	mdisk->disk.write_cache = 1;
413 	mdisk->disk.blocklen = block_size;
414 	mdisk->disk.blockcnt = num_blocks;
415 	if (optimal_io_boundary) {
416 		mdisk->disk.optimal_io_boundary = optimal_io_boundary;
417 		mdisk->disk.split_on_optimal_io_boundary = true;
418 	}
419 	if (uuid) {
420 		mdisk->disk.uuid = *uuid;
421 	} else {
422 		spdk_uuid_generate(&mdisk->disk.uuid);
423 	}
424 
425 	mdisk->disk.ctxt = mdisk;
426 	mdisk->disk.fn_table = &malloc_fn_table;
427 	mdisk->disk.module = &malloc_if;
428 
429 	rc = spdk_bdev_register(&mdisk->disk);
430 	if (rc) {
431 		malloc_disk_free(mdisk);
432 		return rc;
433 	}
434 
435 	*bdev = &(mdisk->disk);
436 
437 	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);
438 
439 	return rc;
440 }
441 
/* Unregister the named malloc bdev; cb_fn receives the result.
 * On immediate failure (e.g. name not found or wrong module) the callback is
 * invoked synchronously with the error code; otherwise it is invoked
 * asynchronously when the unregister completes.
 */
void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}
452 
453 static int
454 malloc_completion_poller(void *ctx)
455 {
456 	struct malloc_channel *ch = ctx;
457 	struct malloc_task *task;
458 	TAILQ_HEAD(, malloc_task) completed_tasks;
459 	uint32_t num_completions = 0;
460 
461 	TAILQ_INIT(&completed_tasks);
462 	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);
463 
464 	while (!TAILQ_EMPTY(&completed_tasks)) {
465 		task = TAILQ_FIRST(&completed_tasks);
466 		TAILQ_REMOVE(&completed_tasks, task, tailq);
467 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
468 		num_completions++;
469 	}
470 
471 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
472 }
473 
474 static int
475 malloc_create_channel_cb(void *io_device, void *ctx)
476 {
477 	struct malloc_channel *ch = ctx;
478 
479 	ch->accel_channel = spdk_accel_engine_get_io_channel();
480 	if (!ch->accel_channel) {
481 		SPDK_ERRLOG("Failed to get accel engine's IO channel\n");
482 		return -ENOMEM;
483 	}
484 
485 	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
486 	if (!ch->completion_poller) {
487 		SPDK_ERRLOG("Failed to register malloc completion poller\n");
488 		spdk_put_io_channel(ch->accel_channel);
489 		return -ENOMEM;
490 	}
491 
492 	TAILQ_INIT(&ch->completed_tasks);
493 
494 	return 0;
495 }
496 
/* io_device destroy callback: tear down a per-thread malloc channel.
 * All queued completions must already have been drained by the poller.
 */
static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}
507 
/* Module init: register the malloc io_device so channels can be created.
 * Always succeeds.
 */
static int bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter. */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}
521 
/* Module fini: unregister the malloc io_device (no completion callback needed). */
static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}
527 
528 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)
529