xref: /spdk/module/bdev/malloc/bdev_malloc.c (revision 307b8c112ffd90a26d53dd15fad67bd9038ef526)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "bdev_malloc.h"
10 #include "spdk/bdev.h"
11 #include "spdk/endian.h"
12 #include "spdk/env.h"
13 #include "spdk/accel.h"
14 #include "spdk/json.h"
15 #include "spdk/thread.h"
16 #include "spdk/queue.h"
17 #include "spdk/string.h"
18 
19 #include "spdk/bdev_module.h"
20 #include "spdk/log.h"
21 
/* One malloc bdev instance: the registered bdev plus its pinned backing buffer. */
struct malloc_disk {
	struct spdk_bdev		disk;		/* base bdev registered with the bdev layer */
	void				*malloc_buf;	/* pinned DMA-able backing store (spdk_zmalloc) */
	TAILQ_ENTRY(malloc_disk)	link;		/* entry on g_malloc_disks */
};
27 
/* Per-I/O context, carved out of bdev_io->driver_ctx (see bdev_malloc_get_ctx_size). */
struct malloc_task {
	int				num_outstanding;	/* accel operations still in flight for this I/O */
	enum spdk_bdev_io_status	status;			/* aggregated status reported on completion */
	TAILQ_ENTRY(malloc_task)	tailq;			/* entry on a channel's completed_tasks list */
};
33 
/* Per-thread I/O channel state. */
struct malloc_channel {
	struct spdk_io_channel		*accel_channel;		/* channel into the accel framework */
	struct spdk_poller		*completion_poller;	/* drains completed_tasks each poll */
	TAILQ_HEAD(, malloc_task)	completed_tasks;	/* tasks completed inline, awaiting callback */
};
39 
40 static void
41 malloc_done(void *ref, int status)
42 {
43 	struct malloc_task *task = (struct malloc_task *)ref;
44 
45 	if (status != 0) {
46 		if (status == -ENOMEM) {
47 			task->status = SPDK_BDEV_IO_STATUS_NOMEM;
48 		} else {
49 			task->status = SPDK_BDEV_IO_STATUS_FAILED;
50 		}
51 	}
52 
53 	if (--task->num_outstanding == 0) {
54 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
55 	}
56 }
57 
/*
 * Mark a task completed with the given status and queue it on the channel's
 * completed_tasks list; the channel's poller performs the actual
 * spdk_bdev_io_complete() later, outside the submission path.
 */
static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}
65 
/* All currently registered malloc disks; also used as the io_device handle. */
static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

/* Counter used to auto-generate "MallocN" names; reset on each module init. */
int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);
72 
/* Size of the per-I/O driver context the bdev layer must reserve for this module. */
static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}
78 
/* Module descriptor registered with the bdev layer. */
static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,

};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)
88 
89 static void
90 malloc_disk_free(struct malloc_disk *malloc_disk)
91 {
92 	if (!malloc_disk) {
93 		return;
94 	}
95 
96 	free(malloc_disk->disk.name);
97 	spdk_free(malloc_disk->malloc_buf);
98 	free(malloc_disk);
99 }
100 
101 static int
102 bdev_malloc_destruct(void *ctx)
103 {
104 	struct malloc_disk *malloc_disk = ctx;
105 
106 	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
107 	malloc_disk_free(malloc_disk);
108 	return 0;
109 }
110 
/*
 * Validate that the iovec array can hold exactly nbytes.
 * Returns 0 when the request is acceptable (an iovec larger than what
 * remains is fine - only part of it will be used), non-zero when the
 * iovecs together are too small to cover nbytes.
 */
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	size_t remaining = nbytes;
	int idx;

	for (idx = 0; idx < iovcnt; idx++) {
		if (remaining < iovs[idx].iov_len) {
			/* This element alone can absorb everything left over. */
			return 0;
		}
		remaining -= iovs[idx].iov_len;
	}

	return remaining != 0;
}
126 
127 static void
128 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
129 		  struct malloc_task *task,
130 		  struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
131 {
132 	int64_t res = 0;
133 	void *src = mdisk->malloc_buf + offset;
134 	int i;
135 
136 	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
137 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
138 				      SPDK_BDEV_IO_STATUS_FAILED);
139 		return;
140 	}
141 
142 	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
143 		      len, offset, iovcnt);
144 
145 	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
146 	task->num_outstanding = 0;
147 
148 	for (i = 0; i < iovcnt; i++) {
149 		task->num_outstanding++;
150 		res = spdk_accel_submit_copy(ch, iov[i].iov_base,
151 					     src, iov[i].iov_len, 0, malloc_done, task);
152 
153 		if (res != 0) {
154 			malloc_done(task, res);
155 			break;
156 		}
157 
158 		src += iov[i].iov_len;
159 		len -= iov[i].iov_len;
160 	}
161 }
162 
163 static void
164 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
165 		   struct malloc_task *task,
166 		   struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
167 {
168 	int64_t res = 0;
169 	void *dst = mdisk->malloc_buf + offset;
170 	int i;
171 
172 	if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
173 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
174 				      SPDK_BDEV_IO_STATUS_FAILED);
175 		return;
176 	}
177 
178 	SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
179 		      len, offset, iovcnt);
180 
181 	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
182 	task->num_outstanding = 0;
183 
184 	for (i = 0; i < iovcnt; i++) {
185 		task->num_outstanding++;
186 		res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
187 					     iov[i].iov_len, 0, malloc_done, task);
188 
189 		if (res != 0) {
190 			malloc_done(task, res);
191 			break;
192 		}
193 
194 		dst += iov[i].iov_len;
195 	}
196 }
197 
198 static int
199 bdev_malloc_unmap(struct malloc_disk *mdisk,
200 		  struct spdk_io_channel *ch,
201 		  struct malloc_task *task,
202 		  uint64_t offset,
203 		  uint64_t byte_count)
204 {
205 	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
206 	task->num_outstanding = 1;
207 
208 	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
209 				      byte_count, 0, malloc_done, task);
210 }
211 
/*
 * Dispatch a bdev_io to the appropriate handler.  Returns 0 when the request
 * was accepted (completion will arrive asynchronously or via the channel's
 * completion poller), negative when the I/O type is unsupported.
 */
static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	uint32_t block_size = bdev_io->bdev->blocklen;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			/* No buffer supplied: point the iov directly at the backing
			 * buffer (zero-copy read) and complete immediately. */
			assert(bdev_io->u.bdev.iovcnt == 1);
			bdev_io->u.bdev.iovs[0].iov_base =
				((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
				bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
					     SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt,
				  mch->accel_channel,
				  (struct malloc_task *)bdev_io->driver_ctx,
				  bdev_io->u.bdev.iovs,
				  bdev_io->u.bdev.iovcnt,
				  bdev_io->u.bdev.num_blocks * block_size,
				  bdev_io->u.bdev.offset_blocks * block_size);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt,
				   mch->accel_channel,
				   (struct malloc_task *)bdev_io->driver_ctx,
				   bdev_io->u.bdev.iovs,
				   bdev_io->u.bdev.iovcnt,
				   bdev_io->u.bdev.num_blocks * block_size,
				   bdev_io->u.bdev.offset_blocks * block_size);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		/* Nothing to reset for a memory disk; complete immediately. */
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		/* Memory writes are immediately durable; flush is a no-op. */
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
					 mch->accel_channel,
					 (struct malloc_task *)bdev_io->driver_ctx,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */
		return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
					 mch->accel_channel,
					 (struct malloc_task *)bdev_io->driver_ctx,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			/* Hand the caller a window directly into the backing buffer. */
			buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
			      bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);

		}
		/* Both zcopy start and end complete immediately. */
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_IO_TYPE_ABORT:
		/* I/Os complete inline or via accel; there is nothing abortable. */
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
		return 0;
	default:
		return -1;
	}
	return 0;
}
297 
298 static void
299 bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
300 {
301 	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);
302 
303 	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
304 		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
305 				     SPDK_BDEV_IO_STATUS_FAILED);
306 	}
307 }
308 
309 static bool
310 bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
311 {
312 	switch (io_type) {
313 	case SPDK_BDEV_IO_TYPE_READ:
314 	case SPDK_BDEV_IO_TYPE_WRITE:
315 	case SPDK_BDEV_IO_TYPE_FLUSH:
316 	case SPDK_BDEV_IO_TYPE_RESET:
317 	case SPDK_BDEV_IO_TYPE_UNMAP:
318 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
319 	case SPDK_BDEV_IO_TYPE_ZCOPY:
320 	case SPDK_BDEV_IO_TYPE_ABORT:
321 		return true;
322 
323 	default:
324 		return false;
325 	}
326 }
327 
/* Return a per-thread channel for the malloc io_device (keyed on g_malloc_disks). */
static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}
333 
/*
 * Emit the JSON-RPC call ("bdev_malloc_create" + params) that would
 * recreate this bdev, for configuration save/replay.
 */
static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	char uuid_str[SPDK_UUID_STRING_LEN];

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
	spdk_json_write_named_string(w, "uuid", uuid_str);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
355 
/* bdev function table wired into every malloc disk at creation time. */
static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct		= bdev_malloc_destruct,
	.submit_request		= bdev_malloc_submit_request,
	.io_type_supported	= bdev_malloc_io_type_supported,
	.get_io_channel		= bdev_malloc_get_io_channel,
	.write_config_json	= bdev_malloc_write_json_config,
};
363 
364 int
365 create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid,
366 		   uint64_t num_blocks, uint32_t block_size, uint32_t optimal_io_boundary)
367 {
368 	struct malloc_disk	*mdisk;
369 	int rc;
370 
371 	if (num_blocks == 0) {
372 		SPDK_ERRLOG("Disk num_blocks must be greater than 0");
373 		return -EINVAL;
374 	}
375 
376 	if (block_size % 512) {
377 		SPDK_ERRLOG("block size must be 512 bytes aligned\n");
378 		return -EINVAL;
379 	}
380 
381 	mdisk = calloc(1, sizeof(*mdisk));
382 	if (!mdisk) {
383 		SPDK_ERRLOG("mdisk calloc() failed\n");
384 		return -ENOMEM;
385 	}
386 
387 	/*
388 	 * Allocate the large backend memory buffer from pinned memory.
389 	 *
390 	 * TODO: need to pass a hint so we know which socket to allocate
391 	 *  from on multi-socket systems.
392 	 */
393 	mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL,
394 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
395 	if (!mdisk->malloc_buf) {
396 		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
397 		malloc_disk_free(mdisk);
398 		return -ENOMEM;
399 	}
400 
401 	if (name) {
402 		mdisk->disk.name = strdup(name);
403 	} else {
404 		/* Auto-generate a name */
405 		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
406 		malloc_disk_count++;
407 	}
408 	if (!mdisk->disk.name) {
409 		malloc_disk_free(mdisk);
410 		return -ENOMEM;
411 	}
412 	mdisk->disk.product_name = "Malloc disk";
413 
414 	mdisk->disk.write_cache = 1;
415 	mdisk->disk.blocklen = block_size;
416 	mdisk->disk.blockcnt = num_blocks;
417 	if (optimal_io_boundary) {
418 		mdisk->disk.optimal_io_boundary = optimal_io_boundary;
419 		mdisk->disk.split_on_optimal_io_boundary = true;
420 	}
421 	if (uuid) {
422 		mdisk->disk.uuid = *uuid;
423 	} else {
424 		spdk_uuid_generate(&mdisk->disk.uuid);
425 	}
426 
427 	mdisk->disk.ctxt = mdisk;
428 	mdisk->disk.fn_table = &malloc_fn_table;
429 	mdisk->disk.module = &malloc_if;
430 
431 	rc = spdk_bdev_register(&mdisk->disk);
432 	if (rc) {
433 		malloc_disk_free(mdisk);
434 		return rc;
435 	}
436 
437 	*bdev = &(mdisk->disk);
438 
439 	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);
440 
441 	return rc;
442 }
443 
444 void
445 delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
446 {
447 	int rc;
448 
449 	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
450 	if (rc != 0) {
451 		cb_fn(cb_arg, rc);
452 	}
453 }
454 
455 static int
456 malloc_completion_poller(void *ctx)
457 {
458 	struct malloc_channel *ch = ctx;
459 	struct malloc_task *task;
460 	TAILQ_HEAD(, malloc_task) completed_tasks;
461 	uint32_t num_completions = 0;
462 
463 	TAILQ_INIT(&completed_tasks);
464 	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);
465 
466 	while (!TAILQ_EMPTY(&completed_tasks)) {
467 		task = TAILQ_FIRST(&completed_tasks);
468 		TAILQ_REMOVE(&completed_tasks, task, tailq);
469 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
470 		num_completions++;
471 	}
472 
473 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
474 }
475 
476 static int
477 malloc_create_channel_cb(void *io_device, void *ctx)
478 {
479 	struct malloc_channel *ch = ctx;
480 
481 	ch->accel_channel = spdk_accel_get_io_channel();
482 	if (!ch->accel_channel) {
483 		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
484 		return -ENOMEM;
485 	}
486 
487 	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
488 	if (!ch->completion_poller) {
489 		SPDK_ERRLOG("Failed to register malloc completion poller\n");
490 		spdk_put_io_channel(ch->accel_channel);
491 		return -ENOMEM;
492 	}
493 
494 	TAILQ_INIT(&ch->completed_tasks);
495 
496 	return 0;
497 }
498 
499 static void
500 malloc_destroy_channel_cb(void *io_device, void *ctx)
501 {
502 	struct malloc_channel *ch = ctx;
503 
504 	assert(TAILQ_EMPTY(&ch->completed_tasks));
505 
506 	spdk_put_io_channel(ch->accel_channel);
507 	spdk_poller_unregister(&ch->completion_poller);
508 }
509 
/* Module init: register the io_device backing per-thread malloc channels. */
static int
bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter. */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}
524 
/* Module teardown: unregister the io_device (channel destruction is async). */
static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}
530 
531 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)
532