/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2019 Intel Corporation.
 *   All rights reserved.
 */
#include "spdk/stdinc.h"
#include "spdk/string.h"
#include "spdk/config.h"
#include "spdk/fd_group.h"
#include "spdk/log.h"
#include "spdk/nvme.h"

#define FUSE_USE_VERSION 31

#include <fuse3/cuse_lowlevel.h>

#include <linux/nvme_ioctl.h>
#include <linux/fs.h>
#include <sys/eventfd.h>

#include "nvme_internal.h"
#include "nvme_io_msg.h"
#include "nvme_cuse.h"

struct cuse_device {
	bool				force_exit;
	char				dev_name[128];
	uint32_t			index;
	int				claim_fd;
	char				lock_name[64];

	struct spdk_nvme_ctrlr		*ctrlr;		/**< NVMe controller */
	uint32_t			nsid;		/**< NVMe namespace ID, or 0 for the controller device */

	struct fuse_session		*session;
	int				fuse_efd;

	struct cuse_device		*ctrlr_device;
	TAILQ_HEAD(, cuse_device)	ns_devices;

	TAILQ_ENTRY(cuse_device)	tailq;
	TAILQ_ENTRY(cuse_device)	cuse_thread_tailq;
};

static pthread_mutex_t g_cuse_mtx = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, cuse_device) g_ctrlr_ctx_head = TAILQ_HEAD_INITIALIZER(g_ctrlr_ctx_head);
static struct spdk_bit_array *g_ctrlr_started;

static pthread_mutex_t g_pending_device_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct spdk_fd_group *g_device_fdgrp;
static int g_cuse_thread_msg_fd;
/* Devices with a freshly created session, waiting for cuse_thread to pick them up. */
static TAILQ_HEAD(, cuse_device) g_pending_device_head = TAILQ_HEAD_INITIALIZER(
			g_pending_device_head);
/* Devices whose session fd is registered with g_device_fdgrp. */
static TAILQ_HEAD(, cuse_device) g_active_device_head = TAILQ_HEAD_INITIALIZER(
			g_active_device_head);

struct cuse_io_ctx {
	struct spdk_nvme_cmd		nvme_cmd;
	enum spdk_nvme_data_transfer	data_transfer;

	uint64_t			lba;
	uint32_t			lba_count;
	uint16_t			apptag;
	uint16_t			appmask;

	void				*data;
	void				*metadata;

	int				data_len;
	int				metadata_len;

	fuse_req_t			req;
};

static void
cuse_io_ctx_free(struct cuse_io_ctx *ctx)
{
	spdk_free(ctx->data);
	spdk_free(ctx->metadata);
	free(ctx);
}

#define FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val)		\
	do {								\
		if (out_bufsz == 0) {					\
			struct iovec out_iov;				\
			out_iov.iov_base = (void *)arg;			\
			out_iov.iov_len = sizeof(val);			\
			fuse_reply_ioctl_retry(req, NULL, 0, &out_iov, 1); \
			return;						\
		}							\
	} while (0)
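
/*
 * Note on the retry dance above: with CUSE_UNRESTRICTED_IOCTL, the first
 * delivery of an ioctl carries no user-space buffers. Replying with
 * fuse_reply_ioctl_retry() tells the kernel which memory regions to map,
 * and the ioctl is then re-issued with in_bufsz/out_bufsz populated.
 * A minimal handler built on this macro might look like the sketch below
 * (handler name and reply value are hypothetical, not part of this file):
 *
 *	static void
 *	my_ioctl(fuse_req_t req, int cmd, void *arg, struct fuse_file_info *fi,
 *		 unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz)
 *	{
 *		int val;
 *
 *		FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val);
 *		val = 0;
 *		fuse_reply_ioctl(req, 0, &val, sizeof(val));
 *	}
 */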

#define FUSE_MAX_SIZE (128 * 1024)

static bool
fuse_check_req_size(fuse_req_t req, struct iovec iov[], int iovcnt)
{
	size_t total_iov_len = 0;
	for (int i = 0; i < iovcnt; i++) {
		total_iov_len += iov[i].iov_len;
		if (total_iov_len > FUSE_MAX_SIZE) {
			fuse_reply_err(req, ENOMEM);
			SPDK_ERRLOG("FUSE request cannot be larger than %d\n", FUSE_MAX_SIZE);
			return false;
		}
	}
	return true;
}

static void
cuse_nvme_passthru_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct cuse_io_ctx *ctx = arg;
	struct iovec out_iov[3];
	struct spdk_nvme_cpl _cpl;
	int out_iovcnt = 0;
	uint16_t status_field = cpl->status_raw >> 1; /* Drop the phase bit */

	memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl));
	out_iov[out_iovcnt].iov_base = &_cpl.cdw0;
	out_iov[out_iovcnt].iov_len = sizeof(_cpl.cdw0);
	out_iovcnt += 1;

	if (ctx->data_transfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		if (ctx->data_len > 0) {
			out_iov[out_iovcnt].iov_base = ctx->data;
			out_iov[out_iovcnt].iov_len = ctx->data_len;
			out_iovcnt += 1;
		}
		if (ctx->metadata_len > 0) {
			out_iov[out_iovcnt].iov_base = ctx->metadata;
			out_iov[out_iovcnt].iov_len = ctx->metadata_len;
			out_iovcnt += 1;
		}
	}

	fuse_reply_ioctl_iov(ctx->req, status_field, out_iov, out_iovcnt);
	cuse_io_ctx_free(ctx);
}

static void
cuse_nvme_passthru_cmd_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	int rc;
	struct cuse_io_ctx *ctx = arg;

	if (nsid != 0) {
		rc = spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, ctrlr->external_io_msgs_qpair, &ctx->nvme_cmd,
							ctx->data,
							ctx->data_len, ctx->metadata, cuse_nvme_passthru_cmd_cb, (void *)ctx);
	} else {
		rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &ctx->nvme_cmd, ctx->data, ctx->data_len,
						   cuse_nvme_passthru_cmd_cb, (void *)ctx);
	}
	if (rc < 0) {
		fuse_reply_err(ctx->req, EINVAL);
		cuse_io_ctx_free(ctx);
	}
}

static void
cuse_nvme_passthru_cmd_send(fuse_req_t req, struct nvme_passthru_cmd *passthru_cmd,
			    const void *data, const void *metadata, int cmd)
{
	struct cuse_io_ctx *ctx;
	struct cuse_device *cuse_device = fuse_req_userdata(req);
	int rv;

	ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
	if (!ctx) {
		SPDK_ERRLOG("Cannot allocate memory for cuse_io_ctx\n");
		fuse_reply_err(req, ENOMEM);
		return;
	}

	ctx->req = req;
	ctx->data_transfer = spdk_nvme_opc_get_data_transfer(passthru_cmd->opcode);

	memset(&ctx->nvme_cmd, 0, sizeof(ctx->nvme_cmd));
	ctx->nvme_cmd.opc = passthru_cmd->opcode;
	ctx->nvme_cmd.nsid = passthru_cmd->nsid;
	ctx->nvme_cmd.cdw10 = passthru_cmd->cdw10;
	ctx->nvme_cmd.cdw11 = passthru_cmd->cdw11;
	ctx->nvme_cmd.cdw12 = passthru_cmd->cdw12;
	ctx->nvme_cmd.cdw13 = passthru_cmd->cdw13;
	ctx->nvme_cmd.cdw14 = passthru_cmd->cdw14;
	ctx->nvme_cmd.cdw15 = passthru_cmd->cdw15;

	ctx->data_len = passthru_cmd->data_len;
	ctx->metadata_len = passthru_cmd->metadata_len;

	if (ctx->data_len > 0) {
		/* Zero the buffer so a short controller-to-host transfer cannot
		 * leak stale memory back to user space.
		 */
		ctx->data = spdk_zmalloc(ctx->data_len, 4096, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->data) {
			SPDK_ERRLOG("Cannot allocate memory for data\n");
			fuse_reply_err(req, ENOMEM);
			free(ctx);
			return;
		}
		if (data != NULL) {
			memcpy(ctx->data, data, ctx->data_len);
		}
	}

	if (ctx->metadata_len > 0) {
		ctx->metadata = spdk_zmalloc(ctx->metadata_len, 4096, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->metadata) {
			SPDK_ERRLOG("Cannot allocate memory for metadata\n");
			fuse_reply_err(req, ENOMEM);
			cuse_io_ctx_free(ctx);
			return;
		}
		if (metadata != NULL) {
			memcpy(ctx->metadata, metadata, ctx->metadata_len);
		}
	}

	if ((unsigned int)cmd != NVME_IOCTL_ADMIN_CMD) {
		/* Send NS for IO IOCTLs */
		rv = nvme_io_msg_send(cuse_device->ctrlr, passthru_cmd->nsid, cuse_nvme_passthru_cmd_execute, ctx);
	} else {
		/* NS == 0 for Admin IOCTLs */
		rv = nvme_io_msg_send(cuse_device->ctrlr, 0, cuse_nvme_passthru_cmd_execute, ctx);
	}
	if (rv) {
		SPDK_ERRLOG("Cannot send io msg to the controller\n");
		fuse_reply_err(req, -rv);
		cuse_io_ctx_free(ctx);
		return;
	}
}

static void
cuse_nvme_passthru_cmd(fuse_req_t req, int cmd, void *arg,
		       struct fuse_file_info *fi, unsigned flags,
		       const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	struct nvme_passthru_cmd *passthru_cmd;
	struct iovec in_iov[3], out_iov[3];
	int in_iovcnt = 0, out_iovcnt = 0;
	const void *dptr = NULL, *mdptr = NULL;
	enum spdk_nvme_data_transfer data_transfer;

	in_iov[in_iovcnt].iov_base = (void *)arg;
	in_iov[in_iovcnt].iov_len = sizeof(*passthru_cmd);
	in_iovcnt += 1;
	if (in_bufsz == 0) {
		fuse_reply_ioctl_retry(req, in_iov, in_iovcnt, NULL, out_iovcnt);
		return;
	}

	passthru_cmd = (struct nvme_passthru_cmd *)in_buf;
	data_transfer = spdk_nvme_opc_get_data_transfer(passthru_cmd->opcode);

	if (data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
		/* Make data pointer accessible (RO) */
		if (passthru_cmd->addr != 0) {
			in_iov[in_iovcnt].iov_base = (void *)passthru_cmd->addr;
			in_iov[in_iovcnt].iov_len = passthru_cmd->data_len;
			in_iovcnt += 1;
		}
		/* Make metadata pointer accessible (RO) */
		if (passthru_cmd->metadata != 0) {
			in_iov[in_iovcnt].iov_base = (void *)passthru_cmd->metadata;
			in_iov[in_iovcnt].iov_len = passthru_cmd->metadata_len;
			in_iovcnt += 1;
		}
	}

	if (!fuse_check_req_size(req, in_iov, in_iovcnt)) {
		return;
	}
	/* Always make result field writeable regardless of data transfer bits */
	out_iov[out_iovcnt].iov_base = &((struct nvme_passthru_cmd *)arg)->result;
	out_iov[out_iovcnt].iov_len = sizeof(uint32_t);
	out_iovcnt += 1;

	if (data_transfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		/* Make data pointer accessible (WO) */
		if (passthru_cmd->data_len > 0) {
			out_iov[out_iovcnt].iov_base = (void *)passthru_cmd->addr;
			out_iov[out_iovcnt].iov_len = passthru_cmd->data_len;
			out_iovcnt += 1;
		}
		/* Make metadata pointer accessible (WO) */
		if (passthru_cmd->metadata_len > 0) {
			out_iov[out_iovcnt].iov_base = (void *)passthru_cmd->metadata;
			out_iov[out_iovcnt].iov_len = passthru_cmd->metadata_len;
			out_iovcnt += 1;
		}
	}

	if (!fuse_check_req_size(req, out_iov, out_iovcnt)) {
		return;
	}

	if (out_bufsz == 0) {
		fuse_reply_ioctl_retry(req, in_iov, in_iovcnt, out_iov, out_iovcnt);
		return;
	}

	if (data_transfer == SPDK_NVME_DATA_BIDIRECTIONAL) {
		fuse_reply_err(req, EINVAL);
		return;
	}

	if (data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
		dptr = (passthru_cmd->addr == 0) ? NULL : (uint8_t *)in_buf + sizeof(*passthru_cmd);
		mdptr = (passthru_cmd->metadata == 0) ? NULL : (uint8_t *)in_buf + sizeof(*passthru_cmd) +
			passthru_cmd->data_len;
	}

	cuse_nvme_passthru_cmd_send(req, passthru_cmd, dptr, mdptr, cmd);
}
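
/*
 * For reference, a user-space caller drives the path above through the
 * standard Linux passthru ioctl. A sketch (assuming the controller node is
 * /dev/spdk/nvme0; error handling omitted) issuing Identify Controller:
 *
 *	struct nvme_passthru_cmd cmd = {
 *		.opcode = 0x06,			// Identify
 *		.nsid = 0,
 *		.addr = (uintptr_t)buf,		// 4096-byte buffer
 *		.data_len = 4096,
 *		.cdw10 = 1,			// CNS 01h: Identify Controller
 *	};
 *	int fd = open("/dev/spdk/nvme0", O_RDWR);
 *	int status = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */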

static void
cuse_nvme_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	int rc;
	fuse_req_t req = arg;

	rc = spdk_nvme_ctrlr_reset(ctrlr);
	if (rc) {
		/* rc is a negated errno; fuse_reply_err() expects a positive one */
		fuse_reply_err(req, -rc);
		return;
	}

	fuse_reply_ioctl_iov(req, 0, NULL, 0);
}

static void
cuse_nvme_subsys_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	int rc;
	fuse_req_t req = arg;

	rc = spdk_nvme_ctrlr_reset_subsystem(ctrlr);
	if (rc) {
		fuse_reply_err(req, -rc);
		return;
	}

	fuse_reply_ioctl_iov(req, 0, NULL, 0);
}

static void
cuse_nvme_reset(fuse_req_t req, int cmd, void *arg,
		struct fuse_file_info *fi, unsigned flags,
		const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	int rv;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	if (cuse_device->nsid) {
		SPDK_ERRLOG("Namespace reset not supported\n");
		fuse_reply_err(req, EINVAL);
		return;
	}

	if (cmd == NVME_IOCTL_SUBSYS_RESET) {
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_SUBSYS_RESET\n");
		rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_subsys_reset_execute,
				      (void *)req);
	} else {
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_RESET\n");
		rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_reset_execute, (void *)req);
	}
	if (rv) {
		SPDK_ERRLOG("Cannot send reset\n");
		fuse_reply_err(req, EINVAL);
	}
}

static void
cuse_nvme_rescan_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	fuse_req_t req = arg;

	nvme_ctrlr_update_namespaces(ctrlr);
	fuse_reply_ioctl_iov(req, 0, NULL, 0);
}

static void
cuse_nvme_rescan(fuse_req_t req, int cmd, void *arg,
		 struct fuse_file_info *fi, unsigned flags,
		 const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	int rv;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	if (cuse_device->nsid) {
		SPDK_ERRLOG("Namespace rescan not supported\n");
		fuse_reply_err(req, EINVAL);
		return;
	}

	rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_rescan_execute, (void *)req);
	if (rv) {
		SPDK_ERRLOG("Cannot send rescan\n");
		fuse_reply_err(req, EINVAL);
	}
}

/*****************************************************************************
 * Namespace IO requests
 */

static void
cuse_nvme_submit_io_write_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref;
	uint16_t status_field = cpl->status_raw >> 1; /* Drop the phase bit */

	fuse_reply_ioctl_iov(ctx->req, status_field, NULL, 0);

	cuse_io_ctx_free(ctx);
}

static void
cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	int rc;
	struct cuse_io_ctx *ctx = arg;
	struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);

	rc = spdk_nvme_ns_cmd_write_with_md(ns, ctrlr->external_io_msgs_qpair, ctx->data, ctx->metadata,
					    ctx->lba, /* LBA start */
					    ctx->lba_count, /* number of LBAs */
					    cuse_nvme_submit_io_write_done, ctx, 0,
					    ctx->appmask, ctx->apptag);

	if (rc != 0) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
		/* rc is a negated errno */
		fuse_reply_err(ctx->req, -rc);
		cuse_io_ctx_free(ctx);
	}
}

static void
cuse_nvme_submit_io_write(struct cuse_device *cuse_device, fuse_req_t req, int cmd, void *arg,
			  struct fuse_file_info *fi, unsigned flags, uint32_t block_size, uint32_t md_size,
			  const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	const struct nvme_user_io *user_io = in_buf;
	struct cuse_io_ctx *ctx;
	int rc;

	ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
	if (!ctx) {
		SPDK_ERRLOG("Cannot allocate memory for context\n");
		fuse_reply_err(req, ENOMEM);
		return;
	}

	ctx->req = req;
	ctx->lba = user_io->slba;
	ctx->lba_count = user_io->nblocks + 1;
	ctx->data_len = ctx->lba_count * block_size;

	ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
				 SPDK_MALLOC_DMA);
	if (ctx->data == NULL) {
		SPDK_ERRLOG("Write buffer allocation failed\n");
		fuse_reply_err(ctx->req, ENOMEM);
		free(ctx);
		return;
	}

	memcpy(ctx->data, (uint8_t *)in_buf + sizeof(*user_io), ctx->data_len);

	if (user_io->metadata) {
		ctx->apptag = user_io->apptag;
		ctx->appmask = user_io->appmask;
		ctx->metadata_len = md_size * ctx->lba_count;
		ctx->metadata = spdk_zmalloc(ctx->metadata_len, 4096, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);

		if (ctx->metadata == NULL) {
			if (ctx->metadata_len == 0) {
				SPDK_ERRLOG("Device format does not support metadata\n");
			} else {
				SPDK_ERRLOG("Cannot allocate memory for metadata\n");
			}
			fuse_reply_err(req, ENOMEM);
			cuse_io_ctx_free(ctx);
			return;
		}

		memcpy(ctx->metadata, (uint8_t *)in_buf + sizeof(*user_io) + ctx->data_len,
		       ctx->metadata_len);
	}

	rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_write_cb,
			      ctx);
	if (rc < 0) {
		SPDK_ERRLOG("Cannot send write io\n");
		/* rc is a negated errno */
		fuse_reply_err(ctx->req, -rc);
		cuse_io_ctx_free(ctx);
	}
}

static void
cuse_nvme_submit_io_read_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref;
	struct iovec iov[2];
	int iovcnt = 0;
	uint16_t status_field = cpl->status_raw >> 1; /* Drop the phase bit */

	iov[iovcnt].iov_base = ctx->data;
	iov[iovcnt].iov_len = ctx->data_len;
	iovcnt += 1;

	if (ctx->metadata) {
		iov[iovcnt].iov_base = ctx->metadata;
		iov[iovcnt].iov_len = ctx->metadata_len;
		iovcnt += 1;
	}

	fuse_reply_ioctl_iov(ctx->req, status_field, iov, iovcnt);

	cuse_io_ctx_free(ctx);
}

static void
cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
{
	int rc;
	struct cuse_io_ctx *ctx = arg;
	struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);

	rc = spdk_nvme_ns_cmd_read_with_md(ns, ctrlr->external_io_msgs_qpair, ctx->data, ctx->metadata,
					   ctx->lba, /* LBA start */
					   ctx->lba_count, /* number of LBAs */
					   cuse_nvme_submit_io_read_done, ctx, 0,
					   ctx->appmask, ctx->apptag);

	if (rc != 0) {
		SPDK_ERRLOG("read failed: rc = %d\n", rc);
		/* rc is a negated errno */
		fuse_reply_err(ctx->req, -rc);
		cuse_io_ctx_free(ctx);
	}
}

static void
cuse_nvme_submit_io_read(struct cuse_device *cuse_device, fuse_req_t req, int cmd, void *arg,
			 struct fuse_file_info *fi, unsigned flags, uint32_t block_size, uint32_t md_size,
			 const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	int rc;
	struct cuse_io_ctx *ctx;
	const struct nvme_user_io *user_io = in_buf;

	ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
	if (!ctx) {
		SPDK_ERRLOG("Cannot allocate memory for context\n");
		fuse_reply_err(req, ENOMEM);
		return;
	}

	ctx->req = req;
	ctx->lba = user_io->slba;
	ctx->lba_count = user_io->nblocks + 1;

	ctx->data_len = ctx->lba_count * block_size;
	ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
				 SPDK_MALLOC_DMA);
	if (ctx->data == NULL) {
		SPDK_ERRLOG("Read buffer allocation failed\n");
		fuse_reply_err(ctx->req, ENOMEM);
		free(ctx);
		return;
	}

	if (user_io->metadata) {
		ctx->apptag = user_io->apptag;
		ctx->appmask = user_io->appmask;
		ctx->metadata_len = md_size * ctx->lba_count;
		ctx->metadata = spdk_zmalloc(ctx->metadata_len, 4096, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);

		if (ctx->metadata == NULL) {
			if (ctx->metadata_len == 0) {
				SPDK_ERRLOG("Device format does not support metadata\n");
			} else {
				SPDK_ERRLOG("Cannot allocate memory for metadata\n");
			}
			fuse_reply_err(req, ENOMEM);
			cuse_io_ctx_free(ctx);
			return;
		}
	}

	rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_read_cb, ctx);
	if (rc < 0) {
		SPDK_ERRLOG("Cannot send read io\n");
		/* rc is a negated errno */
		fuse_reply_err(ctx->req, -rc);
		cuse_io_ctx_free(ctx);
	}
}

static void
cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg,
		    struct fuse_file_info *fi, unsigned flags,
		    const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	const struct nvme_user_io *user_io;
	struct iovec in_iov[3], out_iov[2];
	int in_iovcnt = 0, out_iovcnt = 0;
	struct cuse_device *cuse_device = fuse_req_userdata(req);
	struct spdk_nvme_ns *ns;
	uint32_t block_size;
	uint32_t md_size;

	in_iov[in_iovcnt].iov_base = (void *)arg;
	in_iov[in_iovcnt].iov_len = sizeof(*user_io);
	in_iovcnt += 1;
	if (in_bufsz == 0) {
		fuse_reply_ioctl_retry(req, in_iov, in_iovcnt, NULL, 0);
		return;
	}

	user_io = in_buf;

	ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
	block_size = spdk_nvme_ns_get_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);

	switch (user_io->opcode) {
	case SPDK_NVME_OPC_READ:
		out_iov[out_iovcnt].iov_base = (void *)user_io->addr;
		out_iov[out_iovcnt].iov_len = (user_io->nblocks + 1) * block_size;
		out_iovcnt += 1;
		if (user_io->metadata != 0) {
			out_iov[out_iovcnt].iov_base = (void *)user_io->metadata;
			out_iov[out_iovcnt].iov_len = (user_io->nblocks + 1) * md_size;
			out_iovcnt += 1;
		}
		if (!fuse_check_req_size(req, out_iov, out_iovcnt)) {
			return;
		}
		if (out_bufsz == 0) {
			fuse_reply_ioctl_retry(req, in_iov, in_iovcnt, out_iov, out_iovcnt);
			return;
		}

		cuse_nvme_submit_io_read(cuse_device, req, cmd, arg, fi, flags,
					 block_size, md_size, in_buf, in_bufsz, out_bufsz);
		break;
	case SPDK_NVME_OPC_WRITE:
		in_iov[in_iovcnt].iov_base = (void *)user_io->addr;
		in_iov[in_iovcnt].iov_len = (user_io->nblocks + 1) * block_size;
		in_iovcnt += 1;
		if (user_io->metadata != 0) {
			in_iov[in_iovcnt].iov_base = (void *)user_io->metadata;
			in_iov[in_iovcnt].iov_len = (user_io->nblocks + 1) * md_size;
			in_iovcnt += 1;
		}
		if (!fuse_check_req_size(req, in_iov, in_iovcnt)) {
			return;
		}
		if (in_bufsz == sizeof(*user_io)) {
			fuse_reply_ioctl_retry(req, in_iov, in_iovcnt, NULL, out_iovcnt);
			return;
		}

		cuse_nvme_submit_io_write(cuse_device, req, cmd, arg, fi, flags,
					  block_size, md_size, in_buf, in_bufsz, out_bufsz);
		break;
	default:
		SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode);
		fuse_reply_err(req, EINVAL);
		return;
	}
}
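
/*
 * User-space sketch of the NVME_IOCTL_SUBMIT_IO path handled above
 * (hypothetical namespace node fd, single-block read into buf; nblocks is
 * zero-based per the NVMe spec):
 *
 *	struct nvme_user_io io = {
 *		.opcode = 0x02,			// Read
 *		.nblocks = 0,			// 1 block
 *		.slba = 0,
 *		.addr = (uintptr_t)buf,
 *	};
 *	int status = ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
 */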

/*****************************************************************************
 * Other namespace IOCTLs
 */
static void
cuse_blkgetsize64(fuse_req_t req, int cmd, void *arg,
		  struct fuse_file_info *fi, unsigned flags,
		  const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	uint64_t size;
	struct spdk_nvme_ns *ns;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size);

	ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
	/* Note: this reports the size in sectors, not in bytes as the Linux
	 * block-device BLKGETSIZE64 does (see the dispatch comment below).
	 */
	size = spdk_nvme_ns_get_num_sectors(ns);
	fuse_reply_ioctl(req, 0, &size, sizeof(size));
}

static void
cuse_blkpbszget(fuse_req_t req, int cmd, void *arg,
		struct fuse_file_info *fi, unsigned flags,
		const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	int pbsz;
	struct spdk_nvme_ns *ns;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, pbsz);

	ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
	/* The physical block size is reported as the logical sector size. */
	pbsz = spdk_nvme_ns_get_sector_size(ns);
	fuse_reply_ioctl(req, 0, &pbsz, sizeof(pbsz));
}

static void
cuse_blkgetsize(fuse_req_t req, int cmd, void *arg,
		struct fuse_file_info *fi, unsigned flags,
		const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	long size;
	struct spdk_nvme_ns *ns;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size);

	ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);

	/* Return the size in 512-byte blocks */
	size = spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns) / 512;
	fuse_reply_ioctl(req, 0, &size, sizeof(size));
}

static void
cuse_blkgetsectorsize(fuse_req_t req, int cmd, void *arg,
		      struct fuse_file_info *fi, unsigned flags,
		      const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	int ssize;
	struct spdk_nvme_ns *ns;
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, ssize);

	ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
	ssize = spdk_nvme_ns_get_sector_size(ns);
	fuse_reply_ioctl(req, 0, &ssize, sizeof(ssize));
}

static void
cuse_getid(fuse_req_t req, int cmd, void *arg,
	   struct fuse_file_info *fi, unsigned flags,
	   const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	struct cuse_device *cuse_device = fuse_req_userdata(req);

	fuse_reply_ioctl(req, cuse_device->nsid, NULL, 0);
}

struct cuse_transport {
	char trstring[SPDK_NVMF_TRSTRING_MAX_LEN + 1];
	char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1];
};

#define SPDK_CUSE_GET_TRANSPORT _IOWR('n', 0x1, struct cuse_transport)
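
/*
 * SPDK_CUSE_GET_TRANSPORT is SPDK-specific: it lets tools map a CUSE node
 * back to the underlying transport. A user-space sketch (the caller must
 * duplicate the struct and _IOWR definition above; device path assumed):
 *
 *	struct cuse_transport tr;
 *
 *	int fd = open("/dev/spdk/nvme0", O_RDWR);
 *	if (fd >= 0 && ioctl(fd, SPDK_CUSE_GET_TRANSPORT, &tr) == 0) {
 *		printf("%s %s\n", tr.trstring, tr.traddr);
 *	}
 */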

static void
cuse_get_transport(fuse_req_t req, int cmd, void *arg,
		   struct fuse_file_info *fi, unsigned flags,
		   const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	struct cuse_device *cuse_device = fuse_req_userdata(req);
	struct cuse_transport tr = {};

	FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, tr);

	memcpy(tr.trstring, cuse_device->ctrlr->trid.trstring, SPDK_NVMF_TRSTRING_MAX_LEN + 1);
	memcpy(tr.traddr, cuse_device->ctrlr->trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1);

	fuse_reply_ioctl(req, 0, &tr, sizeof(tr));
}

static void
cuse_ctrlr_ioctl(fuse_req_t req, int cmd, void *arg,
		 struct fuse_file_info *fi, unsigned flags,
		 const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	if (flags & FUSE_IOCTL_COMPAT) {
		fuse_reply_err(req, ENOSYS);
		return;
	}

	switch ((unsigned int)cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_ADMIN_CMD\n");
		cuse_nvme_passthru_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_RESET:
	case NVME_IOCTL_SUBSYS_RESET:
		cuse_nvme_reset(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_RESCAN:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_RESCAN\n");
		cuse_nvme_rescan(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_ID:
		/* Return error but don't ERRLOG - nvme-cli will frequently send this
		 * IOCTL to controller devices.
		 */
		fuse_reply_err(req, ENOTTY);
		break;

	case SPDK_CUSE_GET_TRANSPORT:
		SPDK_DEBUGLOG(nvme_cuse, "SPDK_CUSE_GET_TRANSPORT\n");
		cuse_get_transport(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	default:
		SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd);
		fuse_reply_err(req, ENOTTY);
	}
}

static void
cuse_ns_ioctl(fuse_req_t req, int cmd, void *arg,
	      struct fuse_file_info *fi, unsigned flags,
	      const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
	if (flags & FUSE_IOCTL_COMPAT) {
		fuse_reply_err(req, ENOSYS);
		return;
	}

	switch ((unsigned int)cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_ADMIN_CMD\n");
		cuse_nvme_passthru_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_SUBMIT_IO:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_SUBMIT_IO\n");
		cuse_nvme_submit_io(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_IO_CMD:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_IO_CMD\n");
		cuse_nvme_passthru_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case NVME_IOCTL_ID:
		SPDK_DEBUGLOG(nvme_cuse, "NVME_IOCTL_ID\n");
		cuse_getid(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case BLKPBSZGET:
		SPDK_DEBUGLOG(nvme_cuse, "BLKPBSZGET\n");
		cuse_blkpbszget(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case BLKSSZGET:
		SPDK_DEBUGLOG(nvme_cuse, "BLKSSZGET\n");
		cuse_blkgetsectorsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case BLKGETSIZE:
		SPDK_DEBUGLOG(nvme_cuse, "BLKGETSIZE\n");
		/* Returns the device size as a number of 512-byte blocks (returns pointer to long) */
		cuse_blkgetsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	case BLKGETSIZE64:
		SPDK_DEBUGLOG(nvme_cuse, "BLKGETSIZE64\n");
		/* Returns the device size in sectors (returns pointer to uint64_t) */
		cuse_blkgetsize64(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
		break;

	default:
		SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd);
		fuse_reply_err(req, ENOTTY);
	}
}

/*****************************************************************************
 * CUSE threads initialization.
 */

static void
cuse_open(fuse_req_t req, struct fuse_file_info *fi)
{
	fuse_reply_open(req, fi);
}

static const struct cuse_lowlevel_ops cuse_ctrlr_clop = {
	.open		= cuse_open,
	.ioctl		= cuse_ctrlr_ioctl,
};

static const struct cuse_lowlevel_ops cuse_ns_clop = {
	.open		= cuse_open,
	.ioctl		= cuse_ns_ioctl,
};

static int
cuse_session_create(struct cuse_device *cuse_device)
{
	char *cuse_argv[] = { "cuse", "-f" };
	int multithreaded;
	int cuse_argc = SPDK_COUNTOF(cuse_argv);
	struct cuse_info ci;
	char devname_arg[128 + 8];
	const char *dev_info_argv[] = { devname_arg };

	snprintf(devname_arg, sizeof(devname_arg), "DEVNAME=%s", cuse_device->dev_name);

	memset(&ci, 0, sizeof(ci));
	ci.dev_info_argc = 1;
	ci.dev_info_argv = dev_info_argv;
	ci.flags = CUSE_UNRESTRICTED_IOCTL;

	if (cuse_device->nsid) {
		cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ns_clop,
				       &multithreaded, cuse_device);
	} else {
		cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ctrlr_clop,
				       &multithreaded, cuse_device);
	}

	if (!cuse_device->session) {
		SPDK_ERRLOG("Cannot create cuse session\n");
		return -1;
	}
	SPDK_NOTICELOG("fuse session for device %s created\n", cuse_device->dev_name);
	cuse_device->fuse_efd = fuse_session_fd(cuse_device->session);

	pthread_mutex_lock(&g_pending_device_mtx);
	TAILQ_INSERT_TAIL(&g_pending_device_head, cuse_device, cuse_thread_tailq);
	if (eventfd_write(g_cuse_thread_msg_fd, 1) != 0) {
		int rc = -errno;	/* capture errno before logging can clobber it */

		TAILQ_REMOVE(&g_pending_device_head, cuse_device, cuse_thread_tailq);
		pthread_mutex_unlock(&g_pending_device_mtx);
		SPDK_ERRLOG("eventfd_write failed: (%s).\n", spdk_strerror(-rc));
		return rc;
	}
	pthread_mutex_unlock(&g_pending_device_mtx);
	return 0;
}
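
/*
 * Device/session handoff: cuse_session_create() runs on the thread that
 * registers the controller. It queues the new device on
 * g_pending_device_head and kicks g_cuse_thread_msg_fd; cuse_thread wakes
 * up in spdk_fd_group_wait(), and cuse_thread_add_session() moves the
 * device to g_active_device_head with its session fd registered in
 * g_device_fdgrp. From then on, all FUSE traffic for the device is served
 * by cuse_thread via process_cuse_event().
 */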

static int
process_cuse_event(void *arg)
{
	struct fuse_session *session = arg;
	struct fuse_buf buf = { .mem = NULL };
	int rc = fuse_session_receive_buf(session, &buf);

	if (rc > 0) {
		fuse_session_process_buf(session, &buf);
	}
	free(buf.mem);
	return 0;
}

static int
cuse_thread_add_session(void *arg)
{
	struct cuse_device *cuse_device, *tmp;
	int ret;
	eventfd_t val;

	eventfd_read(g_cuse_thread_msg_fd, &val);

	pthread_mutex_lock(&g_pending_device_mtx);
	TAILQ_FOREACH_SAFE(cuse_device, &g_pending_device_head, cuse_thread_tailq, tmp) {
		ret = spdk_fd_group_add(g_device_fdgrp, cuse_device->fuse_efd, process_cuse_event,
					cuse_device->session, cuse_device->dev_name);
		if (ret < 0) {
			SPDK_ERRLOG("Failed to add fd %d: (%s).\n", cuse_device->fuse_efd,
				    spdk_strerror(-ret));
			TAILQ_REMOVE(&g_pending_device_head, cuse_device, cuse_thread_tailq);
			free(cuse_device);
			/* The device is still referenced from the controller lists, so
			 * there is no clean recovery path from here; treat this as fatal.
			 */
			assert(false);
		}
	}
	TAILQ_CONCAT(&g_active_device_head, &g_pending_device_head, cuse_thread_tailq);
	pthread_mutex_unlock(&g_pending_device_mtx);
	return 0;
}

static void *
cuse_thread(void *unused)
{
	struct cuse_device *cuse_device, *tmp;
	int timeout_msecs = 500;
	bool retry;

	spdk_unaffinitize_thread();

	do {
		retry = false;
		spdk_fd_group_wait(g_device_fdgrp, timeout_msecs);
		while (!TAILQ_EMPTY(&g_active_device_head)) {
			TAILQ_FOREACH_SAFE(cuse_device, &g_active_device_head, cuse_thread_tailq, tmp) {
				if (fuse_session_exited(cuse_device->session)) {
					spdk_fd_group_remove(g_device_fdgrp, cuse_device->fuse_efd);
					fuse_session_reset(cuse_device->session);
					TAILQ_REMOVE(&g_active_device_head, cuse_device, cuse_thread_tailq);
					if (cuse_device->force_exit) {
						cuse_lowlevel_teardown(cuse_device->session);
						free(cuse_device);
					}
				}
			}
			/* Receive and process fuse events and new cuse device addition requests. */
			spdk_fd_group_wait(g_device_fdgrp, timeout_msecs);
		}
		/* Take g_cuse_mtx for the pending-list check; new devices are only
		 * registered while that mutex is held (see spdk_nvme_cuse_register()).
		 */
		pthread_mutex_lock(&g_cuse_mtx);
		if (!TAILQ_EMPTY(&g_pending_device_head)) {
			pthread_mutex_unlock(&g_cuse_mtx);
			/* Retry as we have some cuse devices pending to be polled on. */
			retry = true;
		}
	} while (retry);

	/* Still holding g_cuse_mtx here, so no new session can appear while tearing down. */
	spdk_fd_group_remove(g_device_fdgrp, g_cuse_thread_msg_fd);
	close(g_cuse_thread_msg_fd);
	spdk_fd_group_destroy(g_device_fdgrp);
	g_device_fdgrp = NULL;
	pthread_mutex_unlock(&g_cuse_mtx);
	SPDK_NOTICELOG("Cuse thread exited.\n");
	return NULL;
}

static struct cuse_device *nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr,
		uint32_t nsid);

/*****************************************************************************
 * CUSE devices management
 */

static int
cuse_nvme_ns_start(struct cuse_device *ctrlr_device, uint32_t nsid)
{
	struct cuse_device *ns_device = NULL;
	int rv;

	ns_device = nvme_cuse_get_cuse_ns_device(ctrlr_device->ctrlr, nsid);
	if (ns_device != NULL) {
		return 0;
	}

	ns_device = calloc(1, sizeof(struct cuse_device));
	if (ns_device == NULL) {
		return -ENOMEM;
	}

	ns_device->ctrlr = ctrlr_device->ctrlr;
	ns_device->ctrlr_device = ctrlr_device;
	ns_device->nsid = nsid;
	rv = snprintf(ns_device->dev_name, sizeof(ns_device->dev_name), "%sn%u",
		      ctrlr_device->dev_name, ns_device->nsid);
	if (rv < 0 || rv >= (int)sizeof(ns_device->dev_name)) {
		/* snprintf() reports truncation via the would-be length, not a negative value */
		SPDK_ERRLOG("Device name too long.\n");
		rv = -ENAMETOOLONG;
		goto free_device;
	}

	rv = cuse_session_create(ns_device);
	if (rv != 0) {
		goto free_device;
	}

	TAILQ_INSERT_TAIL(&ctrlr_device->ns_devices, ns_device, tailq);

	return 0;

free_device:
	free(ns_device);
	return rv;
}

static void
cuse_nvme_ns_stop(struct cuse_device *ctrlr_device, struct cuse_device *ns_device)
{
	TAILQ_REMOVE(&ctrlr_device->ns_devices, ns_device, tailq);
	/* ns_device will be freed by cuse_thread */
	if (ns_device->session != NULL) {
		ns_device->force_exit = true;
		fuse_session_exit(ns_device->session);
	}
}

static int
nvme_cuse_claim(struct cuse_device *ctrlr_device, uint32_t index)
{
	int dev_fd;
	int pid;
	int rc;
	void *dev_map;
	struct flock cusedev_lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};

	snprintf(ctrlr_device->lock_name, sizeof(ctrlr_device->lock_name),
		 "/var/tmp/spdk_nvme_cuse_lock_%" PRIu32, index);

	dev_fd = open(ctrlr_device->lock_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (dev_fd == -1) {
		/* Capture errno before logging can clobber it. */
		rc = -errno;
		SPDK_ERRLOG("could not open %s\n", ctrlr_device->lock_name);
		return rc;
	}

	if (ftruncate(dev_fd, sizeof(int)) != 0) {
		rc = -errno;
		SPDK_ERRLOG("could not truncate %s\n", ctrlr_device->lock_name);
		close(dev_fd);
		return rc;
	}

	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
		       MAP_SHARED, dev_fd, 0);
	if (dev_map == MAP_FAILED) {
		rc = -errno;
		SPDK_ERRLOG("could not mmap dev %s (%d)\n", ctrlr_device->lock_name, -rc);
		close(dev_fd);
		return rc;
	}

	if (fcntl(dev_fd, F_SETLK, &cusedev_lock) != 0) {
		pid = *(int *)dev_map;
		SPDK_ERRLOG("Cannot create lock on device %s, probably"
			    " process %d has claimed it\n", ctrlr_device->lock_name, pid);
		munmap(dev_map, sizeof(int));
		close(dev_fd);
		/* F_SETLK returns unspecified errnos, normalize them */
		return -EACCES;
	}

	*(int *)dev_map = (int)getpid();
	munmap(dev_map, sizeof(int));
	ctrlr_device->claim_fd = dev_fd;
	ctrlr_device->index = index;
	/* Keep dev_fd open to maintain the lock. */
	return 0;
}

static void
nvme_cuse_unclaim(struct cuse_device *ctrlr_device)
{
	close(ctrlr_device->claim_fd);
	ctrlr_device->claim_fd = -1;
	unlink(ctrlr_device->lock_name);
}

static void
cuse_nvme_ctrlr_stop(struct cuse_device *ctrlr_device)
{
	struct cuse_device *ns_device, *tmp;

	TAILQ_FOREACH_SAFE(ns_device, &ctrlr_device->ns_devices, tailq, tmp) {
		cuse_nvme_ns_stop(ctrlr_device, ns_device);
	}

	assert(TAILQ_EMPTY(&ctrlr_device->ns_devices));

	spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index);
	if (spdk_bit_array_count_set(g_ctrlr_started) == 0) {
		spdk_bit_array_free(&g_ctrlr_started);
	}
	nvme_cuse_unclaim(ctrlr_device);

	TAILQ_REMOVE(&g_ctrlr_ctx_head, ctrlr_device, tailq);
	/* ctrlr_device will be freed by cuse_thread */
	ctrlr_device->force_exit = true;
	fuse_session_exit(ctrlr_device->session);
}

static int
cuse_nvme_ctrlr_update_namespaces(struct cuse_device *ctrlr_device)
{
	struct cuse_device *ns_device, *tmp;
	uint32_t nsid;

	/* Remove namespaces that have disappeared */
	TAILQ_FOREACH_SAFE(ns_device, &ctrlr_device->ns_devices, tailq, tmp) {
		if (!spdk_nvme_ctrlr_is_active_ns(ctrlr_device->ctrlr, ns_device->nsid)) {
			cuse_nvme_ns_stop(ctrlr_device, ns_device);
		}
	}

	/* Add new namespaces */
	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr_device->ctrlr);
	while (nsid != 0) {
		if (cuse_nvme_ns_start(ctrlr_device, nsid) < 0) {
			SPDK_ERRLOG("Cannot start CUSE namespace device.\n");
			return -1;
		}

		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr_device->ctrlr, nsid);
	}

	return 0;
}

#ifdef FUSE_LOG_H_
static void
nvme_fuse_log_func(enum fuse_log_level level, const char *fmt, va_list ap)
{
	/* fuse will unnecessarily print this log message when tearing down
	 * sessions, once for every session after the first. So use this custom
	 * log handler to silence that specific log message.
	 */
	if (strstr(fmt, "fuse_remove_signal_handlers: unknown session") != NULL) {
		return;
	}

	vfprintf(stderr, fmt, ap);
}
#endif

static int
nvme_cuse_start(struct spdk_nvme_ctrlr *ctrlr)
{
	int rv = 0;
	struct cuse_device *ctrlr_device;

	SPDK_NOTICELOG("Creating cuse device for controller\n");

	if (g_ctrlr_started == NULL) {
		g_ctrlr_started = spdk_bit_array_create(128);
		if (g_ctrlr_started == NULL) {
			SPDK_ERRLOG("Cannot create bit array\n");
			return -ENOMEM;
		}
#ifdef FUSE_LOG_H_
		/* Older versions of libfuse don't have fuse_set_log_func nor
		 * fuse_log.h, so this is the easiest way to check for it
		 * without adding a separate CONFIG flag.
		 */
		fuse_set_log_func(nvme_fuse_log_func);
#endif
	}

	ctrlr_device = (struct cuse_device *)calloc(1, sizeof(struct cuse_device));
	if (!ctrlr_device) {
		SPDK_ERRLOG("Cannot allocate memory for ctrlr_device.\n");
		rv = -ENOMEM;
		goto free_device;
	}

	ctrlr_device->ctrlr = ctrlr;

	/* Check if device already exists, if not increment index until success */
	ctrlr_device->index = 0;
	while (1) {
		ctrlr_device->index = spdk_bit_array_find_first_clear(g_ctrlr_started, ctrlr_device->index);
		if (ctrlr_device->index == UINT32_MAX) {
			SPDK_ERRLOG("Too many registered controllers\n");
			rv = -ENOSPC;
			goto free_device;
		}

		if (nvme_cuse_claim(ctrlr_device, ctrlr_device->index) == 0) {
			break;
		}
		ctrlr_device->index++;
	}
	spdk_bit_array_set(g_ctrlr_started, ctrlr_device->index);
	snprintf(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name), "spdk/nvme%u",
		 ctrlr_device->index);

	rv = cuse_session_create(ctrlr_device);
	if (rv != 0) {
		goto clear_and_free;
	}

	TAILQ_INSERT_TAIL(&g_ctrlr_ctx_head, ctrlr_device, tailq);

	TAILQ_INIT(&ctrlr_device->ns_devices);

	/* Start all active namespaces */
	if (cuse_nvme_ctrlr_update_namespaces(ctrlr_device) < 0) {
		SPDK_ERRLOG("Cannot start CUSE namespace devices.\n");
		cuse_nvme_ctrlr_stop(ctrlr_device);
		return -1;
	}

	return 0;

clear_and_free:
	spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index);
free_device:
	free(ctrlr_device);
	if (spdk_bit_array_count_set(g_ctrlr_started) == 0) {
		spdk_bit_array_free(&g_ctrlr_started);
	}
	return rv;
}

static struct cuse_device *
nvme_cuse_get_cuse_ctrlr_device(struct spdk_nvme_ctrlr *ctrlr)
{
	struct cuse_device *ctrlr_device = NULL;

	TAILQ_FOREACH(ctrlr_device, &g_ctrlr_ctx_head, tailq) {
		if (ctrlr_device->ctrlr == ctrlr) {
			break;
		}
	}

	return ctrlr_device;
}

static struct cuse_device *
nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
{
	struct cuse_device *ctrlr_device = NULL;
	struct cuse_device *ns_device;

	ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
	if (!ctrlr_device) {
		return NULL;
	}

	TAILQ_FOREACH(ns_device, &ctrlr_device->ns_devices, tailq) {
		if (ns_device->nsid == nsid) {
			return ns_device;
		}
	}

	return NULL;
}

static void
nvme_cuse_stop(struct spdk_nvme_ctrlr *ctrlr)
{
	struct cuse_device *ctrlr_device;

	assert(spdk_process_is_primary());

	pthread_mutex_lock(&g_cuse_mtx);

	ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
	if (!ctrlr_device) {
		SPDK_ERRLOG("Cannot find associated CUSE device\n");
		pthread_mutex_unlock(&g_cuse_mtx);
		return;
	}

	cuse_nvme_ctrlr_stop(ctrlr_device);

	pthread_mutex_unlock(&g_cuse_mtx);
}

static void
nvme_cuse_update(struct spdk_nvme_ctrlr *ctrlr)
{
	struct cuse_device *ctrlr_device;

	assert(spdk_process_is_primary());

	pthread_mutex_lock(&g_cuse_mtx);

	ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
	if (!ctrlr_device) {
		pthread_mutex_unlock(&g_cuse_mtx);
		return;
	}

	cuse_nvme_ctrlr_update_namespaces(ctrlr_device);

	pthread_mutex_unlock(&g_cuse_mtx);
}

static struct nvme_io_msg_producer cuse_nvme_io_msg_producer = {
	.name = "cuse",
	.stop = nvme_cuse_stop,
	.update = nvme_cuse_update,
};

static int
start_cuse_thread(void)
{
	int rc = 0;
	pthread_t tid;

	rc = spdk_fd_group_create(&g_device_fdgrp);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to create fd group: (%s).\n", spdk_strerror(-rc));
		return rc;
	}

	g_cuse_thread_msg_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (g_cuse_thread_msg_fd < 0) {
		/* Capture errno before logging can clobber it. */
		rc = -errno;
		SPDK_ERRLOG("Failed to create eventfd: (%s).\n", spdk_strerror(-rc));
		goto destroy_fd_group;
	}

	rc = SPDK_FD_GROUP_ADD(g_device_fdgrp, g_cuse_thread_msg_fd,
			       cuse_thread_add_session, NULL);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to add fd %d: %s.\n", g_cuse_thread_msg_fd,
			    spdk_strerror(-rc));
		goto close_and_destroy_fd;
	}

	rc = pthread_create(&tid, NULL, cuse_thread, NULL);
	if (rc != 0) {
		SPDK_ERRLOG("pthread_create failed\n");
		rc = -rc;
		goto remove_close_and_destroy_fd;
	}
	pthread_detach(tid);
	pthread_setname_np(tid, "cuse_thread");
	SPDK_NOTICELOG("Successfully started cuse thread to poll for admin commands\n");
	return rc;

remove_close_and_destroy_fd:
	spdk_fd_group_remove(g_device_fdgrp, g_cuse_thread_msg_fd);
close_and_destroy_fd:
	close(g_cuse_thread_msg_fd);
destroy_fd_group:
	spdk_fd_group_destroy(g_device_fdgrp);
	g_device_fdgrp = NULL;
	return rc;
}

int
spdk_nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr)
{
	int rc;

	if (!spdk_process_is_primary()) {
		SPDK_ERRLOG("only allowed from primary process\n");
		return -EINVAL;
	}

	rc = nvme_io_msg_ctrlr_register(ctrlr, &cuse_nvme_io_msg_producer);
	if (rc) {
		return rc;
	}

	pthread_mutex_lock(&g_cuse_mtx);

	if (g_device_fdgrp == NULL) {
		rc = start_cuse_thread();
		if (rc < 0) {
			SPDK_ERRLOG("Failed to start cuse thread to poll for admin commands\n");
			pthread_mutex_unlock(&g_cuse_mtx);
			/* Undo the producer registration done above. */
			nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer);
			return rc;
		}
	}

	rc = nvme_cuse_start(ctrlr);
	if (rc) {
		nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer);
	}

	pthread_mutex_unlock(&g_cuse_mtx);

	return rc;
}
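
/*
 * Usage sketch (error handling and loop control are hypothetical): CUSE
 * requests are executed through the controller's io_msg machinery, which
 * SPDK drains as part of spdk_nvme_ctrlr_process_admin_completions(), so
 * the application must keep polling the admin queue after registering:
 *
 *	if (spdk_nvme_cuse_register(ctrlr) == 0) {
 *		while (app_running) {
 *			spdk_nvme_ctrlr_process_admin_completions(ctrlr);
 *		}
 *		spdk_nvme_cuse_unregister(ctrlr);
 *	}
 */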

int
spdk_nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr)
{
	struct cuse_device *ctrlr_device;

	if (!spdk_process_is_primary()) {
		SPDK_ERRLOG("only allowed from primary process\n");
		return -EINVAL;
	}

	pthread_mutex_lock(&g_cuse_mtx);

	ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
	if (!ctrlr_device) {
		SPDK_ERRLOG("Cannot find associated CUSE device\n");
		pthread_mutex_unlock(&g_cuse_mtx);
		return -ENODEV;
	}

	cuse_nvme_ctrlr_stop(ctrlr_device);

	pthread_mutex_unlock(&g_cuse_mtx);

	nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer);

	return 0;
}

void
spdk_nvme_cuse_update_namespaces(struct spdk_nvme_ctrlr *ctrlr)
{
	nvme_cuse_update(ctrlr);
}

int
spdk_nvme_cuse_get_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr, char *name, size_t *size)
{
	struct cuse_device *ctrlr_device;
	size_t req_len;

	pthread_mutex_lock(&g_cuse_mtx);

	ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
	if (!ctrlr_device) {
		pthread_mutex_unlock(&g_cuse_mtx);
		return -ENODEV;
	}

	req_len = strnlen(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name));
	/* Require room for the terminating NUL that snprintf() writes. */
	if (*size < req_len + 1) {
		*size = req_len + 1;
		pthread_mutex_unlock(&g_cuse_mtx);
		return -ENOSPC;
	}
	snprintf(name, req_len + 1, "%s", ctrlr_device->dev_name);

	pthread_mutex_unlock(&g_cuse_mtx);

	return 0;
}

int
spdk_nvme_cuse_get_ns_name(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, char *name, size_t *size)
{
	struct cuse_device *ns_device;
	size_t req_len;

	pthread_mutex_lock(&g_cuse_mtx);

	ns_device = nvme_cuse_get_cuse_ns_device(ctrlr, nsid);
	if (!ns_device) {
		pthread_mutex_unlock(&g_cuse_mtx);
		return -ENODEV;
	}

	req_len = strnlen(ns_device->dev_name, sizeof(ns_device->dev_name));
	/* Require room for the terminating NUL that snprintf() writes. */
	if (*size < req_len + 1) {
		*size = req_len + 1;
		pthread_mutex_unlock(&g_cuse_mtx);
		return -ENOSPC;
	}
	snprintf(name, req_len + 1, "%s", ns_device->dev_name);

	pthread_mutex_unlock(&g_cuse_mtx);

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(nvme_cuse)