xref: /spdk/app/fio/nvme/fio_plugin.c (revision 34edd9f1bf5fda4c987f4500ddc3c9f50be32e7d)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "spdk/nvme.h"
9 #include "spdk/nvme_zns.h"
10 #include "spdk/vmd.h"
11 #include "spdk/env.h"
12 #include "spdk/string.h"
13 #include "spdk/log.h"
14 #include "spdk/likely.h"
15 #include "spdk/endian.h"
16 #include "spdk/dif.h"
17 #include "spdk/util.h"
18 #include "spdk/trace.h"
19 
20 #include "config-host.h"
21 #include "fio.h"
22 #include "optgroup.h"
23 
24 #ifdef for_each_rw_ddir
25 #define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
26 #define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 35)
27 #define FIO_HAS_MRT (FIO_IOOPS_VERSION >= 34)
28 #else
29 #define FIO_HAS_ZBD (0)
30 #define FIO_HAS_FDP (0)
31 #define FIO_HAS_MRT (0)
32 #endif
33 
34 /* FreeBSD is missing CLOCK_MONOTONIC_RAW,
35  * so an alternative is provided. */
36 #ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
37 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
38 #endif
39 
40 #define NVME_IO_ALIGN		4096
41 
42 static bool g_spdk_env_initialized;
43 static bool g_log_flag_error;
44 static int g_spdk_enable_sgl = 0;
45 static uint32_t g_spdk_sge_size = 4096;
46 static uint32_t g_spdk_bit_bucket_data_len = 0;
47 static uint32_t g_spdk_pract_flag;
48 static uint32_t g_spdk_prchk_flags;
49 static uint32_t g_spdk_md_per_io_size = 4096;
50 static uint16_t g_spdk_apptag;
51 static uint16_t g_spdk_apptag_mask;
52 
53 struct spdk_fio_options {
54 	void	*pad;	/* off1 used in the option descriptions below must not be 0 */
55 	int	enable_wrr;
56 	int	arbitration_burst;
57 	int	low_weight;
58 	int	medium_weight;
59 	int	high_weight;
60 	int	wrr_priority;
61 	int	mem_size;
62 	int	shm_id;
63 	int	enable_sgl;
64 	int	sge_size;
65 	int	bit_bucket_data_len;
66 	char	*hostnqn;
67 	int	pi_act;
68 	char	*pi_chk;
69 	int	md_per_io_size;
70 	int	apptag;
71 	int	apptag_mask;
72 	char	*digest_enable;
73 	int	enable_vmd;
74 	int	initial_zone_reset;
75 	int	zone_append;
76 	int	print_qid_mappings;
77 	int	spdk_tracing;
78 	char	*log_flags;
79 	int	disable_pcie_sgl_merge;
80 };
81 
82 struct spdk_fio_request {
83 	struct io_u		*io;
84 	/** Offset in current iovec, fio only uses 1 vector */
85 	uint32_t		iov_offset;
86 
87 	/** Amount of data used for Bit Bucket SGL */
88 	uint32_t		bit_bucket_data_len;
89 
90 	/** Context for NVMe PI */
91 	struct spdk_dif_ctx	dif_ctx;
92 	/** Separate metadata buffer pointer */
93 	void			*md_buf;
94 
95 	/** Dataset management range information */
96 	struct spdk_nvme_dsm_range *dsm_range;
97 
98 	struct spdk_fio_thread	*fio_thread;
99 	struct spdk_fio_qpair	*fio_qpair;
100 };
101 
102 struct spdk_fio_ctrlr {
103 	struct spdk_nvme_transport_id	tr_id;
104 	struct spdk_nvme_ctrlr_opts	opts;
105 	struct spdk_nvme_ctrlr		*ctrlr;
106 	TAILQ_ENTRY(spdk_fio_ctrlr)	link;
107 };
108 
109 static TAILQ_HEAD(, spdk_fio_ctrlr) g_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ctrlrs);
110 static int g_td_count;
111 static pthread_t g_ctrlr_thread_id = 0;
112 static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
113 static bool g_error;
114 
115 struct spdk_fio_qpair {
116 	struct fio_file			*f;
117 	struct spdk_nvme_qpair		*qpair;
118 	struct spdk_nvme_ns		*ns;
119 	uint32_t			io_flags;
120 	bool				zone_append_enabled;
121 	bool				nvme_pi_enabled;
122 	/* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
123 	bool				extended_lba;
124 	/* True for protection info transferred at start of metadata,
125 	 * false for protection info transferred at end of metadata, and
126 	 * this is valid only if nvme_pi_enabled is true.
127 	 */
128 	bool				md_start;
129 	TAILQ_ENTRY(spdk_fio_qpair)	link;
130 	struct spdk_fio_ctrlr		*fio_ctrlr;
131 };
132 
133 struct spdk_fio_thread {
134 	struct thread_data		*td;
135 
136 	TAILQ_HEAD(, spdk_fio_qpair)	fio_qpair;
137 	struct spdk_fio_qpair		*fio_qpair_current;	/* the current fio_qpair to be handled. */
138 
139 	struct io_u			**iocq;		/* io completion queue */
140 	unsigned int			iocq_count;	/* number of iocq entries filled by last getevents */
141 	unsigned int			iocq_size;	/* number of iocq entries allocated */
142 
143 };
144 
145 struct spdk_fio_probe_ctx {
146 	struct thread_data	*td;
147 	char			hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
148 	struct fio_file		*f; /* fio_file given by user */
149 };
150 
151 static void *
152 spdk_fio_poll_ctrlrs(void *arg)
153 {
154 	struct spdk_fio_ctrlr *fio_ctrlr;
155 	int oldstate;
156 	int rc;
157 
158 	/* Loop until the thread is cancelled */
159 	while (true) {
160 		rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
161 		if (rc != 0) {
162 			SPDK_ERRLOG("Unable to set cancel state disabled on g_init_thread (%d): %s\n",
163 				    rc, spdk_strerror(rc));
164 		}
165 
166 		pthread_mutex_lock(&g_mutex);
167 
168 		TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
169 			spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
170 		}
171 
172 		pthread_mutex_unlock(&g_mutex);
173 
174 		rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
175 		if (rc != 0) {
176 			SPDK_ERRLOG("Unable to set cancel state enabled on g_init_thread (%d): %s\n",
177 				    rc, spdk_strerror(rc));
178 		}
179 
180 		/* This is a pthread cancellation point and cannot be removed. */
181 		sleep(1);
182 	}
183 
184 	return NULL;
185 }
186 
187 static bool
188 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
189 	 struct spdk_nvme_ctrlr_opts *opts)
190 {
191 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
192 	struct thread_data *td = ctx->td;
193 	struct spdk_fio_options *fio_options = td->eo;
194 
195 	if (ctx->hostnqn[0] != '\0') {
196 		memcpy(opts->hostnqn, ctx->hostnqn, sizeof(opts->hostnqn));
197 	} else if (fio_options->hostnqn) {
198 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
199 	}
200 
201 	if (fio_options->enable_wrr) {
202 		opts->arb_mechanism		= SPDK_NVME_CC_AMS_WRR;
203 		opts->arbitration_burst		= fio_options->arbitration_burst;
204 		opts->low_priority_weight	= fio_options->low_weight;
205 		opts->medium_priority_weight	= fio_options->medium_weight;
206 		opts->high_priority_weight	= fio_options->high_weight;
207 	}
208 
209 	if (fio_options->digest_enable) {
210 		if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
211 			opts->header_digest = true;
212 		} else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
213 			opts->data_digest = true;
214 		} else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
215 			opts->header_digest = true;
216 			opts->data_digest = true;
217 		}
218 	}
219 
220 	return true;
221 }
222 
223 static struct spdk_fio_ctrlr *
224 get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
225 {
226 	struct spdk_fio_ctrlr	*fio_ctrlr;
227 
228 	TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
229 		if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
230 			return fio_ctrlr;
231 		}
232 	}
233 
234 	return NULL;
235 }
236 
237 /**
238  * Returns the fio_qpair that matches the given fio_file and has an associated ns
239  */
240 static struct spdk_fio_qpair *
241 get_fio_qpair(struct spdk_fio_thread *fio_thread, struct fio_file *f)
242 {
243 	struct spdk_fio_qpair	*fio_qpair;
244 
245 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
246 		if ((fio_qpair->f == f) && fio_qpair->ns) {
247 			return fio_qpair;
248 		}
249 	}
250 
251 	return NULL;
252 }
253 
254 #if FIO_HAS_ZBD
255 /**
256  * Callback function to use while processing completions until the completion indicator turns non-zero
257  */
258 static void
259 pcu_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
260 {
261 	int *completed = ctx;
262 
263 	*completed = spdk_nvme_cpl_is_error(cpl) ? -1 : 1;
264 }
265 
266 /**
267  * Process Completions Until the given 'completed' indicator turns non-zero or an error occurs
268  */
269 static int32_t
270 pcu(struct spdk_nvme_qpair *qpair, int *completed)
271 {
272 	int32_t ret;
273 
274 	while (!*completed) {
275 		ret = spdk_nvme_qpair_process_completions(qpair, 1);
276 		if (ret < 0) {
277 			log_err("spdk/nvme: process_compl(): ret: %d\n", ret);
278 			return ret;
279 		}
280 	}
281 
282 	return 0;
283 }
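
/* Typical usage of pcu_cb()/pcu(), mirroring the call sites later in this
 * file:
 *
 *   int completed = 0, err;
 *
 *   err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
 *   if (err || pcu(tmp_qpair, &completed) || completed < 0) {
 *           ... handle the error ...
 *   }
 */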
284 #endif
285 
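/* Worked example (illustrative numbers): for a namespace formatted with
 * 512-byte data blocks plus 8 bytes of PI metadata in extended-LBA mode, the
 * extended sector size is 520 bytes. With PRACT set, the controller
 * inserts/strips the PI, so the host buffer is sized in 512-byte sectors;
 * otherwise it must be sized in 520-byte sectors.
 */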
286 static inline uint32_t
287 _nvme_get_host_buffer_sector_size(struct spdk_nvme_ns *ns, uint32_t io_flags)
288 {
289 	bool md_excluded_from_xfer = false;
290 	uint32_t md_size;
291 	uint32_t ns_flags;
292 
293 	ns_flags = spdk_nvme_ns_get_flags(ns);
294 	md_size = spdk_nvme_ns_get_md_size(ns);
295 
296 	/* For the extended LBA format, if the metadata size is 8 bytes and PRACT is
297 	 * enabled (the controller inserts/strips PI), we should subtract the metadata
298 	 * size from the block size.
299 	 */
300 	md_excluded_from_xfer = ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
301 				 (ns_flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
302 				 (ns_flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
303 				 (md_size == 8));
304 
305 	return md_excluded_from_xfer ? spdk_nvme_ns_get_sector_size(ns) :
306 	       spdk_nvme_ns_get_extended_sector_size(ns);
307 }
308 
309 static void
310 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
311 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
312 {
313 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
314 	struct thread_data	*td = ctx->td;
315 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
316 	struct spdk_fio_ctrlr	*fio_ctrlr;
317 	struct spdk_fio_qpair	*fio_qpair;
318 	struct spdk_nvme_ns	*ns;
319 	const struct spdk_nvme_ns_data	*nsdata;
320 	struct fio_file		*f = ctx->f;
321 	uint32_t		ns_id;
322 	char			*p;
323 	long int		tmp;
324 	uint32_t		block_size;
325 	struct spdk_fio_options *fio_options = td->eo;
326 
327 	p = strstr(f->file_name, "ns=");
328 	if (p != NULL) {
329 		tmp = spdk_strtol(p + 3, 10);
330 		if (tmp <= 0) {
331 			SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp);
332 			g_error = true;
333 			return;
334 		}
335 		ns_id = (uint32_t)tmp;
336 	} else {
337 		ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
338 		if (ns_id == 0) {
339 			/* The ctrlr has no active namespaces, and we didn't specify one, so there is nothing to do. */
340 			return;
341 		}
342 	}
343 
344 	pthread_mutex_lock(&g_mutex);
345 	fio_ctrlr = get_fio_ctrlr(trid);
346 	/* it is a new ctrlr and needs to be added */
347 	if (!fio_ctrlr) {
348 		/* Create an fio_ctrlr and add it to the list */
349 		fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
350 		if (!fio_ctrlr) {
351 			SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
352 			g_error = true;
353 			pthread_mutex_unlock(&g_mutex);
354 			return;
355 		}
356 		fio_ctrlr->opts = *opts;
357 		fio_ctrlr->ctrlr = ctrlr;
358 		fio_ctrlr->tr_id = *trid;
359 		TAILQ_INSERT_TAIL(&g_ctrlrs, fio_ctrlr, link);
360 	}
361 	pthread_mutex_unlock(&g_mutex);
362 
363 	ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
364 	if (ns == NULL) {
365 		SPDK_ERRLOG("Cannot get namespace by ns_id=%d\n", ns_id);
366 		g_error = true;
367 		return;
368 	}
369 
370 	if (!spdk_nvme_ns_is_active(ns)) {
371 		SPDK_ERRLOG("Inactive namespace by ns_id=%d\n", ns_id);
372 		g_error = true;
373 		return;
374 	}
375 	nsdata = spdk_nvme_ns_get_data(ns);
376 
377 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
378 		if ((fio_qpair->f == f) ||
379 		    ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
380 		     (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
381 			/* Not an error. Avoid a duplicate connection. */
382 			return;
383 		}
384 	}
385 
386 	/* create a new qpair */
387 	fio_qpair = calloc(1, sizeof(*fio_qpair));
388 	if (!fio_qpair) {
389 		g_error = true;
390 		SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
391 		return;
392 	}
393 
394 	f->engine_data = fio_qpair;
395 	fio_qpair->ns = ns;
396 	fio_qpair->f = f;
397 	fio_qpair->fio_ctrlr = fio_ctrlr;
398 	TAILQ_INSERT_TAIL(&fio_thread->fio_qpair, fio_qpair, link);
399 
400 	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
401 		assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
402 		fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
403 		fio_qpair->nvme_pi_enabled = true;
404 		fio_qpair->md_start = nsdata->dps.md_start;
405 		fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
406 		fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
407 			fio_qpair->extended_lba ? "extended lba" : "separate metadata");
408 	}
409 
410 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
411 	for_each_rw_ddir(ddir) {
412 		if (td->o.min_bs[ddir] % block_size != 0 || td->o.max_bs[ddir] % block_size != 0) {
413 			if (spdk_nvme_ns_supports_extended_lba(ns)) {
414 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of (LBA data size + Metadata size)\n");
415 			} else {
416 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of LBA data size\n");
417 			}
418 			g_error = true;
419 			return;
420 		}
421 	}
422 
423 	if (fio_options->zone_append && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
424 		if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED) {
425 			SPDK_DEBUGLOG(fio_nvme, "Using zone appends instead of writes on: '%s'\n",
426 				      f->file_name);
427 			fio_qpair->zone_append_enabled = true;
428 		} else {
429 			SPDK_WARNLOG("Falling back to writes on: '%s' - ns lacks zone append cmd\n",
430 				     f->file_name);
431 		}
432 	}
433 
434 #if FIO_HAS_ZBD
435 	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD) {
436 		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
437 	}
438 #endif
439 
440 	if (fio_options->initial_zone_reset == 1 && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
441 #if FIO_HAS_ZBD
442 		struct spdk_nvme_qpair *tmp_qpair;
443 		int completed = 0, err;
444 
445 		/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
446 		 * Create a temporary qpair in order to perform the initial zone reset.
447 		 */
448 		assert(!fio_qpair->qpair);
449 
450 		tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
451 		if (!tmp_qpair) {
452 			SPDK_ERRLOG("Cannot allocate a temporary qpair\n");
453 			g_error = true;
454 			return;
455 		}
456 
457 		err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
458 		if (err || pcu(tmp_qpair, &completed) || completed < 0) {
459 			log_err("spdk/nvme: warn: initial_zone_reset: err: %d, cpl: %d\n",
460 				err, completed);
461 		}
462 
463 		spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
464 #else
465 		log_err("spdk/nvme: ZBD/ZNS is not supported\n");
466 #endif
467 	}
468 
469 	f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
470 	if (f->real_file_size <= 0) {
471 		g_error = true;
472 		SPDK_ERRLOG("Cannot get namespace size by ns=%p\n", ns);
473 		return;
474 	}
475 
476 	f->filetype = FIO_TYPE_BLOCK;
477 	fio_file_set_size_known(f);
478 }
479 
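/* Example: pi_chk=GUARD,REFTAG enables both the guard and reference tag
 * checks. The matching below is substring-based, so any separator works.
 */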
480 static void
481 parse_prchk_flags(const char *prchk_str)
482 {
483 	if (!prchk_str) {
484 		return;
485 	}
486 
487 	if (strstr(prchk_str, "GUARD") != NULL) {
488 		g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
489 	}
490 	if (strstr(prchk_str, "REFTAG") != NULL) {
491 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
492 	}
493 	if (strstr(prchk_str, "APPTAG") != NULL) {
494 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
495 	}
496 }
497 
498 static void
499 parse_pract_flag(int pract)
500 {
501 	if (pract == 1) {
502 		g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
503 	} else {
504 		g_spdk_pract_flag = 0;
505 	}
506 }
507 
508 static bool
509 fio_redirected_to_dev_null(void)
510 {
511 	char path[PATH_MAX] = "";
512 	ssize_t ret;
513 
514 	ret = readlink("/proc/self/fd/1", path, sizeof(path));
515 
516 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
517 		return false;
518 	}
519 
520 	ret = readlink("/proc/self/fd/2", path, sizeof(path));
521 
522 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
523 		return false;
524 	}
525 
526 	return true;
527 }
528 
529 static int
530 spdk_fio_init(struct thread_data *td)
531 {
532 	int ret = 0;
533 	struct spdk_fio_options *fio_options = td->eo;
534 
535 	if (fio_options->spdk_tracing) {
536 		ret = spdk_trace_register_user_thread();
537 	}
538 
539 	return ret;
540 }
541 
542 /* Called once at initialization. This is responsible for gathering the size of
543  * each "file", which in our case is of the form
544  * 'key=value [key=value] ... ns=value'.
545  * For example, for a local PCIe NVMe device: 'trtype=PCIe traddr=0000.04.00.0 ns=1'.
546  * For a remote device exported by an NVMe-oF target: 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1'. */
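/* For reference, a minimal fio job file that uses this plugin might look
 * like the following (values are illustrative only):
 *
 *   [global]
 *   ioengine=spdk
 *   thread=1
 *
 *   [job0]
 *   filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *
 * thread=1 is required; the setup code below rejects jobs without it.
 */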
547 static int
548 spdk_fio_setup(struct thread_data *td)
549 {
550 	struct spdk_fio_thread *fio_thread;
551 	struct spdk_fio_options *fio_options = td->eo;
552 	struct spdk_fio_probe_ctx ctx;
553 	struct spdk_env_opts opts;
554 	struct fio_file *f;
555 	char *p;
556 	int rc = 0;
557 	struct spdk_nvme_transport_id trid;
558 	struct spdk_fio_ctrlr *fio_ctrlr;
559 	char *trid_info;
560 	unsigned int i;
561 	size_t size;
562 
563 	/*
564 	 * If we're running in a daemonized FIO instance, it's possible
565 	 * fd 1/2 were re-used for something important by FIO. Newer fio
566 	 * versions are careful to redirect those to /dev/null, but if we're
567 	 * not, we'll abort early, so we don't accidentally write messages to
568 	 * an important file, etc.
569 	 */
570 	if (is_backend && !fio_redirected_to_dev_null()) {
571 		char buf[1024];
572 		snprintf(buf, sizeof(buf),
573 			 "SPDK FIO plugin is in daemon mode, but stdout/stderr "
574 			 "aren't redirected to /dev/null. Aborting.");
575 		fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
576 		return -1;
577 	}
578 
579 	if (!td->o.use_thread) {
580 		log_err("spdk: must set thread=1 when using spdk plugin\n");
581 		return 1;
582 	}
583 
584 	if (g_log_flag_error) {
585 		/* The first thread found an error when parsing log flags, so
586 		 * just return error immediately for all of the other threads.
587 		 */
588 		return 1;
589 	}
590 
591 	pthread_mutex_lock(&g_mutex);
592 
593 	fio_thread = calloc(1, sizeof(*fio_thread));
594 	assert(fio_thread != NULL);
595 
596 	td->io_ops_data = fio_thread;
597 	fio_thread->td = td;
598 
599 	fio_thread->iocq_size = td->o.iodepth;
600 	fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
601 	assert(fio_thread->iocq != NULL);
602 
603 	TAILQ_INIT(&fio_thread->fio_qpair);
604 
605 	if (!g_spdk_env_initialized) {
606 		spdk_env_opts_init(&opts);
607 		opts.name = "fio";
608 		opts.mem_size = fio_options->mem_size;
609 		opts.shm_id = fio_options->shm_id;
610 		g_spdk_enable_sgl = fio_options->enable_sgl;
611 		g_spdk_sge_size = fio_options->sge_size;
612 		g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
613 		parse_pract_flag(fio_options->pi_act);
614 		g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
615 		g_spdk_apptag = (uint16_t)fio_options->apptag;
616 		g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
617 		parse_prchk_flags(fio_options->pi_chk);
618 		if (spdk_env_init(&opts) < 0) {
619 			SPDK_ERRLOG("Unable to initialize SPDK env\n");
620 			free(fio_thread->iocq);
621 			free(fio_thread);
622 			fio_thread = NULL;
623 			pthread_mutex_unlock(&g_mutex);
624 			return 1;
625 		}
626 
627 		if (fio_options->log_flags) {
628 			char *tok = strtok(fio_options->log_flags, ",");
629 			do {
630 				rc = spdk_log_set_flag(tok);
631 				if (rc < 0) {
632 					SPDK_ERRLOG("unknown log flag %s\n", tok);
633 					g_log_flag_error = true;
					/* Don't leak g_mutex on this error path. */
					pthread_mutex_unlock(&g_mutex);
634 					return 1;
635 				}
636 			} while ((tok = strtok(NULL, ",")) != NULL);
637 #ifdef DEBUG
638 			spdk_log_set_print_level(SPDK_LOG_DEBUG);
639 #endif
640 		}
641 
642 		g_spdk_env_initialized = true;
643 		spdk_unaffinitize_thread();
644 
645 		if (fio_options->spdk_tracing) {
646 			spdk_trace_init("spdk_fio_tracepoints", 65536, td->o.numjobs);
647 			spdk_trace_enable_tpoint_group("nvme_pcie");
648 			spdk_trace_enable_tpoint_group("nvme_tcp");
649 		}
650 
651 		/* Spawn a thread to continue polling the controllers */
652 		rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
653 		if (rc != 0) {
654 			SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
655 		}
656 
657 		if (fio_options->enable_vmd && spdk_vmd_init()) {
658 			SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices can be unavailable.\n");
659 		}
660 	}
661 	pthread_mutex_unlock(&g_mutex);
662 
663 	for_each_file(td, f, i) {
664 		memset(&trid, 0, sizeof(trid));
665 		memset(&ctx, 0, sizeof(ctx));
666 
667 		trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
668 
669 		p = strstr(f->file_name, " ns=");
670 		if (p != NULL) {
671 			trid_info = strndup(f->file_name, p - f->file_name);
672 		} else {
673 			trid_info = strndup(f->file_name, strlen(f->file_name));
674 		}
675 
676 		if (!trid_info) {
677 			SPDK_ERRLOG("Failed to allocate space for trid_info\n");
678 			continue;
679 		}
680 
681 		rc = spdk_nvme_transport_id_parse(&trid, trid_info);
682 		if (rc < 0) {
683 			SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
684 			free(trid_info);
685 			continue;
686 		}
687 		free(trid_info);
688 
689 		if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
690 			struct spdk_pci_addr pci_addr;
691 			if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
692 				SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
693 				continue;
694 			}
695 			spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
696 		} else {
697 			if (trid.subnqn[0] == '\0') {
698 				snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
699 					 SPDK_NVMF_DISCOVERY_NQN);
700 			}
701 			if ((p = strcasestr(f->file_name, "hostnqn:")) ||
702 			    (p = strcasestr(f->file_name, "hostnqn="))) {
703 				p += strlen("hostnqn:");
704 				size = strcspn(p, " \t\n");
705 				if (size >= sizeof(ctx.hostnqn)) {	/* leave room for the NUL terminator */
706 					SPDK_ERRLOG("Invalid hostnqn: too long\n");
707 					continue;
708 				}
709 				memcpy(ctx.hostnqn, p, size);
710 			}
711 		}
712 
713 		ctx.td = td;
714 		ctx.f = f;
715 
716 		pthread_mutex_lock(&g_mutex);
717 		fio_ctrlr = get_fio_ctrlr(&trid);
718 		pthread_mutex_unlock(&g_mutex);
719 		if (fio_ctrlr) {
720 			attach_cb(&ctx, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
721 		} else {
722 			/* Enumerate all of the controllers */
723 			if (spdk_nvme_probe(&trid, &ctx, probe_cb, attach_cb, NULL) != 0) {
724 				SPDK_ERRLOG("spdk_nvme_probe() failed\n");
725 				continue;
726 			}
727 		}
728 
729 		if (g_error) {
730 			log_err("Failed to initialize spdk fio plugin\n");
731 			rc = 1;
732 			break;
733 		}
734 	}
735 
736 	pthread_mutex_lock(&g_mutex);
737 	g_td_count++;
738 	pthread_mutex_unlock(&g_mutex);
739 
740 	return rc;
741 }
742 
743 static int
744 spdk_fio_open(struct thread_data *td, struct fio_file *f)
745 {
746 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
747 	struct spdk_fio_ctrlr *fio_ctrlr = fio_qpair->fio_ctrlr;
748 	struct spdk_fio_options *fio_options = td->eo;
749 	struct spdk_nvme_io_qpair_opts	qpopts;
750 
751 	assert(fio_qpair->qpair == NULL);
752 	spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
753 	qpopts.delay_cmd_submit = true;
754 	if (fio_options->enable_wrr) {
755 		qpopts.qprio = fio_options->wrr_priority;
756 	}
757 	qpopts.disable_pcie_sgl_merge = fio_options->disable_pcie_sgl_merge;
758 
759 	fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
760 	if (!fio_qpair->qpair) {
761 		SPDK_ERRLOG("Cannot allocate any more NVMe io_qpairs\n");
762 		g_error = true;
		/* fio_qpair is still linked in fio_thread->fio_qpair and referenced
		 * by f->engine_data, so let spdk_fio_cleanup() free it; freeing it
		 * here would cause a double free and a dangling pointer.
		 */
764 		return -1;
765 	}
766 
767 	if (fio_options->print_qid_mappings == 1) {
768 		log_info("job %s: %s qid %d\n", td->o.name, f->file_name,
769 			 spdk_nvme_qpair_get_id(fio_qpair->qpair));
770 	}
771 
772 	return 0;
773 }
774 
775 static int
776 spdk_fio_close(struct thread_data *td, struct fio_file *f)
777 {
778 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
779 
780 	assert(fio_qpair->qpair != NULL);
781 	spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
782 	fio_qpair->qpair = NULL;
783 	return 0;
784 }
785 
786 static int
787 spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
788 {
789 	td->orig_buffer = spdk_dma_zmalloc(total_mem, NVME_IO_ALIGN, NULL);
790 	return td->orig_buffer == NULL;
791 }
792 
793 static void
794 spdk_fio_iomem_free(struct thread_data *td)
795 {
796 	spdk_dma_free(td->orig_buffer);
797 }
798 
799 static int
800 spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
801 {
802 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
803 	struct spdk_fio_request	*fio_req;
804 	uint32_t dsm_size;
805 
806 	io_u->engine_data = NULL;
807 
808 	fio_req = calloc(1, sizeof(*fio_req));
809 	if (fio_req == NULL) {
810 		return 1;
811 	}
812 
813 	if (!(td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)) {
814 #if FIO_HAS_MRT
815 		/* By default the number of ranges is set to 1 */
816 		dsm_size = td->o.num_range * sizeof(struct spdk_nvme_dsm_range);
817 #else
818 		dsm_size = sizeof(struct spdk_nvme_dsm_range);
819 #endif
820 		fio_req->dsm_range = calloc(1, dsm_size);
821 		if (fio_req->dsm_range == NULL) {
822 			free(fio_req);
823 			return 1;
824 		}
825 	}
826 
827 	fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
828 	if (fio_req->md_buf == NULL) {
829 		fprintf(stderr, "Allocating %u bytes of metadata buffer failed\n", g_spdk_md_per_io_size);
830 		free(fio_req->dsm_range);
831 		free(fio_req);
832 		return 1;
833 	}
834 
835 	fio_req->io = io_u;
836 	fio_req->fio_thread = fio_thread;
837 
838 	io_u->engine_data = fio_req;
839 
840 	return 0;
841 }
842 
843 static void
844 spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
845 {
846 	struct spdk_fio_request *fio_req = io_u->engine_data;
847 
848 	if (fio_req) {
849 		assert(fio_req->io == io_u);
850 		spdk_dma_free(fio_req->md_buf);
851 		free(fio_req->dsm_range);
852 		free(fio_req);
853 		io_u->engine_data = NULL;
854 	}
855 }
856 
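/* Worked example (illustrative numbers): with 128 MiB zones composed of
 * 512-byte sectors, each zone spans 262144 sectors. An offset of 256 MiB
 * then maps to zslba = (256 MiB / 128 MiB) * 262144 = 524288.
 */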
857 static inline uint64_t
858 fio_offset_to_zslba(unsigned long long offset, struct spdk_nvme_ns *ns)
859 {
860 	return (offset / spdk_nvme_zns_ns_get_zone_size(ns)) * spdk_nvme_zns_ns_get_zone_size_sectors(ns);
861 }
862 
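/* Worked example (illustrative numbers): with a 520-byte extended sector
 * (512 bytes of data plus 8 bytes of PI), io_u->offset = 52000 gives
 * lba = 52000 / 520 = 100, and xfer_buflen = 4160 gives
 * lba_count = 4160 / 520 = 8 sectors.
 */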
863 static int
864 fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
865 {
866 	struct spdk_nvme_ns *ns = fio_qpair->ns;
867 	struct spdk_fio_request *fio_req = io_u->engine_data;
868 	uint32_t md_size, extended_lba_size, lba_count;
869 	uint64_t lba;
870 	struct iovec iov;
871 	int rc;
872 	struct spdk_dif_ctx_init_ext_opts dif_opts;
873 
874 	/* Set appmask and apptag when PRACT is enabled */
875 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
876 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
877 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
878 		return 0;
879 	}
880 
881 	extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
882 	md_size = spdk_nvme_ns_get_md_size(ns);
883 	lba = io_u->offset / extended_lba_size;
884 	lba_count = io_u->xfer_buflen / extended_lba_size;
885 
886 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
887 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
888 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
889 			       true, fio_qpair->md_start,
890 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
891 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
892 			       0, 0, &dif_opts);
893 	if (rc != 0) {
894 		fprintf(stderr, "Initialization of DIF context failed\n");
895 		return rc;
896 	}
897 
898 	if (io_u->ddir != DDIR_WRITE) {
899 		return 0;
900 	}
901 
902 	iov.iov_base = io_u->buf;
903 	iov.iov_len = io_u->xfer_buflen;
904 	rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
905 	if (rc != 0) {
906 		fprintf(stderr, "Generation of DIF failed\n");
907 	}
908 
909 	return rc;
910 }
911 
912 static int
913 fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
914 {
915 	struct spdk_nvme_ns *ns = fio_qpair->ns;
916 	struct spdk_fio_request *fio_req = io_u->engine_data;
917 	uint32_t md_size, block_size, lba_count;
918 	uint64_t lba;
919 	struct iovec iov, md_iov;
920 	int rc;
921 	struct spdk_dif_ctx_init_ext_opts dif_opts;
922 
923 	/* Set appmask and apptag when PRACT is enabled */
924 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
925 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
926 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
927 		return 0;
928 	}
929 
930 	block_size = spdk_nvme_ns_get_sector_size(ns);
931 	md_size = spdk_nvme_ns_get_md_size(ns);
932 	lba = io_u->offset / block_size;
933 	lba_count = io_u->xfer_buflen / block_size;
934 
935 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
936 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
937 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
938 			       false, fio_qpair->md_start,
939 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
940 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
941 			       0, 0, &dif_opts);
942 	if (rc != 0) {
943 		fprintf(stderr, "Initialization of DIF context failed\n");
944 		return rc;
945 	}
946 
947 	if (io_u->ddir != DDIR_WRITE) {
948 		return 0;
949 	}
950 
951 	iov.iov_base = io_u->buf;
952 	iov.iov_len = io_u->xfer_buflen;
953 	md_iov.iov_base = fio_req->md_buf;
954 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
955 	rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
956 	if (rc < 0) {
957 		fprintf(stderr, "Generation of DIX failed\n");
958 	}
959 
960 	return rc;
961 }
962 
963 static int
964 fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
965 {
966 	struct spdk_nvme_ns *ns = fio_qpair->ns;
967 	struct spdk_fio_request *fio_req = io_u->engine_data;
968 	uint32_t lba_count;
969 	struct iovec iov;
970 	struct spdk_dif_error err_blk = {};
971 	int rc;
972 
973 	/* Do nothing when PRACT is enabled */
974 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
975 		return 0;
976 	}
977 
978 	iov.iov_base = io_u->buf;
979 	iov.iov_len = io_u->xfer_buflen;
980 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);
981 
982 	rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
983 	if (rc != 0) {
984 		fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
985 			err_blk.err_type, err_blk.err_offset);
986 	}
987 
988 	return rc;
989 }
990 
991 static int
992 fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
993 {
994 	struct spdk_nvme_ns *ns = fio_qpair->ns;
995 	struct spdk_fio_request *fio_req = io_u->engine_data;
996 	uint32_t md_size, lba_count;
997 	struct iovec iov, md_iov;
998 	struct spdk_dif_error err_blk = {};
999 	int rc;
1000 
1001 	/* Do nothing when PRACT is enabled */
1002 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
1003 		return 0;
1004 	}
1005 
1006 	iov.iov_base = io_u->buf;
1007 	iov.iov_len = io_u->xfer_buflen;
1008 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
1009 	md_size = spdk_nvme_ns_get_md_size(ns);
1010 	md_iov.iov_base = fio_req->md_buf;
1011 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
1012 
1013 	rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
1014 	if (rc != 0) {
1015 		fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
1016 			err_blk.err_type, err_blk.err_offset);
1017 	}
1018 
1019 	return rc;
1020 }
1021 
1022 static void
1023 spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
1024 {
1025 	struct spdk_fio_request		*fio_req = ctx;
1026 	struct spdk_fio_thread		*fio_thread = fio_req->fio_thread;
1027 	struct spdk_fio_qpair		*fio_qpair = fio_req->fio_qpair;
1028 	int				rc;
1029 
1030 	if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
1031 		if (fio_qpair->extended_lba) {
1032 			rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
1033 		} else {
1034 			rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
1035 		}
1036 		if (rc != 0) {
1037 			fio_req->io->error = abs(rc);
1038 		}
1039 	}
1040 
1041 	if (spdk_nvme_cpl_is_error(cpl)) {
1042 		fio_req->io->error = EIO;
1043 	}
1044 
1045 	assert(fio_thread->iocq_count < fio_thread->iocq_size);
1046 	fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
1047 }
1048 
1049 static void
1050 spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
1051 {
1052 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1053 
1054 	fio_req->iov_offset = sgl_offset;
1055 	fio_req->bit_bucket_data_len = 0;
1056 }
1057 
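/* Example walk-through (illustrative numbers): with xfer_buflen = 16 KiB and
 * sge_size = 4 KiB, this callback yields four 4 KiB SGEs. If, in addition,
 * bit_bucket_data_len = 4 KiB and the I/O is a read, the first SGE becomes a
 * bit bucket (address UINT64_MAX), i.e. the first 4 KiB of the read data is
 * discarded by the controller.
 */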
1058 static int
1059 spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
1060 {
1061 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1062 	struct io_u *io_u = fio_req->io;
1063 	uint32_t iov_len;
1064 	uint32_t bit_bucket_len;
1065 
1066 	*address = io_u->buf;
1067 
1068 	if (fio_req->iov_offset) {
1069 		assert(fio_req->iov_offset <= io_u->xfer_buflen);
1070 		*address += fio_req->iov_offset;
1071 	}
1072 
1073 	iov_len = io_u->xfer_buflen - fio_req->iov_offset;
1074 	if (iov_len > g_spdk_sge_size) {
1075 		iov_len = g_spdk_sge_size;
1076 	}
1077 
1078 	if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
1079 		assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
1080 		*address = (void *)UINT64_MAX;
1081 		bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
1082 		if (iov_len > bit_bucket_len) {
1083 			iov_len = bit_bucket_len;
1084 		}
1085 		fio_req->bit_bucket_data_len += iov_len;
1086 	}
1087 
1088 	fio_req->iov_offset += iov_len;
1089 	*length = iov_len;
1090 
1091 	return 0;
1092 }
1093 
1094 #if FIO_IOOPS_VERSION >= 24
1095 typedef enum fio_q_status fio_q_status_t;
1096 #else
1097 typedef int fio_q_status_t;
1098 #endif
1099 
1100 static fio_q_status_t
1101 spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
1102 {
1103 	int rc = 1;
1104 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1105 	struct spdk_fio_request	*fio_req = io_u->engine_data;
1106 	struct spdk_fio_qpair	*fio_qpair;
1107 	struct spdk_nvme_ns	*ns = NULL;
1108 	void			*md_buf = NULL;
1109 	struct spdk_dif_ctx	*dif_ctx = &fio_req->dif_ctx;
1110 #if FIO_HAS_FDP
1111 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
1112 #endif
1113 	struct spdk_nvme_dsm_range *range;
1114 	uint32_t		block_size;
1115 	uint64_t		lba;
1116 	uint32_t		lba_count;
1117 	uint32_t		num_range;
1118 
1119 	fio_qpair = get_fio_qpair(fio_thread, io_u->file);
1120 	if (fio_qpair == NULL) {
1121 		return -ENXIO;
1122 	}
1123 	ns = fio_qpair->ns;
1124 
1125 	if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
1126 		md_buf = fio_req->md_buf;
1127 	}
1128 	fio_req->fio_qpair = fio_qpair;
1129 
1130 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
1131 	lba = io_u->offset / block_size;
1132 	lba_count = io_u->xfer_buflen / block_size;
1133 
1134 #if FIO_HAS_FDP
1135 	/* Only SGL is supported for write commands with directives */
1136 	if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
1137 		log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
1138 		io_u->error = EINVAL;	/* io_u->error holds a positive errno, as elsewhere in this file */
1139 		return FIO_Q_COMPLETED;
1140 	}
1141 #endif
1142 
1143 	/* TODO: consider situations where fio will randomize and verify io_u */
1144 	if (fio_qpair->nvme_pi_enabled) {
1145 		if (fio_qpair->extended_lba) {
1146 			rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
1147 		} else {
1148 			rc = fio_separate_md_setup_pi(fio_qpair, io_u);
1149 		}
1150 		if (rc < 0) {
1151 			io_u->error = -rc;
1152 			return FIO_Q_COMPLETED;
1153 		}
1154 	}
1155 
1156 	switch (io_u->ddir) {
1157 	case DDIR_READ:
1158 		if (!g_spdk_enable_sgl) {
1159 			rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
1160 							   spdk_fio_completion_cb, fio_req,
1161 							   fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1162 		} else {
1163 			rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
1164 							    lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1165 							    spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1166 							    dif_ctx->apptag_mask, dif_ctx->app_tag);
1167 		}
1168 		break;
1169 	case DDIR_WRITE:
1170 		if (!g_spdk_enable_sgl) {
1171 			if (!fio_qpair->zone_append_enabled) {
1172 				rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
1173 								    lba_count,
1174 								    spdk_fio_completion_cb, fio_req,
1175 								    fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1176 			} else {
1177 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1178 				rc = spdk_nvme_zns_zone_append_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, zslba,
1179 								       lba_count,
1180 								       spdk_fio_completion_cb, fio_req,
1181 								       fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1182 			}
1183 		} else {
1184 			if (!fio_qpair->zone_append_enabled) {
1185 #if FIO_HAS_FDP
1186 				if (spdk_unlikely(io_u->dtype)) {
1187 					ext_opts.size = SPDK_SIZEOF(&ext_opts, cdw13);
1188 					ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
1189 					ext_opts.metadata = md_buf;
1190 					ext_opts.cdw13 = (io_u->dspec << 16);
1191 					ext_opts.apptag = dif_ctx->app_tag;
1192 					ext_opts.apptag_mask = dif_ctx->apptag_mask;
1193 					rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
1194 									 spdk_fio_completion_cb, fio_req,
1195 									 spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
1196 					break;
1197 				}
1198 #endif
1199 				rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
1200 								     lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1201 								     spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1202 								     dif_ctx->apptag_mask, dif_ctx->app_tag);
1203 			} else {
1204 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1205 				rc = spdk_nvme_zns_zone_appendv_with_md(ns, fio_qpair->qpair, zslba,
1206 									lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1207 									spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1208 									dif_ctx->apptag_mask, dif_ctx->app_tag);
1209 			}
1210 		}
1211 		break;
1212 	case DDIR_TRIM:
1213 		if (td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
1214 			do_io_u_trim(td, io_u);
1215 			io_u_mark_submit(td, 1);
1216 			io_u_mark_complete(td, 1);
1217 			return FIO_Q_COMPLETED;
1218 		}
1219 
1220 		range = fio_req->dsm_range;
1221 #if FIO_HAS_MRT
1222 		if (td->o.num_range == 1) {
1223 			range->attributes.raw = 0;
1224 			range->length = lba_count;
1225 			range->starting_lba = lba;
1226 			num_range = 1;
1227 		} else {
1228 			struct trim_range *tr = (struct trim_range *)io_u->xfer_buf;
1229 			for (uint32_t i = 0; i < io_u->number_trim; i++) {
1230 				range->attributes.raw = 0;
1231 				range->length = tr->len / block_size;
1232 				range->starting_lba = tr->start / block_size;
1233 				range++;
1234 				tr++;
1235 			}
1236 			num_range = io_u->number_trim;
1237 			range = fio_req->dsm_range;
1238 		}
1239 #else
1240 		range->attributes.raw = 0;
1241 		range->length = lba_count;
1242 		range->starting_lba = lba;
1243 		num_range = 1;
1244 #endif
1245 
1246 		rc = spdk_nvme_ns_cmd_dataset_management(ns, fio_qpair->qpair,
1247 				SPDK_NVME_DSM_ATTR_DEALLOCATE, range, num_range,
1248 				spdk_fio_completion_cb, fio_req);
1249 		break;
1250 	default:
1251 		assert(false);
1252 		break;
1253 	}
1254 
1255 	/* NVMe read/write functions return -ENOMEM if there are no free requests. */
1256 	if (rc == -ENOMEM) {
1257 		return FIO_Q_BUSY;
1258 	}
1259 
1260 	if (rc != 0) {
1261 		io_u->error = abs(rc);
1262 		return FIO_Q_COMPLETED;
1263 	}
1264 
1265 	return FIO_Q_QUEUED;
1266 }
1267 
1268 static struct io_u *
1269 spdk_fio_event(struct thread_data *td, int event)
1270 {
1271 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1272 
1273 	assert(event >= 0);
1274 	assert((unsigned)event < fio_thread->iocq_count);
1275 	return fio_thread->iocq[event];
1276 }
1277 
1278 static int
1279 spdk_fio_getevents(struct thread_data *td, unsigned int min,
1280 		   unsigned int max, const struct timespec *t)
1281 {
1282 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1283 	struct spdk_fio_qpair *fio_qpair = NULL;
1284 	struct timespec t0, t1;
1285 	uint64_t timeout = 0;
1286 
1287 	if (t) {
1288 		timeout = t->tv_sec * 1000000000L + t->tv_nsec;
1289 		clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
1290 	}
1291 
1292 	fio_thread->iocq_count = 0;
1293 
1294 	/* fetch the next qpair */
1295 	if (fio_thread->fio_qpair_current) {
1296 		fio_qpair = TAILQ_NEXT(fio_thread->fio_qpair_current, link);
1297 	}
1298 
1299 	for (;;) {
1300 		if (fio_qpair == NULL) {
1301 			fio_qpair = TAILQ_FIRST(&fio_thread->fio_qpair);
1302 		}
1303 
1304 		while (fio_qpair != NULL) {
1305 			/*
1306 			 * We can be called while spdk_fio_open()s are still
1307 			 * ongoing, in which case, ->qpair can still be NULL.
1308 			 */
1309 			if (fio_qpair->qpair == NULL) {
1310 				fio_qpair = TAILQ_NEXT(fio_qpair, link);
1311 				continue;
1312 			}
1313 
1314 			spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);
1315 
1316 			if (fio_thread->iocq_count >= min) {
1317 				/* reset the current handling qpair */
1318 				fio_thread->fio_qpair_current = fio_qpair;
1319 				return fio_thread->iocq_count;
1320 			}
1321 
1322 			fio_qpair = TAILQ_NEXT(fio_qpair, link);
1323 		}
1324 
1325 		if (t) {
1326 			uint64_t elapse;
1327 
1328 			clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
1329 			elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
1330 				 + t1.tv_nsec - t0.tv_nsec;
1331 			if (elapse > timeout) {
1332 				break;
1333 			}
1334 		}
1335 	}
1336 
1337 	/* reset the current handling qpair */
1338 	fio_thread->fio_qpair_current = fio_qpair;
1339 	return fio_thread->iocq_count;
1340 }
1341 
1342 static int
1343 spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
1344 {
1345 	/* TODO: This should probably send a flush to the device, but for now just report success. */
1346 	return 0;
1347 }
1348 
1349 #if FIO_HAS_ZBD
1350 static int
1351 spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model)
1352 {
1353 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1354 	struct spdk_fio_qpair *fio_qpair = NULL;
1355 	const struct spdk_nvme_zns_ns_data *zns_data = NULL;
1356 
1357 	if (f->filetype != FIO_TYPE_BLOCK) {
1358 		log_info("spdk/nvme: unsupported filetype: %d\n", f->filetype);
1359 		return -EINVAL;
1360 	}
1361 
1362 	fio_qpair = get_fio_qpair(fio_thread, f);
1363 	if (!fio_qpair) {
1364 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1365 		return -ENODEV;
1366 	}
1367 
1368 	switch (spdk_nvme_ns_get_csi(fio_qpair->ns)) {
1369 	case SPDK_NVME_CSI_NVM:
1370 		*model = ZBD_NONE;
1371 		return 0;
1372 
1373 	case SPDK_NVME_CSI_KV:
1374 		log_err("spdk/nvme: KV namespace is currently not supported\n");
1375 		return -ENOSYS;
1376 
1377 	case SPDK_NVME_CSI_ZNS:
1378 		zns_data = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1379 		if (!zns_data) {
1380 			log_err("spdk/nvme: file_name: '%s', ZNS is not enabled\n", f->file_name);
1381 			return -EINVAL;
1382 		}
1383 
1384 		*model = ZBD_HOST_MANAGED;
1385 
1386 		return 0;
1387 	}
1388 
1389 	return -EINVAL;
1390 }
1391 
1392 static int
1393 spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
1394 		      struct zbd_zone *zbdz, unsigned int nr_zones)
1395 {
1396 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1397 	struct spdk_fio_qpair *fio_qpair = NULL;
1398 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1399 	struct spdk_nvme_zns_zone_report *report;
1400 	struct spdk_nvme_qpair *tmp_qpair;
1401 	uint32_t report_nzones = 0, report_nzones_max, report_nbytes, mdts_nbytes;
1402 	uint64_t zsze_nbytes, ns_nzones, lba_nbytes;
1403 	int completed = 0, err;
1404 
1405 	fio_qpair = get_fio_qpair(fio_thread, f);
1406 	if (!fio_qpair) {
1407 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1408 		return -ENODEV;
1409 	}
1410 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1411 	if (!zns) {
1412 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1413 		return -EINVAL;
1414 	}
1415 
1416 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1417 	 * Create a temporary qpair in order to perform report zones.
1418 	 */
1419 	assert(!fio_qpair->qpair);
1420 
1421 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1422 	if (!tmp_qpair) {
1423 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1424 		return -EIO;
1425 	}
1426 
1427 	/** Retrieve device parameters */
1428 	mdts_nbytes = spdk_nvme_ns_get_max_io_xfer_size(fio_qpair->ns);
1429 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1430 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1431 	ns_nzones = spdk_nvme_zns_ns_get_num_zones(fio_qpair->ns);
1432 
1433 	/** Allocate a report buffer sized to not exceed the MDTS, the caller's zbdz storage, or what is actually needed */
1434 	report_nzones_max = (mdts_nbytes - sizeof(*report)) / sizeof(report->descs[0]);
1435 	report_nzones_max = spdk_min(spdk_min(report_nzones_max, nr_zones), ns_nzones);
1436 	report_nbytes = sizeof(report->descs[0]) * report_nzones_max + sizeof(*report);
1437 	report = calloc(1, report_nbytes);
1438 	if (!report) {
1439 		log_err("spdk/nvme: failed report_zones(): ENOMEM\n");
1440 		err = -ENOMEM;
1441 		goto exit;
1442 	}
1443 
1444 	err = spdk_nvme_zns_report_zones(fio_qpair->ns, tmp_qpair, report, report_nbytes,
1445 					 offset / lba_nbytes, SPDK_NVME_ZRA_LIST_ALL, true, pcu_cb,
1446 					 &completed);
1447 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1448 		log_err("spdk/nvme: report_zones(): err: %d, cpl: %d\n", err, completed);
1449 		err = err ? err : -EIO;
1450 		goto exit;
1451 	}
1452 	assert(report->nr_zones <= report_nzones_max);
1453 	report_nzones = report->nr_zones;
1454 
1455 	for (uint64_t idx = 0; idx < report->nr_zones; ++idx) {
1456 		struct spdk_nvme_zns_zone_desc *zdesc = &report->descs[idx];
1457 
1458 		zbdz[idx].start = zdesc->zslba * lba_nbytes;
1459 		zbdz[idx].len = zsze_nbytes;
1460 		zbdz[idx].capacity = zdesc->zcap * lba_nbytes;
1461 		zbdz[idx].wp = zdesc->wp * lba_nbytes;
1462 
1463 		switch (zdesc->zt) {
1464 		case SPDK_NVME_ZONE_TYPE_SEQWR:
1465 			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
1466 			break;
1467 
1468 		default:
1469 			log_err("spdk/nvme: %s: inv. zone-type: 0x%x\n", f->file_name, zdesc->zt);
1470 			err = -EIO;
1471 			goto exit;
1472 		}
1473 
1474 		switch (zdesc->zs) {
1475 		case SPDK_NVME_ZONE_STATE_EMPTY:
1476 			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
1477 			break;
1478 		case SPDK_NVME_ZONE_STATE_IOPEN:
1479 			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
1480 			break;
1481 		case SPDK_NVME_ZONE_STATE_EOPEN:
1482 			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
1483 			break;
1484 		case SPDK_NVME_ZONE_STATE_CLOSED:
1485 			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
1486 			break;
1487 		case SPDK_NVME_ZONE_STATE_RONLY:
1488 			zbdz[idx].cond = ZBD_ZONE_COND_READONLY;
1489 			break;
1490 		case SPDK_NVME_ZONE_STATE_FULL:
1491 			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
1492 			break;
1493 		case SPDK_NVME_ZONE_STATE_OFFLINE:
1494 			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
1495 			break;
1496 
1497 		default:
1498 			log_err("spdk/nvme: %s: inv. zone-state: 0x%x\n", f->file_name, zdesc->zs);
1499 			err = -EIO;
1500 			goto exit;
1501 		}
1502 	}
1503 
1504 exit:
1505 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1506 	free(report);
1507 
1508 	return err ? err : (int)report_nzones;
1509 }
1510 
1511 static int
1512 spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length)
1513 {
1514 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1515 	struct spdk_fio_qpair *fio_qpair = NULL;
1516 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1517 	uint64_t zsze_nbytes, lba_nbytes;
1518 	int err = 0;
1519 
1520 	fio_qpair = get_fio_qpair(fio_thread, f);
1521 	if (!fio_qpair) {
1522 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1523 		return -ENODEV;
1524 	}
1525 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1526 	if (!zns) {
1527 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1528 		return -EINVAL;
1529 	}
1530 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1531 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1532 
1533 	/** Check the assumption that offset is a valid zone-start LBA */
1534 	if (offset % zsze_nbytes) {
1535 		log_err("spdk/nvme: offset: %zu is not a valid zslba\n", offset);
1536 		return -EINVAL;
1537 	}
1538 
1539 	for (uint64_t cur = offset; cur < offset + length; cur += zsze_nbytes) {
1540 		int completed = 0;
1541 
1542 		err = spdk_nvme_zns_reset_zone(fio_qpair->ns, fio_qpair->qpair, cur / lba_nbytes,
1543 					       false, pcu_cb, &completed);
1544 		if (err || pcu(fio_qpair->qpair, &completed) || completed < 0) {
1545 			log_err("spdk/nvme: zns_reset_zone(): err: %d, cpl: %d\n", err, completed);
1546 			err = err ? err : -EIO;
1547 			break;
1548 		}
1549 	}
1550 
1551 	return err;
1552 }
1553 #endif
1554 
1555 #if FIO_IOOPS_VERSION >= 30
1556 static int
1557 spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
1558 			    unsigned int *max_open_zones)
1559 {
1560 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1561 	struct spdk_fio_qpair *fio_qpair = NULL;
1562 
1563 	fio_qpair = get_fio_qpair(fio_thread, f);
1564 	if (!fio_qpair) {
1565 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1566 		return -ENODEV;
1567 	}
1568 
1569 	*max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(fio_qpair->ns);
1570 
1571 	return 0;
1572 }
1573 #endif
1574 
1575 #if FIO_HAS_FDP
1576 /**
1577  * This is called twice because the number of RUHS descriptors is unknown.
1578  * In the first call fio only sends a buffer to fetch the number of RUHS
1579  * descriptors. In the second call fio sends a buffer large enough to fetch
1580  * all of the RUHS descriptors.
1581  */
1582 static int
1583 spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1584 			struct fio_ruhs_info *fruhs_info)
1585 {
1586 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1587 	struct spdk_fio_qpair *fio_qpair = NULL;
1588 	struct spdk_nvme_qpair *tmp_qpair;
1589 	struct spdk_nvme_fdp_ruhs *fdp_ruhs;
1590 	uint32_t ruhs_nbytes;
1591 	uint16_t idx, nruhsd;
1592 	int completed = 0, err;
1593 
1594 	fio_qpair = get_fio_qpair(fio_thread, f);
1595 	if (!fio_qpair) {
1596 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1597 		return -ENODEV;
1598 	}
1599 
1600 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1601 	 * Create a temporary qpair in order to fetch the RUHS descriptors.
1602 	 */
1603 	assert(!fio_qpair->qpair);
1604 
1605 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1606 	if (!tmp_qpair) {
1607 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1608 		return -EIO;
1609 	}
1610 
1611 	nruhsd = fruhs_info->nr_ruhs;
1612 	ruhs_nbytes = sizeof(*fdp_ruhs) + nruhsd * sizeof(struct spdk_nvme_fdp_ruhs_desc);
1613 	fdp_ruhs = calloc(1, ruhs_nbytes);
1614 	if (!fdp_ruhs) {
1615 		log_err("spdk/nvme: failed fdp_fetch_ruhs(): ENOMEM\n");
1616 		err = -ENOMEM;
1617 		goto exit;
1618 	}
1619 
1620 	err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, fdp_ruhs, ruhs_nbytes,
1621 					    SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
1622 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1623 		log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
1624 		err = err ? err : -EIO;
1625 		goto exit;
1626 	}
1627 
1628 	fruhs_info->nr_ruhs = fdp_ruhs->nruhsd;
1629 	for (idx = 0; idx < nruhsd; idx++) {
1630 		fruhs_info->plis[idx] = fdp_ruhs->ruhs_desc[idx].pid;
1631 	}
1632 
1633 exit:
1634 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1635 	free(fdp_ruhs);
1636 
1637 	return err;
1638 }
1639 #endif
1640 
1641 static void
1642 spdk_fio_cleanup(struct thread_data *td)
1643 {
1644 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1645 	struct spdk_fio_qpair	*fio_qpair, *fio_qpair_tmp;
1646 	struct spdk_fio_options *fio_options = td->eo;
1647 
1648 	if (fio_options->spdk_tracing) {
1649 		spdk_trace_unregister_user_thread();
1650 	}
1651 
1652 	TAILQ_FOREACH_SAFE(fio_qpair, &fio_thread->fio_qpair, link, fio_qpair_tmp) {
1653 		TAILQ_REMOVE(&fio_thread->fio_qpair, fio_qpair, link);
1654 		free(fio_qpair);
1655 	}
1656 
1657 	free(fio_thread->iocq);
1658 	free(fio_thread);
1659 
1660 	pthread_mutex_lock(&g_mutex);
1661 	g_td_count--;
1662 	if (g_td_count == 0) {
1663 		struct spdk_fio_ctrlr	*fio_ctrlr, *fio_ctrlr_tmp;
1664 		struct spdk_nvme_detach_ctx	*detach_ctx = NULL;
1665 
1666 		TAILQ_FOREACH_SAFE(fio_ctrlr, &g_ctrlrs, link, fio_ctrlr_tmp) {
1667 			TAILQ_REMOVE(&g_ctrlrs, fio_ctrlr, link);
1668 			spdk_nvme_detach_async(fio_ctrlr->ctrlr, &detach_ctx);
1669 			free(fio_ctrlr);
1670 		}
1671 
1672 		if (detach_ctx) {
1673 			spdk_nvme_detach_poll(detach_ctx);
1674 		}
1675 
1676 		if (fio_options->enable_vmd) {
1677 			spdk_vmd_fini();
1678 		}
1679 	}
1680 	pthread_mutex_unlock(&g_mutex);
1681 	if (TAILQ_EMPTY(&g_ctrlrs)) {
1682 		if (pthread_cancel(g_ctrlr_thread_id) == 0) {
1683 			pthread_join(g_ctrlr_thread_id, NULL);
1684 		}
1685 	}
1686 }
1687 
1688 /* This table enables the addition of SPDK parameters to the fio config.
1689  * Add new parameters by defining them here and defining a callback
1690  * function to read the parameter value. */
static struct fio_option options[] = {
	{
		.name           = "enable_wrr",
		.lname          = "Enable weighted round robin (WRR) for IO submission queues",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, enable_wrr),
		.def            = "0",
		.help           = "Enable weighted round robin (WRR) for IO submission queues",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "arbitration_burst",
		.lname          = "Arbitration Burst",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, arbitration_burst),
		.def            = "0",
		.help           = "Arbitration Burst used for WRR (valid range 0-7)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "low_weight",
		.lname          = "Low weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, low_weight),
		.def            = "0",
		.help           = "Low weight used for WRR (valid range 0-255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "medium_weight",
		.lname          = "Medium weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, medium_weight),
		.def            = "0",
		.help           = "Medium weight used for WRR (valid range 0-255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "high_weight",
		.lname          = "High weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, high_weight),
		.def            = "0",
		.help           = "High weight used for WRR (valid range 0-255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "wrr_priority",
		.lname          = "Priority used for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, wrr_priority),
		.def            = "0",
		.help           = "Priority used for WRR (valid range 0-3)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
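	/*
	 * A hypothetical job-file snippet enabling WRR arbitration with the
	 * options above; the values are placeholders chosen from within the
	 * documented ranges:
	 *
	 *   enable_wrr=1
	 *   arbitration_burst=7
	 *   low_weight=8
	 *   medium_weight=16
	 *   high_weight=32
	 *   wrr_priority=2
	 */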
	{
		.name		= "mem_size_mb",
		.lname		= "Memory size in MB",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, mem_size),
		.def		= "0",
		.help		= "Memory Size for SPDK (MB)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "shm_id",
		.lname		= "shared memory ID",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, shm_id),
		.def		= "-1",
		.help		= "Shared Memory ID",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "enable_sgl",
		.lname		= "SGL used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_sgl),
		.def		= "0",
		.help		= "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "sge_size",
		.lname		= "SGL size used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, sge_size),
		.def		= "4096",
		.help		= "SGL size in bytes for I/O Commands (default 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "disable_pcie_sgl_merge",
		.lname		= "Disable merging of physically contiguous SGL elements",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, disable_pcie_sgl_merge),
		.def		= "0",
		.help		= "Disable SGL element merging (0=merging, 1=no merging)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "bit_bucket_data_len",
		.lname		= "Amount of data used for Bit Bucket",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, bit_bucket_data_len),
		.def		= "0",
		.help		= "Bit Bucket Data Length for READ commands (disabled by default)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "hostnqn",
		.lname		= "Host NQN to use when connecting to controllers",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, hostnqn),
		.help		= "Host NQN",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_act",
		.lname		= "Protection Information Action",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, pi_act),
		.def		= "1",
		.help		= "Protection Information Action bit (pi_act=1 or pi_act=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_chk",
		.lname		= "Protection Information Check (GUARD|REFTAG|APPTAG)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, pi_chk),
		.def		= NULL,
		.help		= "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "md_per_io_size",
		.lname		= "Separate Metadata Buffer Size per I/O",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, md_per_io_size),
		.def		= "4096",
		.help		= "Size of separate metadata buffer per I/O (Default: 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag",
		.lname		= "Application Tag used in Protection Information",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag),
		.def		= "0x1234",
		.help		= "Application Tag used in Protection Information field (Default: 0x1234)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag_mask",
		.lname		= "Application Tag Mask",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag_mask),
		.def		= "0xffff",
		.help		= "Application Tag Mask used with Application Tag (Default: 0xffff)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "digest_enable",
		.lname		= "PDU digest choice for NVMe/TCP Transport (NONE|HEADER|DATA|BOTH)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, digest_enable),
		.def		= NULL,
		.help		= "Control the NVMe/TCP PDU digest (digest_enable=NONE|HEADER|DATA|BOTH)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
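	/*
	 * digest_enable only applies to NVMe/TCP targets. A fabrics target
	 * is selected through the job's filename; a sketch with a
	 * hypothetical address:
	 *
	 *   filename=trtype=TCP adrfam=IPv4 traddr=192.168.0.10 trsvcid=4420 ns=1
	 */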
	{
		.name		= "enable_vmd",
		.lname		= "Enable VMD enumeration",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_vmd),
		.def		= "0",
		.help		= "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "initial_zone_reset",
		.lname		= "Reset Zones on initialization",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, initial_zone_reset),
		.def		= "0",
		.help		= "Reset Zones on initialization (0=disable, 1=reset all zones)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "zone_append",
		.lname		= "Use zone append instead of write",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, zone_append),
		.def		= "0",
		.help		= "Use zone append instead of write (1=zone append, 0=write)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "print_qid_mappings",
		.lname		= "Print job-to-qid mappings",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, print_qid_mappings),
		.def		= "0",
		.help		= "Print job-to-qid mappings (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "log_flags",
		.lname		= "log_flags",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, log_flags),
		.help		= "Enable log flags (comma-separated list)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "spdk_tracing",
		.lname		= "Enable SPDK Tracing",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, spdk_tracing),
		.def		= "0",
		.help		= "SPDK Tracing (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= NULL,
	},
};
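
/*
 * For reference, a minimal (hypothetical) job file tying several of the
 * options above together; the PCIe address and option values are
 * placeholders:
 *
 *   [global]
 *   ioengine=spdk
 *   thread=1
 *   enable_sgl=1
 *   sge_size=8192
 *   pi_act=0
 *   pi_chk=GUARD
 *
 *   [job0]
 *   filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *   rw=randread
 *   bs=4k
 *   iodepth=32
 */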

/* FIO imports this structure using dlsym */
struct ioengine_ops ioengine = {
	.name			= "spdk",
	.version		= FIO_IOOPS_VERSION,
	.queue			= spdk_fio_queue,
	.getevents		= spdk_fio_getevents,
	.event			= spdk_fio_event,
	.cleanup		= spdk_fio_cleanup,
	.open_file		= spdk_fio_open,
	.close_file		= spdk_fio_close,
	.invalidate		= spdk_fio_invalidate,
	.iomem_alloc		= spdk_fio_iomem_alloc,
	.iomem_free		= spdk_fio_iomem_free,
	.setup			= spdk_fio_setup,
	.init			= spdk_fio_init,
	.io_u_init		= spdk_fio_io_u_init,
	.io_u_free		= spdk_fio_io_u_free,
#if FIO_HAS_ZBD
	.get_zoned_model	= spdk_fio_get_zoned_model,
	.report_zones		= spdk_fio_report_zones,
	.reset_wp		= spdk_fio_reset_wp,
#endif
#if FIO_IOOPS_VERSION >= 30
	.get_max_open_zones	= spdk_fio_get_max_open_zones,
#endif
#if FIO_HAS_FDP
	.fdp_fetch_ruhs		= spdk_fio_fdp_fetch_ruhs,
#endif
#if FIO_HAS_MRT
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO | FIO_MULTI_RANGE_TRIM,
#else
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
#endif
	.options		= options,
	.option_struct_size	= sizeof(struct spdk_fio_options),
};
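
/*
 * fio resolves the "ioengine" symbol above with dlsym() when this plugin is
 * loaded, e.g. via LD_PRELOAD (the path is a placeholder for the actual
 * build output):
 *
 *   LD_PRELOAD=<path to spdk>/build/fio/spdk_nvme fio <job file>
 *
 * fio_init and fio_exit expand to constructor/destructor attributes, so the
 * register/unregister functions below run automatically at load/unload time.
 */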

static void fio_init
fio_spdk_register(void)
{
	register_ioengine(&ioengine);
}

static void fio_exit
fio_spdk_unregister(void)
{
	if (g_spdk_env_initialized) {
		spdk_trace_cleanup();
		spdk_env_fini();
	}

	unregister_ioengine(&ioengine);
}

SPDK_LOG_REGISTER_COMPONENT(fio_nvme)