xref: /spdk/app/fio/nvme/fio_plugin.c (revision 16d862d0380886f6fc765f68a87e240bb4295595)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "spdk/nvme.h"
9 #include "spdk/nvme_zns.h"
10 #include "spdk/vmd.h"
11 #include "spdk/env.h"
12 #include "spdk/string.h"
13 #include "spdk/log.h"
14 #include "spdk/likely.h"
15 #include "spdk/endian.h"
16 #include "spdk/dif.h"
17 #include "spdk/util.h"
18 #include "spdk/trace.h"
19 
20 #include "config-host.h"
21 #include "fio.h"
22 #include "optgroup.h"
23 
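/* fio's ioengine API gains optional features over time, so gate them on
 * FIO_IOOPS_VERSION. The for_each_rw_ddir() macro only exists in recent fio
 * versions, so its presence doubles as a guard that the version checks below
 * are meaningful: zoned block device (ZBD) support requires ioengine API 26,
 * FDP requires 32, and multi-range trim requires 34.
 */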
24 #ifdef for_each_rw_ddir
25 #define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
26 #define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 32)
27 #define FIO_HAS_MRT (FIO_IOOPS_VERSION >= 34)
28 #else
29 #define FIO_HAS_ZBD (0)
30 #define FIO_HAS_FDP (0)
31 #define FIO_HAS_MRT (0)
32 #endif
33 
34 /* FreeBSD is missing CLOCK_MONOTONIC_RAW,
35  * so an alternative is provided. */
36 #ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
37 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
38 #endif
39 
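/* Alignment used for every data and separate-metadata buffer allocated by
 * this plugin (see spdk_fio_iomem_alloc() and spdk_fio_io_u_init()).
 */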
40 #define NVME_IO_ALIGN		4096
41 
42 static bool g_spdk_env_initialized;
43 static bool g_log_flag_error;
44 static int g_spdk_enable_sgl = 0;
45 static uint32_t g_spdk_sge_size = 4096;
46 static uint32_t g_spdk_bit_bucket_data_len = 0;
47 static uint32_t g_spdk_pract_flag;
48 static uint32_t g_spdk_prchk_flags;
49 static uint32_t g_spdk_md_per_io_size = 4096;
50 static uint16_t g_spdk_apptag;
51 static uint16_t g_spdk_apptag_mask;
52 
53 struct spdk_fio_options {
54 	void	*pad;	/* off1 in option descriptions must be non-zero, so pad the first member */
55 	int	enable_wrr;
56 	int	arbitration_burst;
57 	int	low_weight;
58 	int	medium_weight;
59 	int	high_weight;
60 	int	wrr_priority;
61 	int	mem_size;
62 	int	shm_id;
63 	int	enable_sgl;
64 	int	sge_size;
65 	int	bit_bucket_data_len;
66 	char	*hostnqn;
67 	int	pi_act;
68 	char	*pi_chk;
69 	int	md_per_io_size;
70 	int	apptag;
71 	int	apptag_mask;
72 	char	*digest_enable;
73 	int	enable_vmd;
74 	int	initial_zone_reset;
75 	int	zone_append;
76 	int	print_qid_mappings;
77 	int	spdk_tracing;
78 	char	*log_flags;
79 };
80 
81 struct spdk_fio_request {
82 	struct io_u		*io;
83 	/** Offset in the current iovec; fio only uses one iovec */
84 	uint32_t		iov_offset;
85 
86 	/** Amount of data used for Bit Bucket SGL */
87 	uint32_t		bit_bucket_data_len;
88 
89 	/** Context for NVMe PI */
90 	struct spdk_dif_ctx	dif_ctx;
91 	/** Separate metadata buffer pointer */
92 	void			*md_buf;
93 
94 	/** Dataset management range information */
95 	struct spdk_nvme_dsm_range *dsm_range;
96 
97 	struct spdk_fio_thread	*fio_thread;
98 	struct spdk_fio_qpair	*fio_qpair;
99 };
100 
101 struct spdk_fio_ctrlr {
102 	struct spdk_nvme_transport_id	tr_id;
103 	struct spdk_nvme_ctrlr_opts	opts;
104 	struct spdk_nvme_ctrlr		*ctrlr;
105 	TAILQ_ENTRY(spdk_fio_ctrlr)	link;
106 };
107 
108 static TAILQ_HEAD(, spdk_fio_ctrlr) g_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ctrlrs);
109 static int g_td_count;
110 static pthread_t g_ctrlr_thread_id = 0;
111 static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
112 static bool g_error;
113 
114 struct spdk_fio_qpair {
115 	struct fio_file			*f;
116 	struct spdk_nvme_qpair		*qpair;
117 	struct spdk_nvme_ns		*ns;
118 	uint32_t			io_flags;
119 	bool				zone_append_enabled;
120 	bool				nvme_pi_enabled;
121 	/* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
122 	bool				extended_lba;
123 	/* True for protection info transferred at start of metadata,
124 	 * false for protection info transferred at end of metadata, and
125 	 * this is valid only if nvme_pi_enabled is true.
126 	 */
127 	bool				md_start;
128 	TAILQ_ENTRY(spdk_fio_qpair)	link;
129 	struct spdk_fio_ctrlr		*fio_ctrlr;
130 };
131 
132 struct spdk_fio_thread {
133 	struct thread_data		*td;
134 
135 	TAILQ_HEAD(, spdk_fio_qpair)	fio_qpair;
136 	struct spdk_fio_qpair		*fio_qpair_current;	/* the current fio_qpair to be handled. */
137 
138 	struct io_u			**iocq;		/* io completion queue */
139 	unsigned int			iocq_count;	/* number of iocq entries filled by last getevents */
140 	unsigned int			iocq_size;	/* number of iocq entries allocated */
141 
142 };
143 
144 struct spdk_fio_probe_ctx {
145 	struct thread_data	*td;
146 	char			hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
147 	struct fio_file		*f; /* fio_file given by user */
148 };
149 
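/* Admin-queue poller, run on a dedicated background thread spawned from
 * spdk_fio_setup(). Job threads only poll their I/O qpairs, so this loop
 * services admin completions for every attached controller; for fabrics
 * controllers this is also what keeps keep-alive commands flowing.
 */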
150 static void *
151 spdk_fio_poll_ctrlrs(void *arg)
152 {
153 	struct spdk_fio_ctrlr *fio_ctrlr;
154 	int oldstate;
155 	int rc;
156 
157 	/* Loop until the thread is cancelled */
158 	while (true) {
159 		rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
160 		if (rc != 0) {
161 			SPDK_ERRLOG("Unable to disable cancellation on the ctrlr poll thread (%d): %s\n",
162 				    rc, spdk_strerror(rc));
163 		}
164 
165 		pthread_mutex_lock(&g_mutex);
166 
167 		TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
168 			spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
169 		}
170 
171 		pthread_mutex_unlock(&g_mutex);
172 
173 		rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
174 		if (rc != 0) {
175 			SPDK_ERRLOG("Unable to re-enable cancellation on the ctrlr poll thread (%d): %s\n",
176 				    rc, spdk_strerror(rc));
177 		}
178 
179 		/* This is a pthread cancellation point and cannot be removed. */
180 		sleep(1);
181 	}
182 
183 	return NULL;
184 }
185 
186 static bool
187 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
188 	 struct spdk_nvme_ctrlr_opts *opts)
189 {
190 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
191 	struct thread_data *td = ctx->td;
192 	struct spdk_fio_options *fio_options = td->eo;
193 
194 	if (ctx->hostnqn[0] != '\0') {
195 		memcpy(opts->hostnqn, ctx->hostnqn, sizeof(opts->hostnqn));
196 	} else if (fio_options->hostnqn) {
197 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
198 	}
199 
200 	if (fio_options->enable_wrr) {
201 		opts->arb_mechanism		= SPDK_NVME_CC_AMS_WRR;
202 		opts->arbitration_burst		= fio_options->arbitration_burst;
203 		opts->low_priority_weight	= fio_options->low_weight;
204 		opts->medium_priority_weight	= fio_options->medium_weight;
205 		opts->high_priority_weight	= fio_options->high_weight;
206 	}
207 
208 	if (fio_options->digest_enable) {
209 		if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
210 			opts->header_digest = true;
211 		} else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
212 			opts->data_digest = true;
213 		} else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
214 			opts->header_digest = true;
215 			opts->data_digest = true;
216 		}
217 	}
218 
219 	return true;
220 }
221 
222 static struct spdk_fio_ctrlr *
223 get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
224 {
225 	struct spdk_fio_ctrlr	*fio_ctrlr;
226 
227 	TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
228 		if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
229 			return fio_ctrlr;
230 		}
231 	}
232 
233 	return NULL;
234 }
235 
236 /**
237  * Returns the fio_qpair that matches the given fio_file and has an associated namespace
238  */
239 static struct spdk_fio_qpair *
240 get_fio_qpair(struct spdk_fio_thread *fio_thread, struct fio_file *f)
241 {
242 	struct spdk_fio_qpair	*fio_qpair;
243 
244 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
245 		if ((fio_qpair->f == f) && fio_qpair->ns) {
246 			return fio_qpair;
247 		}
248 	}
249 
250 	return NULL;
251 }
252 
253 #if FIO_HAS_ZBD
254 /**
255  * Callback function to use while processing completions until the completion indicator turns non-zero
256  */
257 static void
258 pcu_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
259 {
260 	int *completed = ctx;
261 
262 	*completed = spdk_nvme_cpl_is_error(cpl) ? -1 : 1;
263 }
264 
265 /**
266  * Process Completions Until the given 'completed' indicator turns non-zero or an error occurs
267  */
268 static int32_t
269 pcu(struct spdk_nvme_qpair *qpair, int *completed)
270 {
271 	int32_t ret;
272 
273 	while (!*completed) {
274 		ret = spdk_nvme_qpair_process_completions(qpair, 1);
275 		if (ret < 0) {
276 			log_err("spdk/nvme: process_compl(): ret: %d\n", ret);
277 			return ret;
278 		}
279 	}
280 
281 	return 0;
282 }
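
/* Typical usage of the pcu()/pcu_cb() pair, mirroring the call sites below
 * (an illustrative sketch, not additional plugin logic):
 *
 *	int completed = 0, err;
 *
 *	err = spdk_nvme_zns_reset_zone(ns, qpair, zslba, false, pcu_cb, &completed);
 *	if (err || pcu(qpair, &completed) || completed < 0) {
 *		... handle the submission or completion error ...
 *	}
 */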
283 #endif
284 
285 static inline uint32_t
286 _nvme_get_host_buffer_sector_size(struct spdk_nvme_ns *ns, uint32_t io_flags)
287 {
288 	bool md_excluded_from_xfer = false;
289 	uint32_t md_size;
290 	uint32_t ns_flags;
291 
292 	ns_flags = spdk_nvme_ns_get_flags(ns);
293 	md_size = spdk_nvme_ns_get_md_size(ns);
294 
295 	/* For the extended LBA format, if the metadata size is 8 bytes and PRACT is
296 	 * enabled (the controller inserts/strips PI), the metadata is not carried in
297 	 * the host buffer, so exclude it from the block size reported here.
298 	 */
299 	md_excluded_from_xfer = ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
300 				 (ns_flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
301 				 (ns_flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
302 				 (md_size == 8));
303 
304 	return md_excluded_from_xfer ? spdk_nvme_ns_get_sector_size(ns) :
305 	       spdk_nvme_ns_get_extended_sector_size(ns);
306 }
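
/* Worked example: a namespace formatted with 512-byte LBAs plus 8 bytes of PI
 * metadata in extended LBA format has a 520-byte on-device block. With PRACT
 * set, the controller inserts/strips the PI, so the function above reports
 * 512; without PRACT the host buffer must carry the full 520 bytes per block.
 */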
307 
308 static void
309 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
310 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
311 {
312 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
313 	struct thread_data	*td = ctx->td;
314 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
315 	struct spdk_fio_ctrlr	*fio_ctrlr;
316 	struct spdk_fio_qpair	*fio_qpair;
317 	struct spdk_nvme_ns	*ns;
318 	const struct spdk_nvme_ns_data	*nsdata;
319 	struct fio_file		*f = ctx->f;
320 	uint32_t		ns_id;
321 	char			*p;
322 	long int		tmp;
323 	uint32_t		block_size;
324 	struct spdk_fio_options *fio_options = td->eo;
325 
326 	p = strstr(f->file_name, "ns=");
327 	if (p != NULL) {
328 		tmp = spdk_strtol(p + 3, 10);
329 		if (tmp <= 0) {
330 			SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp);
331 			g_error = true;
332 			return;
333 		}
334 		ns_id = (uint32_t)tmp;
335 	} else {
336 		ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
337 		if (ns_id == 0) {
338 			/* The ctrlr has no active namespaces and we didn't specify any, so there is nothing to do. */
339 			return;
340 		}
341 	}
342 
343 	pthread_mutex_lock(&g_mutex);
344 	fio_ctrlr = get_fio_ctrlr(trid);
345 	/* it is a new ctrlr and needs to be added */
346 	if (!fio_ctrlr) {
347 		/* Create an fio_ctrlr and add it to the list */
348 		fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
349 		if (!fio_ctrlr) {
350 			SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
351 			g_error = true;
352 			pthread_mutex_unlock(&g_mutex);
353 			return;
354 		}
355 		fio_ctrlr->opts = *opts;
356 		fio_ctrlr->ctrlr = ctrlr;
357 		fio_ctrlr->tr_id = *trid;
358 		TAILQ_INSERT_TAIL(&g_ctrlrs, fio_ctrlr, link);
359 	}
360 	pthread_mutex_unlock(&g_mutex);
361 
362 	ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
363 	if (ns == NULL) {
364 		SPDK_ERRLOG("Cannot get namespace by ns_id=%d\n", ns_id);
365 		g_error = true;
366 		return;
367 	}
368 
369 	if (!spdk_nvme_ns_is_active(ns)) {
370 		SPDK_ERRLOG("Inactive namespace by ns_id=%d\n", ns_id);
371 		g_error = true;
372 		return;
373 	}
374 	nsdata = spdk_nvme_ns_get_data(ns);
375 
376 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
377 		if ((fio_qpair->f == f) ||
378 		    ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
379 		     (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
380 			/* Not an error; just avoid a duplicate connection */
381 			return;
382 		}
383 	}
384 
385 	/* create a new qpair */
386 	fio_qpair = calloc(1, sizeof(*fio_qpair));
387 	if (!fio_qpair) {
388 		g_error = true;
389 		SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
390 		return;
391 	}
392 
393 	f->engine_data = fio_qpair;
394 	fio_qpair->ns = ns;
395 	fio_qpair->f = f;
396 	fio_qpair->fio_ctrlr = fio_ctrlr;
397 	TAILQ_INSERT_TAIL(&fio_thread->fio_qpair, fio_qpair, link);
398 
399 	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
400 		assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
401 		fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
402 		fio_qpair->nvme_pi_enabled = true;
403 		fio_qpair->md_start = nsdata->dps.md_start;
404 		fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
405 		fprintf(stdout, "PI type %u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
406 			fio_qpair->extended_lba ? "extended lba" : "separate metadata");
407 	}
408 
409 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
410 	for_each_rw_ddir(ddir) {
411 		if (td->o.min_bs[ddir] % block_size != 0 || td->o.max_bs[ddir] % block_size != 0) {
412 			if (spdk_nvme_ns_supports_extended_lba(ns)) {
413 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of (LBA data size + Metadata size)\n");
414 			} else {
415 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of LBA data size\n");
416 			}
417 			g_error = true;
418 			return;
419 		}
420 	}
421 
422 	if (fio_options->zone_append && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
423 		if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED) {
424 			SPDK_DEBUGLOG(fio_nvme, "Using zone appends instead of writes on: '%s'\n",
425 				      f->file_name);
426 			fio_qpair->zone_append_enabled = true;
427 		} else {
428 			SPDK_WARNLOG("Falling back to writes on: '%s' - ns lacks zone append cmd\n",
429 				     f->file_name);
430 		}
431 	}
432 
433 #if FIO_HAS_ZBD
434 	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD) {
435 		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
436 	}
437 #endif
438 
439 	if (fio_options->initial_zone_reset == 1 && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
440 #if FIO_HAS_ZBD
441 		struct spdk_nvme_qpair *tmp_qpair;
442 		int completed = 0, err;
443 
444 		/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
445 		 * Create a temporary qpair in order to perform the initial zone reset.
446 		 */
447 		assert(!fio_qpair->qpair);
448 
449 		tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
450 		if (!tmp_qpair) {
451 			SPDK_ERRLOG("Cannot allocate a temporary qpair\n");
452 			g_error = true;
453 			return;
454 		}
455 
456 		err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
457 		if (err || pcu(tmp_qpair, &completed) || completed < 0) {
458 			log_err("spdk/nvme: warn: initial_zone_reset: err: %d, cpl: %d\n",
459 				err, completed);
460 		}
461 
462 		spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
463 #else
464 		log_err("spdk/nvme: ZBD/ZNS is not supported\n");
465 #endif
466 	}
467 
468 	f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
469 	if (f->real_file_size <= 0) {
470 		g_error = true;
471 		SPDK_ERRLOG("Cannot get namespace size by ns=%p\n", ns);
472 		return;
473 	}
474 
475 	f->filetype = FIO_TYPE_BLOCK;
476 	fio_file_set_size_known(f);
477 }
478 
479 static void
480 parse_prchk_flags(const char *prchk_str)
481 {
482 	if (!prchk_str) {
483 		return;
484 	}
485 
486 	if (strstr(prchk_str, "GUARD") != NULL) {
487 		g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
488 	}
489 	if (strstr(prchk_str, "REFTAG") != NULL) {
490 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
491 	}
492 	if (strstr(prchk_str, "APPTAG") != NULL) {
493 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
494 	}
495 }
496 
497 static void
498 parse_pract_flag(int pract)
499 {
500 	if (pract == 1) {
501 		g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
502 	} else {
503 		g_spdk_pract_flag = 0;
504 	}
505 }
506 
507 static bool
508 fio_redirected_to_dev_null(void)
509 {
510 	char path[PATH_MAX] = "";
511 	ssize_t ret;
512 
513 	ret = readlink("/proc/self/fd/1", path, sizeof(path));
514 
515 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
516 		return false;
517 	}
518 
519 	ret = readlink("/proc/self/fd/2", path, sizeof(path));
520 
521 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
522 		return false;
523 	}
524 
525 	return true;
526 }
527 
528 static int
529 spdk_fio_init(struct thread_data *td)
530 {
531 	int ret = 0;
532 	struct spdk_fio_options *fio_options = td->eo;
533 
534 	if (fio_options->spdk_tracing) {
535 		ret = spdk_trace_register_user_thread();
536 	}
537 
538 	return ret;
539 }
540 
541 /* Called once at initialization. This is responsible for gathering the size of
542  * each "file", which in our case takes the form
543  * 'key=value [key=value] ... ns=value'
544  * For example, for a local PCIe NVMe device: 'trtype=PCIe traddr=0000.04.00.0 ns=1'.
545  * For a remote device exported by an NVMe-oF target: 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1' */
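/* An illustrative job file using this naming scheme (assuming the plugin is
 * loaded, e.g. via LD_PRELOAD, and selected with ioengine=spdk; thread=1 is
 * mandatory, as checked in spdk_fio_setup()):
 *
 *	[global]
 *	ioengine=spdk
 *	thread=1
 *
 *	[job0]
 *	filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 */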
546 static int
547 spdk_fio_setup(struct thread_data *td)
548 {
549 	struct spdk_fio_thread *fio_thread;
550 	struct spdk_fio_options *fio_options = td->eo;
551 	struct spdk_fio_probe_ctx ctx;
552 	struct spdk_env_opts opts;
553 	struct fio_file *f;
554 	char *p;
555 	int rc = 0;
556 	struct spdk_nvme_transport_id trid;
557 	struct spdk_fio_ctrlr *fio_ctrlr;
558 	char *trid_info;
559 	unsigned int i;
560 	size_t size;
561 
562 	/*
563 	 * If we're running in a daemonized FIO instance, it's possible
564 	 * fd 1/2 were re-used for something important by FIO. Newer fio
565 	 * versions are careful to redirect those to /dev/null, but if we're
566 	 * not, we'll abort early, so we don't accidentally write messages to
567 	 * an important file, etc.
568 	 */
569 	if (is_backend && !fio_redirected_to_dev_null()) {
570 		char buf[1024];
571 		snprintf(buf, sizeof(buf),
572 			 "SPDK FIO plugin is in daemon mode, but stdout/stderr "
573 			 "aren't redirected to /dev/null. Aborting.");
574 		fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
575 		return -1;
576 	}
577 
578 	if (!td->o.use_thread) {
579 		log_err("spdk: must set thread=1 when using spdk plugin\n");
580 		return 1;
581 	}
582 
583 	if (g_log_flag_error) {
584 		/* The first thread found an error when parsing log flags, so
585 		 * just return error immediately for all of the other threads.
586 		 */
587 		return 1;
588 	}
589 
590 	pthread_mutex_lock(&g_mutex);
591 
592 	fio_thread = calloc(1, sizeof(*fio_thread));
593 	assert(fio_thread != NULL);
594 
595 	td->io_ops_data = fio_thread;
596 	fio_thread->td = td;
597 
598 	fio_thread->iocq_size = td->o.iodepth;
599 	fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
600 	assert(fio_thread->iocq != NULL);
601 
602 	TAILQ_INIT(&fio_thread->fio_qpair);
603 
604 	if (!g_spdk_env_initialized) {
605 		spdk_env_opts_init(&opts);
606 		opts.name = "fio";
607 		opts.mem_size = fio_options->mem_size;
608 		opts.shm_id = fio_options->shm_id;
609 		g_spdk_enable_sgl = fio_options->enable_sgl;
610 		g_spdk_sge_size = fio_options->sge_size;
611 		g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
612 		parse_pract_flag(fio_options->pi_act);
613 		g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
614 		g_spdk_apptag = (uint16_t)fio_options->apptag;
615 		g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
616 		parse_prchk_flags(fio_options->pi_chk);
617 		if (spdk_env_init(&opts) < 0) {
618 			SPDK_ERRLOG("Unable to initialize SPDK env\n");
619 			free(fio_thread->iocq);
620 			free(fio_thread);
621 			fio_thread = NULL;
622 			pthread_mutex_unlock(&g_mutex);
623 			return 1;
624 		}
625 
626 		if (fio_options->log_flags) {
627 			char *tok = strtok(fio_options->log_flags, ",");
628 			do {
629 				rc = spdk_log_set_flag(tok);
630 				if (rc < 0) {
631 					SPDK_ERRLOG("unknown log flag %s\n", tok);
632 					g_log_flag_error = true;
633 					return 1;
634 				}
635 			} while ((tok = strtok(NULL, ",")) != NULL);
636 #ifdef DEBUG
637 			spdk_log_set_print_level(SPDK_LOG_DEBUG);
638 #endif
639 		}
640 
641 		g_spdk_env_initialized = true;
642 		spdk_unaffinitize_thread();
643 
644 		if (fio_options->spdk_tracing) {
645 			spdk_trace_init("spdk_fio_tracepoints", 65536, td->o.numjobs);
646 			spdk_trace_enable_tpoint_group("nvme_pcie");
647 			spdk_trace_enable_tpoint_group("nvme_tcp");
648 		}
649 
650 		/* Spawn a thread to continue polling the controllers */
651 		rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
652 		if (rc != 0) {
653 			SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
654 		}
655 
656 		if (fio_options->enable_vmd && spdk_vmd_init()) {
657 			SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices may be unavailable.\n");
658 		}
659 	}
660 	pthread_mutex_unlock(&g_mutex);
661 
662 	for_each_file(td, f, i) {
663 		memset(&trid, 0, sizeof(trid));
664 		memset(&ctx, 0, sizeof(ctx));
665 
666 		trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
667 
668 		p = strstr(f->file_name, " ns=");
669 		if (p != NULL) {
670 			trid_info = strndup(f->file_name, p - f->file_name);
671 		} else {
672 			trid_info = strndup(f->file_name, strlen(f->file_name));
673 		}
674 
675 		if (!trid_info) {
676 			SPDK_ERRLOG("Failed to allocate space for trid_info\n");
677 			continue;
678 		}
679 
680 		rc = spdk_nvme_transport_id_parse(&trid, trid_info);
681 		if (rc < 0) {
682 			SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
683 			free(trid_info);
684 			continue;
685 		}
686 		free(trid_info);
687 
688 		if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
689 			struct spdk_pci_addr pci_addr;
690 			if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
691 				SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
692 				continue;
693 			}
694 			spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
695 		} else {
696 			if (trid.subnqn[0] == '\0') {
697 				snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
698 					 SPDK_NVMF_DISCOVERY_NQN);
699 			}
700 			if ((p = strcasestr(f->file_name, "hostnqn:")) ||
701 			    (p = strcasestr(f->file_name, "hostnqn="))) {
702 				p += strlen("hostnqn:");
703 				size = strcspn(p, " \t\n");
704 				if (size >= sizeof(ctx.hostnqn)) { /* leave room for the NUL terminator */
705 					SPDK_ERRLOG("Invalid hostnqn: too long\n");
706 					continue;
707 				}
708 				memcpy(ctx.hostnqn, p, size);
709 			}
710 		}
711 
712 		ctx.td = td;
713 		ctx.f = f;
714 
715 		pthread_mutex_lock(&g_mutex);
716 		fio_ctrlr = get_fio_ctrlr(&trid);
717 		pthread_mutex_unlock(&g_mutex);
718 		if (fio_ctrlr) {
719 			attach_cb(&ctx, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
720 		} else {
721 			/* Enumerate all of the controllers */
722 			if (spdk_nvme_probe(&trid, &ctx, probe_cb, attach_cb, NULL) != 0) {
723 				SPDK_ERRLOG("spdk_nvme_probe() failed\n");
724 				continue;
725 			}
726 		}
727 
728 		if (g_error) {
729 			log_err("Failed to initialize spdk fio plugin\n");
730 			rc = 1;
731 			break;
732 		}
733 	}
734 
735 	pthread_mutex_lock(&g_mutex);
736 	g_td_count++;
737 	pthread_mutex_unlock(&g_mutex);
738 
739 	return rc;
740 }
741 
742 static int
743 spdk_fio_open(struct thread_data *td, struct fio_file *f)
744 {
745 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
746 	struct spdk_fio_ctrlr *fio_ctrlr = fio_qpair->fio_ctrlr;
747 	struct spdk_fio_options *fio_options = td->eo;
748 	struct spdk_nvme_io_qpair_opts	qpopts;
749 
750 	assert(fio_qpair->qpair == NULL);
751 	spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
752 	qpopts.delay_cmd_submit = true;
753 	if (fio_options->enable_wrr) {
754 		qpopts.qprio = fio_options->wrr_priority;
755 	}
756 
757 	fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
758 	if (!fio_qpair->qpair) {
759 		SPDK_ERRLOG("Cannot allocate any more NVMe I/O qpairs\n");
760 		g_error = true;
761 		free(fio_qpair);
762 		return -1;
763 	}
764 
765 	if (fio_options->print_qid_mappings == 1) {
766 		log_info("job %s: %s qid %d\n", td->o.name, f->file_name,
767 			 spdk_nvme_qpair_get_id(fio_qpair->qpair));
768 	}
769 
770 	return 0;
771 }
772 
773 static int
774 spdk_fio_close(struct thread_data *td, struct fio_file *f)
775 {
776 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
777 
778 	assert(fio_qpair->qpair != NULL);
779 	spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
780 	fio_qpair->qpair = NULL;
781 	return 0;
782 }
783 
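/* Replace fio's normal buffer allocation: io_u data buffers must come from
 * SPDK's DMA-safe memory so that they can be handed directly to the NVMe
 * driver.
 */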
784 static int
785 spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
786 {
787 	td->orig_buffer = spdk_dma_zmalloc(total_mem, NVME_IO_ALIGN, NULL);
788 	return td->orig_buffer == NULL;
789 }
790 
791 static void
792 spdk_fio_iomem_free(struct thread_data *td)
793 {
794 	spdk_dma_free(td->orig_buffer);
795 }
796 
797 static int
798 spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
799 {
800 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
801 	struct spdk_fio_request	*fio_req;
802 	uint32_t dsm_size;
803 
804 	io_u->engine_data = NULL;
805 
806 	fio_req = calloc(1, sizeof(*fio_req));
807 	if (fio_req == NULL) {
808 		return 1;
809 	}
810 
811 	if (!(td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)) {
812 #if FIO_HAS_MRT
813 		/* By default the number of ranges is 1 */
814 		dsm_size = td->o.num_range * sizeof(struct spdk_nvme_dsm_range);
815 #else
816 		dsm_size = sizeof(struct spdk_nvme_dsm_range);
817 #endif
818 		fio_req->dsm_range = calloc(1, dsm_size);
819 		if (fio_req->dsm_range == NULL) {
820 			free(fio_req);
821 			return 1;
822 		}
823 	}
824 
825 	fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
826 	if (fio_req->md_buf == NULL) {
827 		fprintf(stderr, "Failed to allocate %u bytes for the metadata buffer\n", g_spdk_md_per_io_size);
828 		free(fio_req->dsm_range);
829 		free(fio_req);
830 		return 1;
831 	}
832 
833 	fio_req->io = io_u;
834 	fio_req->fio_thread = fio_thread;
835 
836 	io_u->engine_data = fio_req;
837 
838 	return 0;
839 }
840 
841 static void
842 spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
843 {
844 	struct spdk_fio_request *fio_req = io_u->engine_data;
845 
846 	if (fio_req) {
847 		assert(fio_req->io == io_u);
848 		spdk_dma_free(fio_req->md_buf);
849 		free(fio_req->dsm_range);
850 		free(fio_req);
851 		io_u->engine_data = NULL;
852 	}
853 }
854 
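/* Map a byte offset to the starting LBA (ZSLBA) of the zone containing it:
 * offset / zone-size-in-bytes is the zone index, and multiplying that by the
 * zone size in sectors yields the zone's first LBA.
 */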
855 static inline uint64_t
856 fio_offset_to_zslba(unsigned long long offset, struct spdk_nvme_ns *ns)
857 {
858 	return (offset / spdk_nvme_zns_ns_get_zone_size(ns)) * spdk_nvme_zns_ns_get_zone_size_sectors(ns);
859 }
860 
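/* PI setup helpers: in the DIF (extended LBA) case the protection information
 * is interleaved with the data in io_u->buf, so it is generated/verified over
 * the data buffer itself; in the DIX (separate metadata) case it lives in the
 * per-request md_buf instead.
 */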
861 static int
862 fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
863 {
864 	struct spdk_nvme_ns *ns = fio_qpair->ns;
865 	struct spdk_fio_request *fio_req = io_u->engine_data;
866 	uint32_t md_size, extended_lba_size, lba_count;
867 	uint64_t lba;
868 	struct iovec iov;
869 	int rc;
870 	struct spdk_dif_ctx_init_ext_opts dif_opts;
871 
872 	/* Set appmask and apptag when PRACT is enabled */
873 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
874 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
875 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
876 		return 0;
877 	}
878 
879 	extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
880 	md_size = spdk_nvme_ns_get_md_size(ns);
881 	lba = io_u->offset / extended_lba_size;
882 	lba_count = io_u->xfer_buflen / extended_lba_size;
883 
884 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
885 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
886 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
887 			       true, fio_qpair->md_start,
888 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
889 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
890 			       0, 0, &dif_opts);
891 	if (rc != 0) {
892 		fprintf(stderr, "Initialization of DIF context failed\n");
893 		return rc;
894 	}
895 
896 	if (io_u->ddir != DDIR_WRITE) {
897 		return 0;
898 	}
899 
900 	iov.iov_base = io_u->buf;
901 	iov.iov_len = io_u->xfer_buflen;
902 	rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
903 	if (rc != 0) {
904 		fprintf(stderr, "Generation of DIF failed\n");
905 	}
906 
907 	return rc;
908 }
909 
910 static int
911 fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
912 {
913 	struct spdk_nvme_ns *ns = fio_qpair->ns;
914 	struct spdk_fio_request *fio_req = io_u->engine_data;
915 	uint32_t md_size, block_size, lba_count;
916 	uint64_t lba;
917 	struct iovec iov, md_iov;
918 	int rc;
919 	struct spdk_dif_ctx_init_ext_opts dif_opts;
920 
921 	/* Set appmask and apptag when PRACT is enabled */
922 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
923 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
924 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
925 		return 0;
926 	}
927 
928 	block_size = spdk_nvme_ns_get_sector_size(ns);
929 	md_size = spdk_nvme_ns_get_md_size(ns);
930 	lba = io_u->offset / block_size;
931 	lba_count = io_u->xfer_buflen / block_size;
932 
933 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
934 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
935 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
936 			       false, fio_qpair->md_start,
937 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
938 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
939 			       0, 0, &dif_opts);
940 	if (rc != 0) {
941 		fprintf(stderr, "Initialization of DIF context failed\n");
942 		return rc;
943 	}
944 
945 	if (io_u->ddir != DDIR_WRITE) {
946 		return 0;
947 	}
948 
949 	iov.iov_base = io_u->buf;
950 	iov.iov_len = io_u->xfer_buflen;
951 	md_iov.iov_base = fio_req->md_buf;
952 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
953 	rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
954 	if (rc < 0) {
955 		fprintf(stderr, "Generation of DIX failed\n");
956 	}
957 
958 	return rc;
959 }
960 
961 static int
962 fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
963 {
964 	struct spdk_nvme_ns *ns = fio_qpair->ns;
965 	struct spdk_fio_request *fio_req = io_u->engine_data;
966 	uint32_t lba_count;
967 	struct iovec iov;
968 	struct spdk_dif_error err_blk = {};
969 	int rc;
970 
971 	/* Do nothing when PRACT is enabled */
972 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
973 		return 0;
974 	}
975 
976 	iov.iov_base = io_u->buf;
977 	iov.iov_len = io_u->xfer_buflen;
978 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);
979 
980 	rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
981 	if (rc != 0) {
982 		fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
983 			err_blk.err_type, err_blk.err_offset);
984 	}
985 
986 	return rc;
987 }
988 
989 static int
990 fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
991 {
992 	struct spdk_nvme_ns *ns = fio_qpair->ns;
993 	struct spdk_fio_request *fio_req = io_u->engine_data;
994 	uint32_t md_size, lba_count;
995 	struct iovec iov, md_iov;
996 	struct spdk_dif_error err_blk = {};
997 	int rc;
998 
999 	/* Do nothing when PRACT is enabled */
1000 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
1001 		return 0;
1002 	}
1003 
1004 	iov.iov_base = io_u->buf;
1005 	iov.iov_len = io_u->xfer_buflen;
1006 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
1007 	md_size = spdk_nvme_ns_get_md_size(ns);
1008 	md_iov.iov_base = fio_req->md_buf;
1009 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
1010 
1011 	rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
1012 	if (rc != 0) {
1013 		fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
1014 			err_blk.err_type, err_blk.err_offset);
1015 	}
1016 
1017 	return rc;
1018 }
1019 
1020 static void
1021 spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
1022 {
1023 	struct spdk_fio_request		*fio_req = ctx;
1024 	struct spdk_fio_thread		*fio_thread = fio_req->fio_thread;
1025 	struct spdk_fio_qpair		*fio_qpair = fio_req->fio_qpair;
1026 	int				rc;
1027 
1028 	if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
1029 		if (fio_qpair->extended_lba) {
1030 			rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
1031 		} else {
1032 			rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
1033 		}
1034 		if (rc != 0) {
1035 			fio_req->io->error = abs(rc);
1036 		}
1037 	}
1038 
1039 	if (spdk_nvme_cpl_is_error(cpl)) {
1040 		fio_req->io->error = EIO;
1041 	}
1042 
1043 	assert(fio_thread->iocq_count < fio_thread->iocq_size);
1044 	fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
1045 }
1046 
1047 static void
1048 spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
1049 {
1050 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1051 
1052 	fio_req->iov_offset = sgl_offset;
1053 	fio_req->bit_bucket_data_len = 0;
1054 }
1055 
1056 static int
1057 spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
1058 {
1059 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1060 	struct io_u *io_u = fio_req->io;
1061 	uint32_t iov_len;
1062 	uint32_t bit_bucket_len;
1063 
1064 	*address = io_u->buf;
1065 
1066 	if (fio_req->iov_offset) {
1067 		assert(fio_req->iov_offset <= io_u->xfer_buflen);
1068 		*address += fio_req->iov_offset;
1069 	}
1070 
1071 	iov_len = io_u->xfer_buflen - fio_req->iov_offset;
1072 	if (iov_len > g_spdk_sge_size) {
1073 		iov_len = g_spdk_sge_size;
1074 	}
1075 
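	/* For reads, describe the first g_spdk_bit_bucket_data_len bytes of the
	 * transfer as a bit bucket: the sentinel UINT64_MAX address makes the
	 * NVMe driver emit a Bit Bucket SGL descriptor, i.e. the controller
	 * discards that portion of the data instead of transferring it.
	 */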
1076 	if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
1077 		assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
1078 		*address = (void *)UINT64_MAX;
1079 		bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
1080 		if (iov_len > bit_bucket_len) {
1081 			iov_len = bit_bucket_len;
1082 		}
1083 		fio_req->bit_bucket_data_len += iov_len;
1084 	}
1085 
1086 	fio_req->iov_offset += iov_len;
1087 	*length = iov_len;
1088 
1089 	return 0;
1090 }
1091 
1092 #if FIO_IOOPS_VERSION >= 24
1093 typedef enum fio_q_status fio_q_status_t;
1094 #else
1095 typedef int fio_q_status_t;
1096 #endif
1097 
1098 static fio_q_status_t
1099 spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
1100 {
1101 	int rc = 1;
1102 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1103 	struct spdk_fio_request	*fio_req = io_u->engine_data;
1104 	struct spdk_fio_qpair	*fio_qpair;
1105 	struct spdk_nvme_ns	*ns = NULL;
1106 	void			*md_buf = NULL;
1107 	struct spdk_dif_ctx	*dif_ctx = &fio_req->dif_ctx;
1108 #if FIO_HAS_FDP
1109 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
1110 #endif
1111 	struct spdk_nvme_dsm_range *range;
1112 	uint32_t		block_size;
1113 	uint64_t		lba;
1114 	uint32_t		lba_count;
1115 	uint32_t		num_range;
1116 
1117 	fio_qpair = get_fio_qpair(fio_thread, io_u->file);
1118 	if (fio_qpair == NULL) {
1119 		return -ENXIO;
1120 	}
1121 	ns = fio_qpair->ns;
1122 
1123 	if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
1124 		md_buf = fio_req->md_buf;
1125 	}
1126 	fio_req->fio_qpair = fio_qpair;
1127 
1128 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
1129 	lba = io_u->offset / block_size;
1130 	lba_count = io_u->xfer_buflen / block_size;
1131 
1132 #if FIO_HAS_FDP
1133 	/* Write commands with directives are only supported in SGL mode */
1134 	if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
1135 		log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
1136 		io_u->error = EINVAL; /* fio expects a positive errno here */
1137 		return FIO_Q_COMPLETED;
1138 	}
1139 #endif
1140 
1141 	/* TODO: consider the case where fio randomizes and verifies io_u */
1142 	if (fio_qpair->nvme_pi_enabled) {
1143 		if (fio_qpair->extended_lba) {
1144 			rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
1145 		} else {
1146 			rc = fio_separate_md_setup_pi(fio_qpair, io_u);
1147 		}
1148 		if (rc < 0) {
1149 			io_u->error = -rc;
1150 			return FIO_Q_COMPLETED;
1151 		}
1152 	}
1153 
1154 	switch (io_u->ddir) {
1155 	case DDIR_READ:
1156 		if (!g_spdk_enable_sgl) {
1157 			rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
1158 							   spdk_fio_completion_cb, fio_req,
1159 							   fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1160 		} else {
1161 			rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
1162 							    lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1163 							    spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1164 							    dif_ctx->apptag_mask, dif_ctx->app_tag);
1165 		}
1166 		break;
1167 	case DDIR_WRITE:
1168 		if (!g_spdk_enable_sgl) {
1169 			if (!fio_qpair->zone_append_enabled) {
1170 				rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
1171 								    lba_count,
1172 								    spdk_fio_completion_cb, fio_req,
1173 								    fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1174 			} else {
1175 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1176 				rc = spdk_nvme_zns_zone_append_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, zslba,
1177 								       lba_count,
1178 								       spdk_fio_completion_cb, fio_req,
1179 								       fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1180 			}
1181 		} else {
1182 			if (!fio_qpair->zone_append_enabled) {
1183 #if FIO_HAS_FDP
1184 				if (spdk_unlikely(io_u->dtype)) {
1185 					ext_opts.size = SPDK_SIZEOF(&ext_opts, cdw13);
1186 					ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
1187 					ext_opts.metadata = md_buf;
1188 					ext_opts.cdw13 = (io_u->dspec << 16);
1189 					ext_opts.apptag = dif_ctx->app_tag;
1190 					ext_opts.apptag_mask = dif_ctx->apptag_mask;
1191 					rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
1192 									 spdk_fio_completion_cb, fio_req,
1193 									 spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
1194 					break;
1195 				}
1196 #endif
1197 				rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
1198 								     lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1199 								     spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1200 								     dif_ctx->apptag_mask, dif_ctx->app_tag);
1201 			} else {
1202 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1203 				rc = spdk_nvme_zns_zone_appendv_with_md(ns, fio_qpair->qpair, zslba,
1204 									lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1205 									spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1206 									dif_ctx->apptag_mask, dif_ctx->app_tag);
1207 			}
1208 		}
1209 		break;
1210 	case DDIR_TRIM:
1211 		if (td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
1212 			do_io_u_trim(td, io_u);
1213 			io_u_mark_submit(td, 1);
1214 			io_u_mark_complete(td, 1);
1215 			return FIO_Q_COMPLETED;
1216 		}
1217 
1218 		range = fio_req->dsm_range;
1219 #if FIO_HAS_MRT
1220 		if (td->o.num_range == 1) {
1221 			range->attributes.raw = 0;
1222 			range->length = lba_count;
1223 			range->starting_lba = lba;
1224 			num_range = 1;
1225 		} else {
1226 			struct trim_range *tr = (struct trim_range *)io_u->xfer_buf;
1227 			for (uint32_t i = 0; i < io_u->number_trim; i++) {
1228 				range->attributes.raw = 0;
1229 				range->length = tr->len / block_size;
1230 				range->starting_lba = tr->start / block_size;
1231 				range++;
1232 				tr++;
1233 			}
1234 			num_range = io_u->number_trim;
1235 			range = fio_req->dsm_range;
1236 		}
1237 #else
1238 		range->attributes.raw = 0;
1239 		range->length = lba_count;
1240 		range->starting_lba = lba;
1241 		num_range = 1;
1242 #endif
1243 
1244 		rc = spdk_nvme_ns_cmd_dataset_management(ns, fio_qpair->qpair,
1245 				SPDK_NVME_DSM_ATTR_DEALLOCATE, range, num_range,
1246 				spdk_fio_completion_cb, fio_req);
1247 		break;
1248 	default:
1249 		assert(false);
1250 		break;
1251 	}
1252 
1253 	/* NVMe read/write functions return -ENOMEM if there are no free requests. */
1254 	if (rc == -ENOMEM) {
1255 		return FIO_Q_BUSY;
1256 	}
1257 
1258 	if (rc != 0) {
1259 		io_u->error = abs(rc);
1260 		return FIO_Q_COMPLETED;
1261 	}
1262 
1263 	return FIO_Q_QUEUED;
1264 }
1265 
1266 static struct io_u *
1267 spdk_fio_event(struct thread_data *td, int event)
1268 {
1269 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1270 
1271 	assert(event >= 0);
1272 	assert((unsigned)event < fio_thread->iocq_count);
1273 	return fio_thread->iocq[event];
1274 }
1275 
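/* Reap completions by polling this thread's qpairs round-robin, resuming from
 * fio_qpair_current so that no qpair is starved when 'min' completions are
 * already reached early in the list.
 */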
1276 static int
1277 spdk_fio_getevents(struct thread_data *td, unsigned int min,
1278 		   unsigned int max, const struct timespec *t)
1279 {
1280 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1281 	struct spdk_fio_qpair *fio_qpair = NULL;
1282 	struct timespec t0, t1;
1283 	uint64_t timeout = 0;
1284 
1285 	if (t) {
1286 		timeout = t->tv_sec * 1000000000L + t->tv_nsec;
1287 		clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
1288 	}
1289 
1290 	fio_thread->iocq_count = 0;
1291 
1292 	/* fetch the next qpair */
1293 	if (fio_thread->fio_qpair_current) {
1294 		fio_qpair = TAILQ_NEXT(fio_thread->fio_qpair_current, link);
1295 	}
1296 
1297 	for (;;) {
1298 		if (fio_qpair == NULL) {
1299 			fio_qpair = TAILQ_FIRST(&fio_thread->fio_qpair);
1300 		}
1301 
1302 		while (fio_qpair != NULL) {
1303 			/*
1304 			 * We can be called while spdk_fio_open()s are still
1305 			 * ongoing, in which case, ->qpair can still be NULL.
1306 			 */
1307 			if (fio_qpair->qpair == NULL) {
1308 				fio_qpair = TAILQ_NEXT(fio_qpair, link);
1309 				continue;
1310 			}
1311 
1312 			spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);
1313 
1314 			if (fio_thread->iocq_count >= min) {
1315 				/* reset the current handling qpair */
1316 				fio_thread->fio_qpair_current = fio_qpair;
1317 				return fio_thread->iocq_count;
1318 			}
1319 
1320 			fio_qpair = TAILQ_NEXT(fio_qpair, link);
1321 		}
1322 
1323 		if (t) {
1324 			uint64_t elapse;
1325 
1326 			clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
1327 			elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
1328 				 + t1.tv_nsec - t0.tv_nsec;
1329 			if (elapse > timeout) {
1330 				break;
1331 			}
1332 		}
1333 	}
1334 
1335 	/* reset the current handling qpair */
1336 	fio_thread->fio_qpair_current = fio_qpair;
1337 	return fio_thread->iocq_count;
1338 }
1339 
1340 static int
1341 spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
1342 {
1343 	/* TODO: This should probably send a flush to the device, but for now just return successful. */
1344 	return 0;
1345 }
1346 
1347 #if FIO_HAS_ZBD
1348 static int
1349 spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model)
1350 {
1351 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1352 	struct spdk_fio_qpair *fio_qpair = NULL;
1353 	const struct spdk_nvme_zns_ns_data *zns_data = NULL;
1354 
1355 	if (f->filetype != FIO_TYPE_BLOCK) {
1356 		log_info("spdk/nvme: unsupported filetype: %d\n", f->filetype);
1357 		return -EINVAL;
1358 	}
1359 
1360 	fio_qpair = get_fio_qpair(fio_thread, f);
1361 	if (!fio_qpair) {
1362 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1363 		return -ENODEV;
1364 	}
1365 
1366 	switch (spdk_nvme_ns_get_csi(fio_qpair->ns)) {
1367 	case SPDK_NVME_CSI_NVM:
1368 		*model = ZBD_NONE;
1369 		return 0;
1370 
1371 	case SPDK_NVME_CSI_KV:
1372 		log_err("spdk/nvme: KV namespace is currently not supported\n");
1373 		return -ENOSYS;
1374 
1375 	case SPDK_NVME_CSI_ZNS:
1376 		zns_data = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1377 		if (!zns_data) {
1378 			log_err("spdk/nvme: file_name: '%s', ZNS is not enabled\n", f->file_name);
1379 			return -EINVAL;
1380 		}
1381 
1382 		*model = ZBD_HOST_MANAGED;
1383 
1384 		return 0;
1385 	}
1386 
1387 	return -EINVAL;
1388 }
1389 
1390 static int
1391 spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
1392 		      struct zbd_zone *zbdz, unsigned int nr_zones)
1393 {
1394 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1395 	struct spdk_fio_qpair *fio_qpair = NULL;
1396 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1397 	struct spdk_nvme_zns_zone_report *report;
1398 	struct spdk_nvme_qpair *tmp_qpair;
1399 	uint32_t report_nzones = 0, report_nzones_max, report_nbytes, mdts_nbytes;
1400 	uint64_t zsze_nbytes, ns_nzones, lba_nbytes;
1401 	int completed = 0, err;
1402 
1403 	fio_qpair = get_fio_qpair(fio_thread, f);
1404 	if (!fio_qpair) {
1405 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1406 		return -ENODEV;
1407 	}
1408 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1409 	if (!zns) {
1410 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1411 		return -EINVAL;
1412 	}
1413 
1414 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1415 	 * Create a temporary qpair in order to perform report zones.
1416 	 */
1417 	assert(!fio_qpair->qpair);
1418 
1419 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1420 	if (!tmp_qpair) {
1421 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1422 		return -EIO;
1423 	}
1424 
1425 	/** Retrieve device parameters */
1426 	mdts_nbytes = spdk_nvme_ns_get_max_io_xfer_size(fio_qpair->ns);
1427 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1428 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1429 	ns_nzones = spdk_nvme_zns_ns_get_num_zones(fio_qpair->ns);
1430 
1431 	/** Allocate a report buffer bounded by the MDTS, the caller's zbdz storage (nr_zones), and the zone count of the namespace */
1432 	report_nzones_max = (mdts_nbytes - sizeof(*report)) / sizeof(report->descs[0]);
1433 	report_nzones_max = spdk_min(spdk_min(report_nzones_max, nr_zones), ns_nzones);
1434 	report_nbytes = sizeof(report->descs[0]) * report_nzones_max + sizeof(*report);
1435 	report = calloc(1, report_nbytes);
1436 	if (!report) {
1437 		log_err("spdk/nvme: failed report_zones(): ENOMEM\n");
1438 		err = -ENOMEM;
1439 		goto exit;
1440 	}
1441 
1442 	err = spdk_nvme_zns_report_zones(fio_qpair->ns, tmp_qpair, report, report_nbytes,
1443 					 offset / lba_nbytes, SPDK_NVME_ZRA_LIST_ALL, true, pcu_cb,
1444 					 &completed);
1445 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1446 		log_err("spdk/nvme: report_zones(): err: %d, cpl: %d\n", err, completed);
1447 		err = err ? err : -EIO;
1448 		goto exit;
1449 	}
1450 	assert(report->nr_zones <= report_nzones_max);
1451 	report_nzones = report->nr_zones;
1452 
1453 	for (uint64_t idx = 0; idx < report->nr_zones; ++idx) {
1454 		struct spdk_nvme_zns_zone_desc *zdesc = &report->descs[idx];
1455 
1456 		zbdz[idx].start = zdesc->zslba * lba_nbytes;
1457 		zbdz[idx].len = zsze_nbytes;
1458 		zbdz[idx].capacity = zdesc->zcap * lba_nbytes;
1459 		zbdz[idx].wp = zdesc->wp * lba_nbytes;
1460 
1461 		switch (zdesc->zt) {
1462 		case SPDK_NVME_ZONE_TYPE_SEQWR:
1463 			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
1464 			break;
1465 
1466 		default:
1467 			log_err("spdk/nvme: %s: inv. zone-type: 0x%x\n", f->file_name, zdesc->zt);
1468 			err = -EIO;
1469 			goto exit;
1470 		}
1471 
1472 		switch (zdesc->zs) {
1473 		case SPDK_NVME_ZONE_STATE_EMPTY:
1474 			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
1475 			break;
1476 		case SPDK_NVME_ZONE_STATE_IOPEN:
1477 			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
1478 			break;
1479 		case SPDK_NVME_ZONE_STATE_EOPEN:
1480 			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
1481 			break;
1482 		case SPDK_NVME_ZONE_STATE_CLOSED:
1483 			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
1484 			break;
1485 		case SPDK_NVME_ZONE_STATE_RONLY:
1486 			zbdz[idx].cond = ZBD_ZONE_COND_READONLY;
1487 			break;
1488 		case SPDK_NVME_ZONE_STATE_FULL:
1489 			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
1490 			break;
1491 		case SPDK_NVME_ZONE_STATE_OFFLINE:
1492 			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
1493 			break;
1494 
1495 		default:
1496 			log_err("spdk/nvme: %s: inv. zone-state: 0x%x\n", f->file_name, zdesc->zs);
1497 			err = -EIO;
1498 			goto exit;
1499 		}
1500 	}
1501 
1502 exit:
1503 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1504 	free(report);
1505 
1506 	return err ? err : (int)report_nzones;
1507 }
1508 
1509 static int
1510 spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length)
1511 {
1512 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1513 	struct spdk_fio_qpair *fio_qpair = NULL;
1514 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1515 	uint64_t zsze_nbytes, lba_nbytes;
1516 	int err = 0;
1517 
1518 	fio_qpair = get_fio_qpair(fio_thread, f);
1519 	if (!fio_qpair) {
1520 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1521 		return -ENODEV;
1522 	}
1523 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1524 	if (!zns) {
1525 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1526 		return -EINVAL;
1527 	}
1528 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1529 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1530 
1531 	/** check the assumption that offset is valid zone-start lba */
1532 	if (offset % zsze_nbytes) {
1533 		log_err("spdk/nvme: offset: %" PRIu64 " is not a valid zslba\n", offset);
1534 		return -EINVAL;
1535 	}
1536 
1537 	for (uint64_t cur = offset; cur < offset + length; cur += zsze_nbytes) {
1538 		int completed = 0;
1539 
1540 		err = spdk_nvme_zns_reset_zone(fio_qpair->ns, fio_qpair->qpair, cur / lba_nbytes,
1541 					       false, pcu_cb, &completed);
1542 		if (err || pcu(fio_qpair->qpair, &completed) || completed < 0) {
1543 			log_err("spdk/nvme: zns_reset_zone(): err: %d, cpl: %d\n", err, completed);
1544 			err = err ? err : -EIO;
1545 			break;
1546 		}
1547 	}
1548 
1549 	return err;
1550 }
1551 #endif
1552 
1553 #if FIO_IOOPS_VERSION >= 30
1554 static int
1555 spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
1556 			    unsigned int *max_open_zones)
1557 {
1558 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1559 	struct spdk_fio_qpair *fio_qpair = NULL;
1560 
1561 	fio_qpair = get_fio_qpair(fio_thread, f);
1562 	if (!fio_qpair) {
1563 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1564 		return -ENODEV;
1565 	}
1566 
1567 	*max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(fio_qpair->ns);
1568 
1569 	return 0;
1570 }
1571 #endif
1572 
1573 #if FIO_HAS_FDP
1574 static int
1575 spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1576 			struct fio_ruhs_info *fruhs_info)
1577 {
1578 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1579 	struct spdk_fio_qpair *fio_qpair = NULL;
1580 	struct spdk_nvme_qpair *tmp_qpair;
1581 	struct {
1582 		struct spdk_nvme_fdp_ruhs ruhs;
1583 		struct spdk_nvme_fdp_ruhs_desc desc[128];
1584 	} fdp_ruhs;
1585 	uint16_t idx;
1586 	int completed = 0, err;
1587 
1588 	fio_qpair = get_fio_qpair(fio_thread, f);
1589 	if (!fio_qpair) {
1590 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1591 		return -ENODEV;
1592 	}
1593 
1594 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1595 	 * Create a temporary qpair in order to fetch the reclaim unit handle status.
1596 	 */
1597 	assert(!fio_qpair->qpair);
1598 
1599 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1600 	if (!tmp_qpair) {
1601 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1602 		return -EIO;
1603 	}
1604 
1605 	err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, &fdp_ruhs, sizeof(fdp_ruhs),
1606 					    SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
1607 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1608 		log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
1609 		err = err ? err : -EIO;
1610 		goto exit;
1611 	}
1612 
1613 	fruhs_info->nr_ruhs = fdp_ruhs.ruhs.nruhsd;
1614 	for (idx = 0; idx < fdp_ruhs.ruhs.nruhsd; idx++) {
1615 		fruhs_info->plis[idx] = fdp_ruhs.desc[idx].pid;
1616 	}
1617 
1618 exit:
1619 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1620 
1621 	return err;
1622 }
1623 #endif
1624 
1625 static void
1626 spdk_fio_cleanup(struct thread_data *td)
1627 {
1628 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1629 	struct spdk_fio_qpair	*fio_qpair, *fio_qpair_tmp;
1630 	struct spdk_fio_options *fio_options = td->eo;
1631 
1632 	if (fio_options->spdk_tracing) {
1633 		spdk_trace_unregister_user_thread();
1634 	}
1635 
1636 	TAILQ_FOREACH_SAFE(fio_qpair, &fio_thread->fio_qpair, link, fio_qpair_tmp) {
1637 		TAILQ_REMOVE(&fio_thread->fio_qpair, fio_qpair, link);
1638 		free(fio_qpair);
1639 	}
1640 
1641 	free(fio_thread->iocq);
1642 	free(fio_thread);
1643 
1644 	pthread_mutex_lock(&g_mutex);
1645 	g_td_count--;
1646 	if (g_td_count == 0) {
1647 		struct spdk_fio_ctrlr	*fio_ctrlr, *fio_ctrlr_tmp;
1648 		struct spdk_nvme_detach_ctx	*detach_ctx = NULL;
1649 
1650 		TAILQ_FOREACH_SAFE(fio_ctrlr, &g_ctrlrs, link, fio_ctrlr_tmp) {
1651 			TAILQ_REMOVE(&g_ctrlrs, fio_ctrlr, link);
1652 			spdk_nvme_detach_async(fio_ctrlr->ctrlr, &detach_ctx);
1653 			free(fio_ctrlr);
1654 		}
1655 
1656 		if (detach_ctx) {
1657 			spdk_nvme_detach_poll(detach_ctx);
1658 		}
1659 
1660 		if (fio_options->enable_vmd) {
1661 			spdk_vmd_fini();
1662 		}
1663 	}
1664 	pthread_mutex_unlock(&g_mutex);
1665 	if (TAILQ_EMPTY(&g_ctrlrs)) {
1666 		if (pthread_cancel(g_ctrlr_thread_id) == 0) {
1667 			pthread_join(g_ctrlr_thread_id, NULL);
1668 		}
1669 	}
1670 }
1671 
1672 /* This table enables the addition of SPDK parameters to the fio config.
1673  * Add new parameters by defining them here and defining a callback
1674  * function to read the parameter value. */
1675 static struct fio_option options[] = {
1676 	{
1677 		.name           = "enable_wrr",
1678 		.lname          = "Enable weighted round robin (WRR) for IO submission queues",
1679 		.type           = FIO_OPT_INT,
1680 		.off1           = offsetof(struct spdk_fio_options, enable_wrr),
1681 		.def            = "0",
1682 		.help           = "Enable weighted round robin (WRR) for IO submission queues",
1683 		.category       = FIO_OPT_C_ENGINE,
1684 		.group          = FIO_OPT_G_INVALID,
1685 	},
1686 	{
1687 		.name           = "arbitration_burst",
1688 		.lname          = "Arbitration Burst",
1689 		.type           = FIO_OPT_INT,
1690 		.off1           = offsetof(struct spdk_fio_options, arbitration_burst),
1691 		.def            = "0",
1692 		.help           = "Arbitration Burst used for WRR (valid range from 0 - 7)",
1693 		.category       = FIO_OPT_C_ENGINE,
1694 		.group          = FIO_OPT_G_INVALID,
1695 	},
1696 	{
1697 		.name           = "low_weight",
1698 		.lname          = "low_weight for WRR",
1699 		.type           = FIO_OPT_INT,
1700 		.off1           = offsetof(struct spdk_fio_options, low_weight),
1701 		.def            = "0",
1702 		.help           = "low_weight used for WRR (valid range from 0 - 255)",
1703 		.category       = FIO_OPT_C_ENGINE,
1704 		.group          = FIO_OPT_G_INVALID,
1705 	},
1706 	{
1707 		.name           = "medium_weight",
1708 		.lname          = "medium_weight for WRR",
1709 		.type           = FIO_OPT_INT,
1710 		.off1           = offsetof(struct spdk_fio_options, medium_weight),
1711 		.def            = "0",
1712 		.help           = "medium weight used for WRR (valid range from 0 - 255)",
1713 		.category       = FIO_OPT_C_ENGINE,
1714 		.group          = FIO_OPT_G_INVALID,
1715 	},
1716 	{
1717 		.name           = "high_weight",
1718 		.lname          = "high_weight for WRR",
1719 		.type           = FIO_OPT_INT,
1720 		.off1           = offsetof(struct spdk_fio_options, high_weight),
1721 		.def            = "0",
1722 		.help           = "high weight used for WRR (valid range from 0 - 255)",
1723 		.category       = FIO_OPT_C_ENGINE,
1724 		.group          = FIO_OPT_G_INVALID,
1725 	},
1726 	{
1727 		.name           = "wrr_priority",
1728 		.lname          = "priority used for WRR",
1729 		.type           = FIO_OPT_INT,
1730 		.off1           = offsetof(struct spdk_fio_options, wrr_priority),
1731 		.def            = "0",
1732 		.help           = "priority used for WRR (valid range from 0-3)",
1733 		.category       = FIO_OPT_C_ENGINE,
1734 		.group          = FIO_OPT_G_INVALID,
1735 	},
1736 	{
1737 		.name		= "mem_size_mb",
1738 		.lname		= "Memory size in MB",
1739 		.type		= FIO_OPT_INT,
1740 		.off1		= offsetof(struct spdk_fio_options, mem_size),
1741 		.def		= "0",
1742 		.help		= "Memory Size for SPDK (MB)",
1743 		.category	= FIO_OPT_C_ENGINE,
1744 		.group		= FIO_OPT_G_INVALID,
1745 	},
1746 	{
1747 		.name		= "shm_id",
1748 		.lname		= "shared memory ID",
1749 		.type		= FIO_OPT_INT,
1750 		.off1		= offsetof(struct spdk_fio_options, shm_id),
1751 		.def		= "-1",
1752 		.help		= "Shared Memory ID",
1753 		.category	= FIO_OPT_C_ENGINE,
1754 		.group		= FIO_OPT_G_INVALID,
1755 	},
1756 	{
1757 		.name		= "enable_sgl",
1758 		.lname		= "SGL used for I/O commands",
1759 		.type		= FIO_OPT_INT,
1760 		.off1		= offsetof(struct spdk_fio_options, enable_sgl),
1761 		.def		= "0",
1762 		.help		= "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
1763 		.category	= FIO_OPT_C_ENGINE,
1764 		.group		= FIO_OPT_G_INVALID,
1765 	},
1766 	{
1767 		.name		= "sge_size",
1768 		.lname		= "SGL size used for I/O commands",
1769 		.type		= FIO_OPT_INT,
1770 		.off1		= offsetof(struct spdk_fio_options, sge_size),
1771 		.def		= "4096",
1772 		.help		= "SGL size in bytes for I/O Commands (default 4096)",
1773 		.category	= FIO_OPT_C_ENGINE,
1774 		.group		= FIO_OPT_G_INVALID,
1775 	},
1776 	{
1777 		.name		= "bit_bucket_data_len",
1778 		.lname		= "Amount of data used for Bit Bucket",
1779 		.type		= FIO_OPT_INT,
1780 		.off1		= offsetof(struct spdk_fio_options, bit_bucket_data_len),
1781 		.def		= "0",
1782 		.help		= "Bit Bucket Data Length for READ commands (disabled by default)",
1783 		.category	= FIO_OPT_C_ENGINE,
1784 		.group		= FIO_OPT_G_INVALID,
1785 	},
1786 	{
1787 		.name		= "hostnqn",
		.lname		= "Host NQN to use when connecting to controllers",
1789 		.type		= FIO_OPT_STR_STORE,
1790 		.off1		= offsetof(struct spdk_fio_options, hostnqn),
1791 		.help		= "Host NQN",
1792 		.category	= FIO_OPT_C_ENGINE,
1793 		.group		= FIO_OPT_G_INVALID,
1794 	},
1795 	{
1796 		.name		= "pi_act",
1797 		.lname		= "Protection Information Action",
1798 		.type		= FIO_OPT_INT,
1799 		.off1		= offsetof(struct spdk_fio_options, pi_act),
1800 		.def		= "1",
1801 		.help		= "Protection Information Action bit (pi_act=1 or pi_act=0)",
1802 		.category	= FIO_OPT_C_ENGINE,
1803 		.group		= FIO_OPT_G_INVALID,
1804 	},
1805 	{
1806 		.name		= "pi_chk",
		.lname		= "Protection Information Check (GUARD|REFTAG|APPTAG)",
1808 		.type		= FIO_OPT_STR_STORE,
1809 		.off1		= offsetof(struct spdk_fio_options, pi_chk),
1810 		.def		= NULL,
1811 		.help		= "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
1812 		.category	= FIO_OPT_C_ENGINE,
1813 		.group		= FIO_OPT_G_INVALID,
1814 	},
1815 	{
1816 		.name		= "md_per_io_size",
1817 		.lname		= "Separate Metadata Buffer Size per I/O",
1818 		.type		= FIO_OPT_INT,
1819 		.off1		= offsetof(struct spdk_fio_options, md_per_io_size),
1820 		.def		= "4096",
1821 		.help		= "Size of separate metadata buffer per I/O (Default: 4096)",
1822 		.category	= FIO_OPT_C_ENGINE,
1823 		.group		= FIO_OPT_G_INVALID,
1824 	},
1825 	{
1826 		.name		= "apptag",
1827 		.lname		= "Application Tag used in Protection Information",
1828 		.type		= FIO_OPT_INT,
1829 		.off1		= offsetof(struct spdk_fio_options, apptag),
1830 		.def		= "0x1234",
1831 		.help		= "Application Tag used in Protection Information field (Default: 0x1234)",
1832 		.category	= FIO_OPT_C_ENGINE,
1833 		.group		= FIO_OPT_G_INVALID,
1834 	},
1835 	{
1836 		.name		= "apptag_mask",
1837 		.lname		= "Application Tag Mask",
1838 		.type		= FIO_OPT_INT,
1839 		.off1		= offsetof(struct spdk_fio_options, apptag_mask),
1840 		.def		= "0xffff",
1841 		.help		= "Application Tag Mask used with Application Tag (Default: 0xffff)",
1842 		.category	= FIO_OPT_C_ENGINE,
1843 		.group		= FIO_OPT_G_INVALID,
1844 	},
1845 	{
1846 		.name		= "digest_enable",
		.lname		= "PDU digest choice for NVMe/TCP Transport (NONE|HEADER|DATA|BOTH)",
1848 		.type		= FIO_OPT_STR_STORE,
1849 		.off1		= offsetof(struct spdk_fio_options, digest_enable),
1850 		.def		= NULL,
		.help		= "Control of PDU digest for the NVMe/TCP transport (digest_enable=NONE|HEADER|DATA|BOTH)",
1852 		.category	= FIO_OPT_C_ENGINE,
1853 		.group		= FIO_OPT_G_INVALID,
1854 	},
1855 	{
1856 		.name		= "enable_vmd",
1857 		.lname		= "Enable VMD enumeration",
1858 		.type		= FIO_OPT_INT,
1859 		.off1		= offsetof(struct spdk_fio_options, enable_vmd),
1860 		.def		= "0",
1861 		.help		= "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
1862 		.category	= FIO_OPT_C_ENGINE,
1863 		.group		= FIO_OPT_G_INVALID,
1864 	},
1865 	{
1866 		.name		= "initial_zone_reset",
1867 		.lname		= "Reset Zones on initialization",
1868 		.type		= FIO_OPT_INT,
1869 		.off1		= offsetof(struct spdk_fio_options, initial_zone_reset),
1870 		.def		= "0",
		.help		= "Reset zones on initialization (0=disable, 1=reset all zones)",
1872 		.category	= FIO_OPT_C_ENGINE,
1873 		.group		= FIO_OPT_G_INVALID,
1874 	},
1875 	{
1876 		.name		= "zone_append",
1877 		.lname		= "Use zone append instead of write",
1878 		.type		= FIO_OPT_INT,
1879 		.off1		= offsetof(struct spdk_fio_options, zone_append),
1880 		.def		= "0",
1881 		.help		= "Use zone append instead of write (1=zone append, 0=write)",
1882 		.category	= FIO_OPT_C_ENGINE,
1883 		.group		= FIO_OPT_G_INVALID,
1884 	},
1885 	{
1886 		.name		= "print_qid_mappings",
1887 		.lname		= "Print job-to-qid mappings",
1888 		.type		= FIO_OPT_INT,
1889 		.off1		= offsetof(struct spdk_fio_options, print_qid_mappings),
1890 		.def		= "0",
1891 		.help		= "Print job-to-qid mappings (0=disable, 1=enable)",
1892 		.category	= FIO_OPT_C_ENGINE,
1893 		.group		= FIO_OPT_G_INVALID,
1894 	},
1895 	{
1896 		.name		= "log_flags",
		.lname		= "Log flags",
1898 		.type		= FIO_OPT_STR_STORE,
1899 		.off1		= offsetof(struct spdk_fio_options, log_flags),
1900 		.help		= "Enable log flags (comma-separated list)",
1901 		.category	= FIO_OPT_C_ENGINE,
1902 		.group		= FIO_OPT_G_INVALID,
1903 	},
1904 	{
1905 		.name		= "spdk_tracing",
1906 		.lname		= "Enable SPDK Tracing",
1907 		.type		= FIO_OPT_INT,
1908 		.off1		= offsetof(struct spdk_fio_options, spdk_tracing),
1909 		.def		= "0",
1910 		.help		= "SPDK Tracing (0=disable, 1=enable)",
1911 		.category	= FIO_OPT_C_ENGINE,
1912 		.group		= FIO_OPT_G_INVALID,
1913 	},
1914 	{
1915 		.name		= NULL,
1916 	},
1917 };
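
/* Example job file (illustrative values; see the SPDK fio plugin README for
 * authoritative usage). The engine options above are set like any other fio
 * parameter:
 *
 *	[global]
 *	ioengine=spdk
 *	thread=1
 *	direct=1
 *	rw=randread
 *	bs=4k
 *	iodepth=32
 *	enable_sgl=1
 *	sge_size=4096
 *
 *	[job0]
 *	filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 */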
1918 
1919 /* FIO imports this structure using dlsym */
1920 struct ioengine_ops ioengine = {
1921 	.name			= "spdk",
1922 	.version		= FIO_IOOPS_VERSION,
1923 	.queue			= spdk_fio_queue,
1924 	.getevents		= spdk_fio_getevents,
1925 	.event			= spdk_fio_event,
1926 	.cleanup		= spdk_fio_cleanup,
1927 	.open_file		= spdk_fio_open,
1928 	.close_file		= spdk_fio_close,
1929 	.invalidate		= spdk_fio_invalidate,
1930 	.iomem_alloc		= spdk_fio_iomem_alloc,
1931 	.iomem_free		= spdk_fio_iomem_free,
1932 	.setup			= spdk_fio_setup,
1933 	.init			= spdk_fio_init,
1934 	.io_u_init		= spdk_fio_io_u_init,
1935 	.io_u_free		= spdk_fio_io_u_free,
1936 #if FIO_HAS_ZBD
1937 	.get_zoned_model	= spdk_fio_get_zoned_model,
1938 	.report_zones		= spdk_fio_report_zones,
1939 	.reset_wp		= spdk_fio_reset_wp,
1940 #endif
1941 #if FIO_IOOPS_VERSION >= 30
1942 	.get_max_open_zones	= spdk_fio_get_max_open_zones,
1943 #endif
1944 #if FIO_HAS_FDP
1945 	.fdp_fetch_ruhs		= spdk_fio_fdp_fetch_ruhs,
1946 #endif
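	/* FIO_MULTI_RANGE_TRIM only exists on newer fio releases
	 * (FIO_IOOPS_VERSION >= 34), hence the FIO_HAS_MRT gate. */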
1947 #if FIO_HAS_MRT
1948 	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO | FIO_MULTI_RANGE_TRIM,
1949 #else
1950 	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
1951 #endif
1952 	.options		= options,
1953 	.option_struct_size	= sizeof(struct spdk_fio_options),
1954 };
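
/* Loading sketch (illustrative): fio dlopen()s an external engine and looks
 * up the "ioengine" symbol above with dlsym(). With this plugin the usual
 * invocation preloads the library so the constructor below can also register
 * the engine by name, e.g.:
 *
 *	LD_PRELOAD=<path to spdk>/build/fio/spdk_nvme fio my_job.fio
 */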
1955 
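/* fio_init and fio_exit expand to constructor/destructor attributes in fio's
 * headers, so these hooks run automatically at plugin load/unload. */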
1956 static void fio_init
1957 fio_spdk_register(void)
1958 {
1959 	register_ioengine(&ioengine);
1960 }
1961 
1962 static void fio_exit
1963 fio_spdk_unregister(void)
1964 {
1965 	if (g_spdk_env_initialized) {
1966 		spdk_trace_cleanup();
1967 		spdk_env_fini();
1968 	}
1969 
1970 	unregister_ioengine(&ioengine);
1971 }
1972 
1973 SPDK_LOG_REGISTER_COMPONENT(fio_nvme)
1974