/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 */
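
/*
 * fio plugin for the SPDK userspace NVMe driver. Typical usage (illustrative;
 * the plugin path and the PCIe address are assumptions) is to preload the
 * engine and name controllers by transport ID:
 *
 *   LD_PRELOAD=<spdk>/build/fio/spdk_nvme fio --ioengine=spdk --thread=1 \
 *       --filename="trtype=PCIe traddr=0000.04.00.0 ns=1" ...
 *
 * thread=1 is mandatory; spdk_fio_setup() rejects jobs without it.
 */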

#include "spdk/stdinc.h"

#include "spdk/nvme.h"
#include "spdk/nvme_zns.h"
#include "spdk/vmd.h"
#include "spdk/env.h"
#include "spdk/string.h"
#include "spdk/log.h"
#include "spdk/likely.h"
#include "spdk/endian.h"
#include "spdk/dif.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "config-host.h"
#include "fio.h"
#include "optgroup.h"

#ifdef for_each_rw_ddir
#define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
#define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 35)
#define FIO_HAS_MRT (FIO_IOOPS_VERSION >= 34)
#else
#define FIO_HAS_ZBD (0)
#define FIO_HAS_FDP (0)
#define FIO_HAS_MRT (0)
#endif

/* FreeBSD lacks CLOCK_MONOTONIC_RAW, so provide an alternative. */
#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif

#define NVME_IO_ALIGN		4096

static bool g_spdk_env_initialized;
static bool g_log_flag_error;
static int g_spdk_enable_sgl = 0;
static uint32_t g_spdk_sge_size = 4096;
static uint32_t g_spdk_bit_bucket_data_len = 0;
static uint32_t g_spdk_pract_flag;
static uint32_t g_spdk_prchk_flags;
static uint32_t g_spdk_md_per_io_size = 4096;
static uint16_t g_spdk_apptag;
static uint16_t g_spdk_apptag_mask;

struct spdk_fio_options {
	void	*pad;	/* padding so that the off1 values in the option definitions below are never 0 */
	int	enable_wrr;
	int	arbitration_burst;
	int	low_weight;
	int	medium_weight;
	int	high_weight;
	int	wrr_priority;
	int	mem_size;
	int	shm_id;
	int	enable_sgl;
	int	sge_size;
	int	bit_bucket_data_len;
	char	*hostnqn;
	int	pi_act;
	char	*pi_chk;
	int	md_per_io_size;
	int	apptag;
	int	apptag_mask;
	char	*digest_enable;
	int	enable_vmd;
	int	initial_zone_reset;
	int	zone_append;
	int	print_qid_mappings;
	int	spdk_tracing;
	char	*log_flags;
	int	disable_pcie_sgl_merge;
};

struct spdk_fio_request {
	struct io_u		*io;
	/** Offset in current iovec, fio only uses 1 vector */
	uint32_t		iov_offset;

	/** Amount of data used for Bit Bucket SGL */
	uint32_t		bit_bucket_data_len;

	/** Context for NVMe PI */
	struct spdk_dif_ctx	dif_ctx;
	/** Separate metadata buffer pointer */
	void			*md_buf;

	/** Dataset management range information */
	struct spdk_nvme_dsm_range *dsm_range;

	struct spdk_fio_thread	*fio_thread;
	struct spdk_fio_qpair	*fio_qpair;
};

struct spdk_fio_ctrlr {
	struct spdk_nvme_transport_id	tr_id;
	struct spdk_nvme_ctrlr_opts	opts;
	struct spdk_nvme_ctrlr		*ctrlr;
	TAILQ_ENTRY(spdk_fio_ctrlr)	link;
};

static TAILQ_HEAD(, spdk_fio_ctrlr) g_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ctrlrs);
static int g_td_count;
static pthread_t g_ctrlr_thread_id = 0;
static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool g_error;

struct spdk_fio_qpair {
	struct fio_file			*f;
	struct spdk_nvme_qpair		*qpair;
	struct spdk_nvme_ns		*ns;
	uint32_t			io_flags;
	bool				zone_append_enabled;
	bool				nvme_pi_enabled;
	/* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
	bool				extended_lba;
	/* True for protection info transferred at start of metadata,
	 * false for protection info transferred at end of metadata, and
	 * this is valid only if nvme_pi_enabled is true.
	 */
	bool				md_start;
	TAILQ_ENTRY(spdk_fio_qpair)	link;
	struct spdk_fio_ctrlr		*fio_ctrlr;
};

struct spdk_fio_thread {
	struct thread_data		*td;

	TAILQ_HEAD(, spdk_fio_qpair)	fio_qpair;
	struct spdk_fio_qpair		*fio_qpair_current;	/* the current fio_qpair to be handled. */

	struct io_u			**iocq;		/* io completion queue */
	unsigned int			iocq_count;	/* number of iocq entries filled by last getevents */
	unsigned int			iocq_size;	/* number of iocq entries allocated */

};

struct spdk_fio_probe_ctx {
	struct thread_data	*td;
	char			hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
	struct fio_file		*f; /* fio_file given by user */
};

static void *
spdk_fio_poll_ctrlrs(void *arg)
{
	struct spdk_fio_ctrlr *fio_ctrlr;
	int oldstate;
	int rc;

	/* Loop until the thread is cancelled */
	while (true) {
		rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to set cancel state disabled on the ctrlr poll thread (%d): %s\n",
				    rc, spdk_strerror(rc));
		}

		pthread_mutex_lock(&g_mutex);

		TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
			spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
		}

		pthread_mutex_unlock(&g_mutex);

		rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to set cancel state enabled on the ctrlr poll thread (%d): %s\n",
				    rc, spdk_strerror(rc));
		}

		/* This is a pthread cancellation point and cannot be removed. */
		sleep(1);
	}

	return NULL;
}

static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_fio_probe_ctx *ctx = cb_ctx;
	struct thread_data *td = ctx->td;
	struct spdk_fio_options *fio_options = td->eo;

	if (ctx->hostnqn[0] != '\0') {
		memcpy(opts->hostnqn, ctx->hostnqn, sizeof(opts->hostnqn));
	} else if (fio_options->hostnqn) {
		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
	}

	if (fio_options->enable_wrr) {
		opts->arb_mechanism		= SPDK_NVME_CC_AMS_WRR;
		opts->arbitration_burst		= fio_options->arbitration_burst;
		opts->low_priority_weight	= fio_options->low_weight;
		opts->medium_priority_weight	= fio_options->medium_weight;
		opts->high_priority_weight	= fio_options->high_weight;
	}

	if (fio_options->digest_enable) {
		if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
			opts->header_digest = true;
		} else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
			opts->data_digest = true;
		} else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
			opts->header_digest = true;
			opts->data_digest = true;
		}
	}

	return true;
}

static struct spdk_fio_ctrlr *
get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
{
	struct spdk_fio_ctrlr	*fio_ctrlr;

	TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
		if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
			return fio_ctrlr;
		}
	}

	return NULL;
}

/**
 * Returns the fio_qpair that matches the given fio_file and has an associated namespace.
 */
static struct spdk_fio_qpair *
get_fio_qpair(struct spdk_fio_thread *fio_thread, struct fio_file *f)
{
	struct spdk_fio_qpair	*fio_qpair;

	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
		if ((fio_qpair->f == f) && fio_qpair->ns) {
			return fio_qpair;
		}
	}

	return NULL;
}

#if FIO_HAS_ZBD
/**
 * Completion callback that marks the given indicator non-zero: 1 on success, -1 on error.
 */
static void
pcu_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	int *completed = ctx;

	*completed = spdk_nvme_cpl_is_error(cpl) ? -1 : 1;
}

/**
 * Process completions until the given 'completed' indicator turns non-zero or an error occurs.
 */
static int32_t
pcu(struct spdk_nvme_qpair *qpair, int *completed)
{
	int32_t ret;

	while (!*completed) {
		ret = spdk_nvme_qpair_process_completions(qpair, 1);
		if (ret < 0) {
			log_err("spdk/nvme: process_compl(): ret: %d\n", ret);
			return ret;
		}
	}

	return 0;
}
#endif

static inline uint32_t
_nvme_get_host_buffer_sector_size(struct spdk_nvme_ns *ns, uint32_t io_flags)
{
	bool md_excluded_from_xfer = false;
	uint32_t md_size;
	uint32_t ns_flags;

	ns_flags = spdk_nvme_ns_get_flags(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);

	/* For the extended LBA format, if the metadata size is 8 bytes and PRACT
	 * is enabled (i.e. the controller inserts/strips PI), the metadata is not
	 * transferred to/from the host, so exclude it from the host buffer's
	 * sector size.
	 */
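	/* Worked example (format assumed for illustration): a 512 + 8 extended-LBA,
	 * PI-enabled namespace moves 520 bytes per block to/from the host normally,
	 * but only 512 bytes per block once PRACT is set.
	 */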
	md_excluded_from_xfer = ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
				 (ns_flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
				 (ns_flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
				 (md_size == 8));

	return md_excluded_from_xfer ? spdk_nvme_ns_get_sector_size(ns) :
	       spdk_nvme_ns_get_extended_sector_size(ns);
}

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_fio_probe_ctx *ctx = cb_ctx;
	struct thread_data	*td = ctx->td;
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_ctrlr	*fio_ctrlr;
	struct spdk_fio_qpair	*fio_qpair;
	struct spdk_nvme_ns	*ns;
	const struct spdk_nvme_ns_data	*nsdata;
	struct fio_file		*f = ctx->f;
	uint32_t		ns_id;
	char			*p;
	long int		tmp;
	uint32_t		block_size;
	struct spdk_fio_options *fio_options = td->eo;

	p = strstr(f->file_name, "ns=");
	if (p != NULL) {
		tmp = spdk_strtol(p + 3, 10);
		if (tmp <= 0) {
			SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp);
			g_error = true;
			return;
		}
		ns_id = (uint32_t)tmp;
	} else {
		ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
		if (ns_id == 0) {
			/* The ctrlr has no active namespaces and we didn't specify any so nothing to do. */
			return;
		}
	}

	pthread_mutex_lock(&g_mutex);
	fio_ctrlr = get_fio_ctrlr(trid);
	/* it is a new ctrlr and needs to be added */
	if (!fio_ctrlr) {
		/* Create an fio_ctrlr and add it to the list */
		fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
		if (!fio_ctrlr) {
			SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
			g_error = true;
			pthread_mutex_unlock(&g_mutex);
			return;
		}
		fio_ctrlr->opts = *opts;
		fio_ctrlr->ctrlr = ctrlr;
		fio_ctrlr->tr_id = *trid;
		TAILQ_INSERT_TAIL(&g_ctrlrs, fio_ctrlr, link);
	}
	pthread_mutex_unlock(&g_mutex);

	ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
	if (ns == NULL) {
		SPDK_ERRLOG("Cannot get namespace with ns_id=%d\n", ns_id);
		g_error = true;
		return;
	}

	if (!spdk_nvme_ns_is_active(ns)) {
		SPDK_ERRLOG("Namespace with ns_id=%d is inactive\n", ns_id);
		g_error = true;
		return;
	}
	nsdata = spdk_nvme_ns_get_data(ns);

	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
		if ((fio_qpair->f == f) ||
		    ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
		     (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
			/* Not an error; just avoid a duplicate connection. */
			return;
		}
	}

	/* create a new qpair */
	fio_qpair = calloc(1, sizeof(*fio_qpair));
	if (!fio_qpair) {
		g_error = true;
		SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
		return;
	}

	f->engine_data = fio_qpair;
	fio_qpair->ns = ns;
	fio_qpair->f = f;
	fio_qpair->fio_ctrlr = fio_ctrlr;
	TAILQ_INSERT_TAIL(&fio_thread->fio_qpair, fio_qpair, link);

	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
		assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
		fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
		fio_qpair->nvme_pi_enabled = true;
		fio_qpair->md_start = nsdata->dps.md_start;
		fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
		fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
			fio_qpair->extended_lba ? "extended lba" : "separate metadata");
	}

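	/* The fio block-size options must be multiples of the host buffer sector
	 * size computed below, e.g. (format assumed for illustration) a multiple
	 * of 520 for a 512 + 8 extended-LBA namespace without PRACT.
	 */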
	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
	for_each_rw_ddir(ddir) {
		if (td->o.min_bs[ddir] % block_size != 0 || td->o.max_bs[ddir] % block_size != 0) {
			if (spdk_nvme_ns_supports_extended_lba(ns)) {
				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of (LBA data size + Metadata size)\n");
			} else {
				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of LBA data size\n");
			}
			g_error = true;
			return;
		}
	}

	if (fio_options->zone_append && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
		if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED) {
			SPDK_DEBUGLOG(fio_nvme, "Using zone appends instead of writes on: '%s'\n",
				      f->file_name);
			fio_qpair->zone_append_enabled = true;
		} else {
			SPDK_WARNLOG("Falling back to writes on: '%s' - ns lacks zone append cmd\n",
				     f->file_name);
		}
	}

#if FIO_HAS_ZBD
	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD) {
		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
	}
#endif

	if (fio_options->initial_zone_reset == 1 && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
#if FIO_HAS_ZBD
		struct spdk_nvme_qpair *tmp_qpair;
		int completed = 0, err;

		/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
		 * Create a temporary qpair in order to perform the initial zone reset.
		 */
		assert(!fio_qpair->qpair);

		tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
		if (!tmp_qpair) {
			SPDK_ERRLOG("Cannot allocate a temporary qpair\n");
			g_error = true;
			return;
		}

		err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
		if (err || pcu(tmp_qpair, &completed) || completed < 0) {
			log_err("spdk/nvme: warn: initial_zone_reset: err: %d, cpl: %d\n",
				err, completed);
		}

		spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
#else
		log_err("spdk/nvme: ZBD/ZNS is not supported\n");
#endif
	}

	f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
	if (f->real_file_size <= 0) {
		g_error = true;
		SPDK_ERRLOG("Cannot get size of namespace %p\n", ns);
		return;
	}

	f->filetype = FIO_TYPE_BLOCK;
	fio_file_set_size_known(f);
}

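/*
 * Parse the pi_chk option string into SPDK PRCHK I/O flags. Matching is by
 * substring, so tokens can be combined, e.g. pi_chk=GUARD,REFTAG enables
 * both the Guard and Reference Tag checks.
 */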
static void
parse_prchk_flags(const char *prchk_str)
{
	if (!prchk_str) {
		return;
	}

	if (strstr(prchk_str, "GUARD") != NULL) {
		g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
	}
	if (strstr(prchk_str, "REFTAG") != NULL) {
		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
	}
	if (strstr(prchk_str, "APPTAG") != NULL) {
		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
	}
}

static void
parse_pract_flag(int pract)
{
	if (pract == 1) {
		g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
	} else {
		g_spdk_pract_flag = 0;
	}
}

static bool
fio_redirected_to_dev_null(void)
{
	char path[PATH_MAX] = "";
	ssize_t ret;

	ret = readlink("/proc/self/fd/1", path, sizeof(path));

	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
		return false;
	}

	ret = readlink("/proc/self/fd/2", path, sizeof(path));

	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
		return false;
	}

	return true;
}

static int
spdk_fio_init(struct thread_data *td)
{
	int ret = 0;
	struct spdk_fio_options *fio_options = td->eo;

	if (fio_options->spdk_tracing) {
		ret = spdk_trace_register_user_thread();
	}

	return ret;
}

/* Called once at initialization. This is responsible for gathering the size of
 * each "file", which in our case is given in the form
 * 'key=value [key=value] ... ns=value'.
 * For example, a local PCIe NVMe device is 'trtype=PCIe traddr=0000.04.00.0 ns=1',
 * and a remote device exported by an NVMe-oF target is
 * 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1'. */
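/*
 * A minimal job using such filenames might look as follows (illustrative; the
 * PCIe address is an assumption). fio treats ':' as a filename separator,
 * which is why the PCIe address is written with '.' delimiters:
 *
 *   [global]
 *   ioengine=spdk
 *   thread=1
 *
 *   [job0]
 *   filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *   rw=randread
 *   bs=4k
 */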
static int
spdk_fio_setup(struct thread_data *td)
{
	struct spdk_fio_thread *fio_thread;
	struct spdk_fio_options *fio_options = td->eo;
	struct spdk_fio_probe_ctx ctx;
	struct spdk_env_opts opts;
	struct fio_file *f;
	char *p;
	int rc = 0;
	struct spdk_nvme_transport_id trid;
	struct spdk_fio_ctrlr *fio_ctrlr;
	char *trid_info;
	unsigned int i;
	size_t size;

	/*
	 * If we're running in a daemonized FIO instance, it's possible that
	 * fd 1/2 were re-used for something important by FIO. Newer fio
	 * versions are careful to redirect those to /dev/null, but if they
	 * aren't redirected, abort early so we don't accidentally write
	 * messages over an important file, etc.
	 */
	if (is_backend && !fio_redirected_to_dev_null()) {
		char buf[1024];
		snprintf(buf, sizeof(buf),
			 "SPDK FIO plugin is in daemon mode, but stdout/stderr "
			 "aren't redirected to /dev/null. Aborting.");
		fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
		return -1;
	}

	if (!td->o.use_thread) {
		log_err("spdk: must set thread=1 when using spdk plugin\n");
		return 1;
	}

	if (g_log_flag_error) {
		/* The first thread found an error when parsing log flags, so
		 * just return error immediately for all of the other threads.
		 */
		return 1;
	}

	pthread_mutex_lock(&g_mutex);

	fio_thread = calloc(1, sizeof(*fio_thread));
	assert(fio_thread != NULL);

	td->io_ops_data = fio_thread;
	fio_thread->td = td;

	fio_thread->iocq_size = td->o.iodepth;
	fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
	assert(fio_thread->iocq != NULL);

	TAILQ_INIT(&fio_thread->fio_qpair);

	if (!g_spdk_env_initialized) {
		opts.opts_size = sizeof(opts);
		spdk_env_opts_init(&opts);
		opts.name = "fio";
		opts.mem_size = fio_options->mem_size;
		opts.shm_id = fio_options->shm_id;
		g_spdk_enable_sgl = fio_options->enable_sgl;
		g_spdk_sge_size = fio_options->sge_size;
		g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
		parse_pract_flag(fio_options->pi_act);
		g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
		g_spdk_apptag = (uint16_t)fio_options->apptag;
		g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
		parse_prchk_flags(fio_options->pi_chk);
		if (spdk_env_init(&opts) < 0) {
			SPDK_ERRLOG("Unable to initialize SPDK env\n");
			free(fio_thread->iocq);
			free(fio_thread);
			fio_thread = NULL;
			pthread_mutex_unlock(&g_mutex);
			return 1;
		}

		if (fio_options->log_flags) {
			char *tok = strtok(fio_options->log_flags, ",");
			do {
				rc = spdk_log_set_flag(tok);
				if (rc < 0) {
					SPDK_ERRLOG("unknown log flag %s\n", tok);
					g_log_flag_error = true;
					pthread_mutex_unlock(&g_mutex);
					return 1;
				}
			} while ((tok = strtok(NULL, ",")) != NULL);
#ifdef DEBUG
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#endif
		}

		g_spdk_env_initialized = true;
		spdk_unaffinitize_thread();

		if (fio_options->spdk_tracing) {
			spdk_trace_init("spdk_fio_tracepoints", 65536, td->o.numjobs);
			spdk_trace_enable_tpoint_group("nvme_pcie");
			spdk_trace_enable_tpoint_group("nvme_tcp");
		}

		/* Spawn a thread to continue polling the controllers */
		rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
		}

		if (fio_options->enable_vmd && spdk_vmd_init()) {
			SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices may be unavailable.\n");
		}
	}
	pthread_mutex_unlock(&g_mutex);

	for_each_file(td, f, i) {
		memset(&trid, 0, sizeof(trid));
		memset(&ctx, 0, sizeof(ctx));

		trid.trtype = SPDK_NVME_TRANSPORT_PCIE;

		p = strstr(f->file_name, " ns=");
		if (p != NULL) {
			trid_info = strndup(f->file_name, p - f->file_name);
		} else {
			trid_info = strndup(f->file_name, strlen(f->file_name));
		}

		if (!trid_info) {
			SPDK_ERRLOG("Failed to allocate space for trid_info\n");
			continue;
		}

		rc = spdk_nvme_transport_id_parse(&trid, trid_info);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
			free(trid_info);
			continue;
		}
		free(trid_info);

		if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			struct spdk_pci_addr pci_addr;
			if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
				SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
				continue;
			}
			spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
		} else {
			if (trid.subnqn[0] == '\0') {
				snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
					 SPDK_NVMF_DISCOVERY_NQN);
			}
			if ((p = strcasestr(f->file_name, "hostnqn:")) ||
			    (p = strcasestr(f->file_name, "hostnqn="))) {
				p += strlen("hostnqn:");
				size = strcspn(p, " \t\n");
				/* Reserve a byte for the NUL terminator. */
				if (size >= sizeof(ctx.hostnqn)) {
					SPDK_ERRLOG("Invalid hostnqn: too long\n");
					continue;
				}
				memcpy(ctx.hostnqn, p, size);
			}
		}

		ctx.td = td;
		ctx.f = f;

		pthread_mutex_lock(&g_mutex);
		fio_ctrlr = get_fio_ctrlr(&trid);
		pthread_mutex_unlock(&g_mutex);
		if (fio_ctrlr) {
			attach_cb(&ctx, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
		} else {
			/* Enumerate all of the controllers */
			if (spdk_nvme_probe(&trid, &ctx, probe_cb, attach_cb, NULL) != 0) {
				SPDK_ERRLOG("spdk_nvme_probe() failed\n");
				continue;
			}
		}

		if (g_error) {
			log_err("Failed to initialize spdk fio plugin\n");
			rc = 1;
			break;
		}
	}

	pthread_mutex_lock(&g_mutex);
	g_td_count++;
	pthread_mutex_unlock(&g_mutex);

	return rc;
}

static int
spdk_fio_open(struct thread_data *td, struct fio_file *f)
{
	struct spdk_fio_qpair *fio_qpair = f->engine_data;
	struct spdk_fio_ctrlr *fio_ctrlr = fio_qpair->fio_ctrlr;
	struct spdk_fio_options *fio_options = td->eo;
	struct spdk_nvme_io_qpair_opts	qpopts;

	assert(fio_qpair->qpair == NULL);
	spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
	qpopts.delay_cmd_submit = true;
	if (fio_options->enable_wrr) {
		qpopts.qprio = fio_options->wrr_priority;
	}
	qpopts.disable_pcie_sgl_merge = fio_options->disable_pcie_sgl_merge;

	fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
	if (!fio_qpair->qpair) {
		SPDK_ERRLOG("Cannot allocate any more NVMe io_qpairs\n");
		g_error = true;
		free(fio_qpair);
		return -1;
	}

	if (fio_options->print_qid_mappings == 1) {
		log_info("job %s: %s qid %d\n", td->o.name, f->file_name,
			 spdk_nvme_qpair_get_id(fio_qpair->qpair));
	}

	return 0;
}

static int
spdk_fio_close(struct thread_data *td, struct fio_file *f)
{
	struct spdk_fio_qpair *fio_qpair = f->engine_data;

	assert(fio_qpair->qpair != NULL);
	spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
	fio_qpair->qpair = NULL;
	return 0;
}

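/*
 * fio's I/O buffer pool is carved out of SPDK DMA-safe memory (hugepage-backed
 * and pinned) so the NVMe driver can DMA directly to/from the buffers.
 */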
static int
spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	td->orig_buffer = spdk_dma_zmalloc(total_mem, NVME_IO_ALIGN, NULL);
	return td->orig_buffer == NULL;
}

static void
spdk_fio_iomem_free(struct thread_data *td)
{
	spdk_dma_free(td->orig_buffer);
}

static int
spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
{
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_request	*fio_req;
	uint32_t dsm_size;

	io_u->engine_data = NULL;

	fio_req = calloc(1, sizeof(*fio_req));
	if (fio_req == NULL) {
		return 1;
	}

	if (!(td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)) {
#if FIO_HAS_MRT
		/* By default the number of ranges is 1. */
		dsm_size = td->o.num_range * sizeof(struct spdk_nvme_dsm_range);
#else
		dsm_size = sizeof(struct spdk_nvme_dsm_range);
#endif
		fio_req->dsm_range = calloc(1, dsm_size);
		if (fio_req->dsm_range == NULL) {
			free(fio_req);
			return 1;
		}
	}

	fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
	if (fio_req->md_buf == NULL) {
		fprintf(stderr, "Allocation of %u bytes of metadata failed\n", g_spdk_md_per_io_size);
		free(fio_req->dsm_range);
		free(fio_req);
		return 1;
	}

	fio_req->io = io_u;
	fio_req->fio_thread = fio_thread;

	io_u->engine_data = fio_req;

	return 0;
}

static void
spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
{
	struct spdk_fio_request *fio_req = io_u->engine_data;

	if (fio_req) {
		assert(fio_req->io == io_u);
		spdk_dma_free(fio_req->md_buf);
		free(fio_req->dsm_range);
		free(fio_req);
		io_u->engine_data = NULL;
	}
}

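/*
 * Map a byte offset to the start LBA of the zone containing it. Worked
 * example (sizes assumed for illustration): with 64 MiB zones and 512-byte
 * sectors, an offset of 100 MiB falls in zone 1, whose start LBA is
 * 1 * (64 MiB / 512) = 131072.
 */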
static inline uint64_t
fio_offset_to_zslba(unsigned long long offset, struct spdk_nvme_ns *ns)
{
	return (offset / spdk_nvme_zns_ns_get_zone_size(ns)) * spdk_nvme_zns_ns_get_zone_size_sectors(ns);
}

static int
fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, extended_lba_size, lba_count;
	uint64_t lba;
	struct iovec iov;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	/* Set appmask and apptag when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
		fio_req->dif_ctx.app_tag = g_spdk_apptag;
		return 0;
	}

	extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	lba = io_u->offset / extended_lba_size;
	lba_count = io_u->xfer_buflen / extended_lba_size;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
			       true, fio_qpair->md_start,
			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
			       0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (io_u->ddir != DDIR_WRITE) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
	if (rc != 0) {
		fprintf(stderr, "Generation of DIF failed\n");
	}

	return rc;
}

static int
fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, block_size, lba_count;
	uint64_t lba;
	struct iovec iov, md_iov;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	/* Set appmask and apptag when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
		fio_req->dif_ctx.app_tag = g_spdk_apptag;
		return 0;
	}

	block_size = spdk_nvme_ns_get_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	lba = io_u->offset / block_size;
	lba_count = io_u->xfer_buflen / block_size;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
			       false, fio_qpair->md_start,
			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
			       0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (io_u->ddir != DDIR_WRITE) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	md_iov.iov_base = fio_req->md_buf;
	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
	rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
	if (rc < 0) {
		fprintf(stderr, "Generation of DIX failed\n");
	}

	return rc;
}

static int
fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t lba_count;
	struct iovec iov;
	struct spdk_dif_error err_blk = {};
	int rc;

	/* Do nothing when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);

	rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
	if (rc != 0) {
		fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}

static int
fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, lba_count;
	struct iovec iov, md_iov;
	struct spdk_dif_error err_blk = {};
	int rc;

	/* Do nothing when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	md_iov.iov_base = fio_req->md_buf;
	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);

	rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
	if (rc != 0) {
		fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}

static void
spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct spdk_fio_request		*fio_req = ctx;
	struct spdk_fio_thread		*fio_thread = fio_req->fio_thread;
	struct spdk_fio_qpair		*fio_qpair = fio_req->fio_qpair;
	int				rc;

	if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
		if (fio_qpair->extended_lba) {
			rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
		} else {
			rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
		}
		if (rc != 0) {
			fio_req->io->error = abs(rc);
		}
	}

	if (spdk_nvme_cpl_is_error(cpl)) {
		fio_req->io->error = EIO;
	}

	assert(fio_thread->iocq_count < fio_thread->iocq_size);
	fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
}

static void
spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
{
	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;

	fio_req->iov_offset = sgl_offset;
	fio_req->bit_bucket_data_len = 0;
}

static int
spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
{
	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
	struct io_u *io_u = fio_req->io;
	uint32_t iov_len;
	uint32_t bit_bucket_len;

	*address = io_u->buf;

	if (fio_req->iov_offset) {
		assert(fio_req->iov_offset <= io_u->xfer_buflen);
		*address += fio_req->iov_offset;
	}

	iov_len = io_u->xfer_buflen - fio_req->iov_offset;
	if (iov_len > g_spdk_sge_size) {
		iov_len = g_spdk_sge_size;
	}

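	/* Serve up to g_spdk_bit_bucket_data_len bytes of a read as Bit Bucket
	 * SGEs: the placeholder address UINT64_MAX marks the SGE as data to be
	 * discarded rather than transferred to the host.
	 */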
	if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
		assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
		*address = (void *)UINT64_MAX;
		bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
		if (iov_len > bit_bucket_len) {
			iov_len = bit_bucket_len;
		}
		fio_req->bit_bucket_data_len += iov_len;
	}

	fio_req->iov_offset += iov_len;
	*length = iov_len;

	return 0;
}

#if FIO_IOOPS_VERSION >= 24
typedef enum fio_q_status fio_q_status_t;
#else
typedef int fio_q_status_t;
#endif

static fio_q_status_t
spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
{
	int rc = 1;
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_request	*fio_req = io_u->engine_data;
	struct spdk_fio_qpair	*fio_qpair;
	struct spdk_nvme_ns	*ns = NULL;
	void			*md_buf = NULL;
	struct spdk_dif_ctx	*dif_ctx = &fio_req->dif_ctx;
#if FIO_HAS_FDP
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
#endif
	struct spdk_nvme_dsm_range *range;
	uint32_t		block_size;
	uint64_t		lba;
	uint32_t		lba_count;
	uint32_t		num_range;

	fio_qpair = get_fio_qpair(fio_thread, io_u->file);
	if (fio_qpair == NULL) {
		return -ENXIO;
	}
	ns = fio_qpair->ns;

	if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
		md_buf = fio_req->md_buf;
	}
	fio_req->fio_qpair = fio_qpair;

	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
	lba = io_u->offset / block_size;
	lba_count = io_u->xfer_buflen / block_size;

#if FIO_HAS_FDP
	/* Write commands with directives are only supported when SGL is enabled. */
	if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
		log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
		io_u->error = -EINVAL;
		return FIO_Q_COMPLETED;
	}
#endif

	/* TODO: consider situations where fio randomizes and verifies io_u. */
	if (fio_qpair->nvme_pi_enabled) {
		if (fio_qpair->extended_lba) {
			rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
		} else {
			rc = fio_separate_md_setup_pi(fio_qpair, io_u);
		}
		if (rc < 0) {
			io_u->error = -rc;
			return FIO_Q_COMPLETED;
		}
	}

	switch (io_u->ddir) {
	case DDIR_READ:
		if (!g_spdk_enable_sgl) {
			rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
							   spdk_fio_completion_cb, fio_req,
							   fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
		} else {
			rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
							    lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
							    spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
							    dif_ctx->apptag_mask, dif_ctx->app_tag);
		}
		break;
	case DDIR_WRITE:
		if (!g_spdk_enable_sgl) {
			if (!fio_qpair->zone_append_enabled) {
				rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
								    lba_count,
								    spdk_fio_completion_cb, fio_req,
								    fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
			} else {
				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
				rc = spdk_nvme_zns_zone_append_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, zslba,
								       lba_count,
								       spdk_fio_completion_cb, fio_req,
								       fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
			}
		} else {
			if (!fio_qpair->zone_append_enabled) {
#if FIO_HAS_FDP
				if (spdk_unlikely(io_u->dtype)) {
					ext_opts.size = SPDK_SIZEOF(&ext_opts, cdw13);
					ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
					ext_opts.metadata = md_buf;
					ext_opts.cdw13 = (io_u->dspec << 16);
					ext_opts.apptag = dif_ctx->app_tag;
					ext_opts.apptag_mask = dif_ctx->apptag_mask;
					rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
									 spdk_fio_completion_cb, fio_req,
									 spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
					break;
				}
#endif
				rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
								     lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
								     spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
								     dif_ctx->apptag_mask, dif_ctx->app_tag);
			} else {
				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
				rc = spdk_nvme_zns_zone_appendv_with_md(ns, fio_qpair->qpair, zslba,
									lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
									spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
									dif_ctx->apptag_mask, dif_ctx->app_tag);
			}
		}
		break;
	case DDIR_TRIM:
		if (td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
			do_io_u_trim(td, io_u);
			io_u_mark_submit(td, 1);
			io_u_mark_complete(td, 1);
			return FIO_Q_COMPLETED;
		}

		range = fio_req->dsm_range;
#if FIO_HAS_MRT
		if (td->o.num_range == 1) {
			range->attributes.raw = 0;
			range->length = lba_count;
			range->starting_lba = lba;
			num_range = 1;
		} else {
			struct trim_range *tr = (struct trim_range *)io_u->xfer_buf;
			for (uint32_t i = 0; i < io_u->number_trim; i++) {
				range->attributes.raw = 0;
				range->length = tr->len / block_size;
				range->starting_lba = tr->start / block_size;
				range++;
				tr++;
			}
			num_range = io_u->number_trim;
			range = fio_req->dsm_range;
		}
#else
		range->attributes.raw = 0;
		range->length = lba_count;
		range->starting_lba = lba;
		num_range = 1;
#endif

		rc = spdk_nvme_ns_cmd_dataset_management(ns, fio_qpair->qpair,
				SPDK_NVME_DSM_ATTR_DEALLOCATE, range, num_range,
				spdk_fio_completion_cb, fio_req);
		break;
	default:
		assert(false);
		break;
	}

	/* NVMe read/write functions return -ENOMEM if there are no free requests. */
	if (rc == -ENOMEM) {
		return FIO_Q_BUSY;
	}

	if (rc != 0) {
		io_u->error = abs(rc);
		return FIO_Q_COMPLETED;
	}

	return FIO_Q_QUEUED;
}

static struct io_u *
spdk_fio_event(struct thread_data *td, int event)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;

	assert(event >= 0);
	assert((unsigned)event < fio_thread->iocq_count);
	return fio_thread->iocq[event];
}

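/*
 * Reap between 'min' and 'max' completions, polling this thread's qpairs
 * round-robin starting after the qpair that satisfied the previous call,
 * until the optional timeout 't' expires.
 */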
static int
spdk_fio_getevents(struct thread_data *td, unsigned int min,
		   unsigned int max, const struct timespec *t)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	struct timespec t0, t1;
	uint64_t timeout = 0;

	if (t) {
		timeout = t->tv_sec * 1000000000L + t->tv_nsec;
		clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
	}

	fio_thread->iocq_count = 0;

	/* fetch the next qpair */
	if (fio_thread->fio_qpair_current) {
		fio_qpair = TAILQ_NEXT(fio_thread->fio_qpair_current, link);
	}

	for (;;) {
		if (fio_qpair == NULL) {
			fio_qpair = TAILQ_FIRST(&fio_thread->fio_qpair);
		}

		while (fio_qpair != NULL) {
			/*
			 * We can be called while spdk_fio_open()s are still
			 * ongoing, in which case, ->qpair can still be NULL.
			 */
			if (fio_qpair->qpair == NULL) {
				fio_qpair = TAILQ_NEXT(fio_qpair, link);
				continue;
			}

			spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);

			if (fio_thread->iocq_count >= min) {
				/* reset the current handling qpair */
				fio_thread->fio_qpair_current = fio_qpair;
				return fio_thread->iocq_count;
			}

			fio_qpair = TAILQ_NEXT(fio_qpair, link);
		}

		if (t) {
			uint64_t elapse;

			clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
			elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
				 + t1.tv_nsec - t0.tv_nsec;
			if (elapse > timeout) {
				break;
			}
		}
	}

	/* reset the current handling qpair */
	fio_thread->fio_qpair_current = fio_qpair;
	return fio_thread->iocq_count;
}

static int
spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
{
	/* TODO: This should probably send a flush to the device, but for now just return successful. */
	return 0;
}

#if FIO_HAS_ZBD
static int
spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns_data = NULL;

	if (f->filetype != FIO_TYPE_BLOCK) {
		log_info("spdk/nvme: unsupported filetype: %d\n", f->filetype);
		return -EINVAL;
	}

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	switch (spdk_nvme_ns_get_csi(fio_qpair->ns)) {
	case SPDK_NVME_CSI_NVM:
		*model = ZBD_NONE;
		return 0;

	case SPDK_NVME_CSI_KV:
		log_err("spdk/nvme: KV namespace is currently not supported\n");
		return -ENOSYS;

	case SPDK_NVME_CSI_ZNS:
		zns_data = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
		if (!zns_data) {
			log_err("spdk/nvme: file_name: '%s', ZNS is not enabled\n", f->file_name);
			return -EINVAL;
		}

		*model = ZBD_HOST_MANAGED;

		return 0;
	}

	return -EINVAL;
}

static int
spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
		      struct zbd_zone *zbdz, unsigned int nr_zones)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns = NULL;
	struct spdk_nvme_zns_zone_report *report;
	struct spdk_nvme_qpair *tmp_qpair;
	uint32_t report_nzones = 0, report_nzones_max, report_nbytes, mdts_nbytes;
	uint64_t zsze_nbytes, ns_nzones, lba_nbytes;
	int completed = 0, err;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}
	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
	if (!zns) {
		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
		return -EINVAL;
	}

	/* The qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
	 * Create a temporary qpair to issue the Report Zones command.
	 */
	assert(!fio_qpair->qpair);

	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
	if (!tmp_qpair) {
		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
		return -EIO;
	}

	/** Retrieve device parameters */
	mdts_nbytes = spdk_nvme_ns_get_max_io_xfer_size(fio_qpair->ns);
	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
	ns_nzones = spdk_nvme_zns_ns_get_num_zones(fio_qpair->ns);

	/** Allocate a report buffer sized to not exceed the MDTS, the caller's zbdz storage, or the number of zones on the namespace */
	report_nzones_max = (mdts_nbytes - sizeof(*report)) / sizeof(report->descs[0]);
	report_nzones_max = spdk_min(spdk_min(report_nzones_max, nr_zones), ns_nzones);
	report_nbytes = sizeof(report->descs[0]) * report_nzones_max + sizeof(*report);
	report = calloc(1, report_nbytes);
	if (!report) {
		log_err("spdk/nvme: failed report_zones(): ENOMEM\n");
		err = -ENOMEM;
		goto exit;
	}

	err = spdk_nvme_zns_report_zones(fio_qpair->ns, tmp_qpair, report, report_nbytes,
					 offset / lba_nbytes, SPDK_NVME_ZRA_LIST_ALL, true, pcu_cb,
					 &completed);
	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
		log_err("spdk/nvme: report_zones(): err: %d, cpl: %d\n", err, completed);
		err = err ? err : -EIO;
		goto exit;
	}
	assert(report->nr_zones <= report_nzones_max);
	report_nzones = report->nr_zones;

	for (uint64_t idx = 0; idx < report->nr_zones; ++idx) {
		struct spdk_nvme_zns_zone_desc *zdesc = &report->descs[idx];

		zbdz[idx].start = zdesc->zslba * lba_nbytes;
		zbdz[idx].len = zsze_nbytes;
		zbdz[idx].capacity = zdesc->zcap * lba_nbytes;
		zbdz[idx].wp = zdesc->wp * lba_nbytes;

		switch (zdesc->zt) {
		case SPDK_NVME_ZONE_TYPE_SEQWR:
			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
			break;

		default:
			log_err("spdk/nvme: %s: inv. zone-type: 0x%x\n", f->file_name, zdesc->zt);
			err = -EIO;
			goto exit;
		}

		switch (zdesc->zs) {
		case SPDK_NVME_ZONE_STATE_EMPTY:
			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
			break;
		case SPDK_NVME_ZONE_STATE_IOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
			break;
		case SPDK_NVME_ZONE_STATE_EOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
			break;
		case SPDK_NVME_ZONE_STATE_CLOSED:
			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
			break;
		case SPDK_NVME_ZONE_STATE_RONLY:
			zbdz[idx].cond = ZBD_ZONE_COND_READONLY;
			break;
		case SPDK_NVME_ZONE_STATE_FULL:
			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
			break;
		case SPDK_NVME_ZONE_STATE_OFFLINE:
			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
			break;

		default:
			log_err("spdk/nvme: %s: inv. zone-state: 0x%x\n", f->file_name, zdesc->zs);
			err = -EIO;
			goto exit;
		}
	}

exit:
	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
	free(report);

	return err ? err : (int)report_nzones;
}

static int
spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns = NULL;
	uint64_t zsze_nbytes, lba_nbytes;
	int err = 0;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}
	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
	if (!zns) {
		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
		return -EINVAL;
	}
	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);

	/** check the assumption that offset is valid zone-start lba */
	if (offset % zsze_nbytes) {
		log_err("spdk/nvme: offset: %zu is not a valid zslba\n", offset);
		return -EINVAL;
	}

	for (uint64_t cur = offset; cur < offset + length; cur += zsze_nbytes) {
		int completed = 0;

		err = spdk_nvme_zns_reset_zone(fio_qpair->ns, fio_qpair->qpair, cur / lba_nbytes,
					       false, pcu_cb, &completed);
		if (err || pcu(fio_qpair->qpair, &completed) || completed < 0) {
			log_err("spdk/nvme: zns_reset_zone(): err: %d, cpl: %d\n", err, completed);
			err = err ? err : -EIO;
			break;
		}
	}

	return err;
}
#endif

#if FIO_IOOPS_VERSION >= 30
static int
spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
			    unsigned int *max_open_zones)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	*max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(fio_qpair->ns);

	return 0;
}
#endif

#if FIO_HAS_FDP
/**
 * This is called twice because the number of ruhs descriptors is unknown.
 * In the first call fio only sends a buffer to fetch the number of ruhs
 * descriptors. In the second call fio sends a buffer large enough to fetch
 * all of the ruhs descriptors.
 */
static int
spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
			struct fio_ruhs_info *fruhs_info)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	struct spdk_nvme_qpair *tmp_qpair;
	struct spdk_nvme_fdp_ruhs *fdp_ruhs;
	uint32_t ruhs_nbytes;
	uint16_t idx, nruhsd;
	int completed = 0, err;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	/* The qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
	 * Create a temporary qpair to fetch the RUHS descriptors.
	 */
	assert(!fio_qpair->qpair);

	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
	if (!tmp_qpair) {
		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
		return -EIO;
	}

	nruhsd = fruhs_info->nr_ruhs;
	ruhs_nbytes = sizeof(*fdp_ruhs) + nruhsd * sizeof(struct spdk_nvme_fdp_ruhs_desc);
	fdp_ruhs = calloc(1, ruhs_nbytes);
	if (!fdp_ruhs) {
		log_err("spdk/nvme: failed fdp_fetch_ruhs(): ENOMEM\n");
		err = -ENOMEM;
		goto exit;
	}

	err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, fdp_ruhs, ruhs_nbytes,
					    SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
		log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
		err = err ? err : -EIO;
		goto exit;
	}

	fruhs_info->nr_ruhs = fdp_ruhs->nruhsd;
	for (idx = 0; idx < nruhsd; idx++) {
		fruhs_info->plis[idx] = fdp_ruhs->ruhs_desc[idx].pid;
	}

exit:
	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
	free(fdp_ruhs);

	return err;
}
#endif

static void
spdk_fio_cleanup(struct thread_data *td)
{
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_qpair	*fio_qpair, *fio_qpair_tmp;
	struct spdk_fio_options *fio_options = td->eo;

	if (fio_options->spdk_tracing) {
		spdk_trace_unregister_user_thread();
	}

	TAILQ_FOREACH_SAFE(fio_qpair, &fio_thread->fio_qpair, link, fio_qpair_tmp) {
		TAILQ_REMOVE(&fio_thread->fio_qpair, fio_qpair, link);
		free(fio_qpair);
	}

	free(fio_thread->iocq);
	free(fio_thread);

	pthread_mutex_lock(&g_mutex);
	g_td_count--;
	if (g_td_count == 0) {
		struct spdk_fio_ctrlr	*fio_ctrlr, *fio_ctrlr_tmp;
		struct spdk_nvme_detach_ctx	*detach_ctx = NULL;

		TAILQ_FOREACH_SAFE(fio_ctrlr, &g_ctrlrs, link, fio_ctrlr_tmp) {
			TAILQ_REMOVE(&g_ctrlrs, fio_ctrlr, link);
			spdk_nvme_detach_async(fio_ctrlr->ctrlr, &detach_ctx);
			free(fio_ctrlr);
		}

		if (detach_ctx) {
			spdk_nvme_detach_poll(detach_ctx);
		}

		if (fio_options->enable_vmd) {
			spdk_vmd_fini();
		}
	}
	pthread_mutex_unlock(&g_mutex);
	if (TAILQ_EMPTY(&g_ctrlrs)) {
		if (pthread_cancel(g_ctrlr_thread_id) == 0) {
			pthread_join(g_ctrlr_thread_id, NULL);
		}
	}
}

/* This table adds SPDK-specific parameters to the fio config. New parameters
 * are added by defining them here along with a callback function to read the
 * parameter value. */
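/*
 * In a job file these options go in the job or global section, e.g.
 * (values assumed for illustration):
 *
 *   [global]
 *   ioengine=spdk
 *   enable_sgl=1
 *   pi_act=0
 *   pi_chk=GUARD,REFTAG
 */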
static struct fio_option options[] = {
	{
		.name           = "enable_wrr",
		.lname          = "Enable weighted round robin (WRR) for IO submission queues",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, enable_wrr),
		.def            = "0",
		.help           = "Enable weighted round robin (WRR) for IO submission queues",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "arbitration_burst",
		.lname          = "Arbitration Burst",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, arbitration_burst),
		.def            = "0",
		.help           = "Arbitration Burst used for WRR (valid range from 0 - 7)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "low_weight",
		.lname          = "low_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, low_weight),
		.def            = "0",
		.help           = "low weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "medium_weight",
		.lname          = "medium_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, medium_weight),
		.def            = "0",
		.help           = "medium weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "high_weight",
		.lname          = "high_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, high_weight),
		.def            = "0",
		.help           = "high weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "wrr_priority",
		.lname          = "priority used for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, wrr_priority),
		.def            = "0",
		.help           = "priority used for WRR (valid range from 0 - 3)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name		= "mem_size_mb",
		.lname		= "Memory size in MB",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, mem_size),
		.def		= "0",
		.help		= "Memory Size for SPDK (MB)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "shm_id",
		.lname		= "shared memory ID",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, shm_id),
		.def		= "-1",
		.help		= "Shared Memory ID",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "enable_sgl",
		.lname		= "SGL used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_sgl),
		.def		= "0",
		.help		= "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "sge_size",
		.lname		= "SGL size used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, sge_size),
		.def		= "4096",
		.help		= "SGL size in bytes for I/O Commands (default 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "disable_pcie_sgl_merge",
		.lname		= "Disable merging of physically contiguous SGL elements",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, disable_pcie_sgl_merge),
		.def		= "0",
		.help		= "Disable SGL element merging (0=merging, 1=no merging)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "bit_bucket_data_len",
		.lname		= "Amount of data used for Bit Bucket",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, bit_bucket_data_len),
		.def		= "0",
		.help		= "Bit Bucket Data Length for READ commands (disabled by default)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "hostnqn",
		.lname		= "Host NQN to use when connecting to controllers",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, hostnqn),
		.help		= "Host NQN",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_act",
		.lname		= "Protection Information Action",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, pi_act),
		.def		= "1",
		.help		= "Protection Information Action bit (pi_act=1 or pi_act=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_chk",
		.lname		= "Protection Information Check (GUARD|REFTAG|APPTAG)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, pi_chk),
		.def		= NULL,
		.help		= "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "md_per_io_size",
		.lname		= "Separate Metadata Buffer Size per I/O",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, md_per_io_size),
		.def		= "4096",
		.help		= "Size of separate metadata buffer per I/O (Default: 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag",
		.lname		= "Application Tag used in Protection Information",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag),
		.def		= "0x1234",
		.help		= "Application Tag used in Protection Information field (Default: 0x1234)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag_mask",
		.lname		= "Application Tag Mask",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag_mask),
		.def		= "0xffff",
		.help		= "Application Tag Mask used with Application Tag (Default: 0xffff)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "digest_enable",
		.lname		= "PDU digest choice for NVMe/TCP Transport (NONE|HEADER|DATA|BOTH)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, digest_enable),
		.def		= NULL,
		.help		= "Control the NVMe/TCP PDU digest (digest_enable=NONE|HEADER|DATA|BOTH)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "enable_vmd",
		.lname		= "Enable VMD enumeration",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_vmd),
		.def		= "0",
		.help		= "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "initial_zone_reset",
		.lname		= "Reset Zones on initialization",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, initial_zone_reset),
		.def		= "0",
		.help		= "Reset Zones on initialization (0=disable, 1=Reset All Zones)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "zone_append",
		.lname		= "Use zone append instead of write",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, zone_append),
		.def		= "0",
		.help		= "Use zone append instead of write (1=zone append, 0=write)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "print_qid_mappings",
		.lname		= "Print job-to-qid mappings",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, print_qid_mappings),
		.def		= "0",
		.help		= "Print job-to-qid mappings (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "log_flags",
		.lname		= "log_flags",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, log_flags),
		.help		= "Enable log flags (comma-separated list)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "spdk_tracing",
		.lname		= "Enable SPDK Tracing",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, spdk_tracing),
		.def		= "0",
		.help		= "SPDK Tracing (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= NULL,
	},
};
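
/* Illustrative only: a minimal job file snippet exercising a few of the
 * options defined above (the PCIe address and option values here are
 * placeholders, not recommendations):
 *
 *   [global]
 *   ioengine=spdk
 *   thread=1
 *   enable_sgl=1
 *   sge_size=8192
 *   log_flags=fio_nvme
 *
 *   [job1]
 *   filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *
 * fio matches each key against an entry's .name and stores the parsed
 * value at .off1 within struct spdk_fio_options. */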

/* FIO imports this structure using dlsym */
struct ioengine_ops ioengine = {
	.name			= "spdk",
	.version		= FIO_IOOPS_VERSION,
	.queue			= spdk_fio_queue,
	.getevents		= spdk_fio_getevents,
	.event			= spdk_fio_event,
	.cleanup		= spdk_fio_cleanup,
	.open_file		= spdk_fio_open,
	.close_file		= spdk_fio_close,
	.invalidate		= spdk_fio_invalidate,
	.iomem_alloc		= spdk_fio_iomem_alloc,
	.iomem_free		= spdk_fio_iomem_free,
	.setup			= spdk_fio_setup,
	.init			= spdk_fio_init,
	.io_u_init		= spdk_fio_io_u_init,
	.io_u_free		= spdk_fio_io_u_free,
#if FIO_HAS_ZBD
	.get_zoned_model	= spdk_fio_get_zoned_model,
	.report_zones		= spdk_fio_report_zones,
	.reset_wp		= spdk_fio_reset_wp,
#endif
#if FIO_IOOPS_VERSION >= 30
	.get_max_open_zones	= spdk_fio_get_max_open_zones,
#endif
#if FIO_HAS_FDP
	.fdp_fetch_ruhs		= spdk_fio_fdp_fetch_ruhs,
#endif
#if FIO_HAS_MRT
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO | FIO_MULTI_RANGE_TRIM,
#else
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
#endif
	.options		= options,
	.option_struct_size	= sizeof(struct spdk_fio_options),
};
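
/* A sketch of two common ways to load the engine (paths are illustrative;
 * consult SPDK's fio plugin documentation for the exact build output
 * location):
 *
 *   LD_PRELOAD=<spdk>/build/fio/spdk_nvme fio job.fio
 *       (job file sets ioengine=spdk; the fio_init constructor below
 *        registers the engine at load time)
 *
 *   ioengine=external:<spdk>/build/fio/spdk_nvme
 *       (fio dlopen()s the plugin and resolves the "ioengine" symbol
 *        above via dlsym) */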

/* fio_init and fio_exit are fio-provided attribute macros that mark these
 * as constructor/destructor functions, run automatically when the plugin
 * is loaded and unloaded. */
static void fio_init
fio_spdk_register(void)
{
	register_ioengine(&ioengine);
}

static void fio_exit
fio_spdk_unregister(void)
{
	if (g_spdk_env_initialized) {
		spdk_trace_cleanup();
		spdk_env_fini();
	}

	unregister_ioengine(&ioengine);
}

SPDK_LOG_REGISTER_COMPONENT(fio_nvme)