xref: /spdk/app/fio/nvme/fio_plugin.c (revision ee32a82bfd3ff5b1a10ed775ee06f0eaffce60eb)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "spdk/nvme.h"
9 #include "spdk/nvme_zns.h"
10 #include "spdk/vmd.h"
11 #include "spdk/env.h"
12 #include "spdk/string.h"
13 #include "spdk/log.h"
14 #include "spdk/likely.h"
15 #include "spdk/endian.h"
16 #include "spdk/dif.h"
17 #include "spdk/util.h"
18 #include "spdk/trace.h"
19 
20 #include "config-host.h"
21 #include "fio.h"
22 #include "optgroup.h"
23 
24 #ifdef for_each_rw_ddir
25 #define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
26 #define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 35)
27 #define FIO_HAS_MRT (FIO_IOOPS_VERSION >= 34)
28 #else
29 #define FIO_HAS_ZBD (0)
30 #define FIO_HAS_FDP (0)
31 #define FIO_HAS_MRT (0)
32 #endif
33 
34 /* FreeBSD is missing CLOCK_MONOTONIC_RAW,
35  * so an alternative is provided. */
36 #ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
37 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
38 #endif
39 
40 #define NVME_IO_ALIGN		4096
41 
42 static bool g_spdk_env_initialized;
43 static bool g_log_flag_error;
44 static int g_spdk_enable_sgl = 0;
45 static uint32_t g_spdk_sge_size = 4096;
46 static uint32_t g_spdk_bit_bucket_data_len = 0;
47 static uint32_t g_spdk_pract_flag;
48 static uint32_t g_spdk_prchk_flags;
49 static uint32_t g_spdk_md_per_io_size = 4096;
50 static uint16_t g_spdk_apptag;
51 static uint16_t g_spdk_apptag_mask;
52 
53 struct spdk_fio_options {
54 	void	*pad;	/* off1 used in the option descriptions below must not be 0 */
55 	int	enable_wrr;
56 	int	arbitration_burst;
57 	int	low_weight;
58 	int	medium_weight;
59 	int	high_weight;
60 	int	wrr_priority;
61 	int	mem_size;
62 	int	shm_id;
63 	int	enable_sgl;
64 	int	sge_size;
65 	int	bit_bucket_data_len;
66 	char	*hostnqn;
67 	int	pi_act;
68 	char	*pi_chk;
69 	int	md_per_io_size;
70 	int	apptag;
71 	int	apptag_mask;
72 	char	*digest_enable;
73 	int	enable_vmd;
74 	int	initial_zone_reset;
75 	int	zone_append;
76 	int	print_qid_mappings;
77 	int	spdk_tracing;
78 	char	*log_flags;
79 	int	disable_pcie_sgl_merge;
80 };
81 
82 struct spdk_fio_request {
83 	struct io_u		*io;
84 	/** Offset in current iovec, fio only uses 1 vector */
85 	uint32_t		iov_offset;
86 
87 	/** Amount of data used for Bit Bucket SGL */
88 	uint32_t		bit_bucket_data_len;
89 
90 	/** Context for NVMe PI */
91 	struct spdk_dif_ctx	dif_ctx;
92 	/** Separate metadata buffer pointer */
93 	void			*md_buf;
94 
95 	/** Dataset management range information */
96 	struct spdk_nvme_dsm_range *dsm_range;
97 
98 	struct spdk_fio_thread	*fio_thread;
99 	struct spdk_fio_qpair	*fio_qpair;
100 };
101 
102 struct spdk_fio_ctrlr {
103 	struct spdk_nvme_transport_id	tr_id;
104 	struct spdk_nvme_ctrlr_opts	opts;
105 	struct spdk_nvme_ctrlr		*ctrlr;
106 	TAILQ_ENTRY(spdk_fio_ctrlr)	link;
107 };
108 
109 static TAILQ_HEAD(, spdk_fio_ctrlr) g_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ctrlrs);
110 static int g_td_count;
111 static pthread_t g_ctrlr_thread_id = 0;
112 static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
113 static bool g_error;
114 
115 struct spdk_fio_qpair {
116 	struct fio_file			*f;
117 	struct spdk_nvme_qpair		*qpair;
118 	struct spdk_nvme_ns		*ns;
119 	uint32_t			io_flags;
120 	bool				zone_append_enabled;
121 	bool				nvme_pi_enabled;
122 	/* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
123 	bool				extended_lba;
124 	/* True for protection info transferred at start of metadata,
125 	 * false for protection info transferred at end of metadata, and
126 	 * this is valid only if nvme_pi_enabled is true.
127 	 */
128 	bool				md_start;
129 	TAILQ_ENTRY(spdk_fio_qpair)	link;
130 	struct spdk_fio_ctrlr		*fio_ctrlr;
131 };
132 
133 struct spdk_fio_thread {
134 	struct thread_data		*td;
135 
136 	TAILQ_HEAD(, spdk_fio_qpair)	fio_qpair;
137 	struct spdk_fio_qpair		*fio_qpair_current;	/* the current fio_qpair to be handled. */
138 
139 	struct io_u			**iocq;		/* io completion queue */
140 	unsigned int			iocq_count;	/* number of iocq entries filled by last getevents */
141 	unsigned int			iocq_size;	/* number of iocq entries allocated */
142 
143 };
144 
145 struct spdk_fio_probe_ctx {
146 	struct thread_data	*td;
147 	char			hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
148 	struct fio_file		*f; /* fio_file given by user */
149 };
150 
151 static void *
152 spdk_fio_poll_ctrlrs(void *arg)
153 {
154 	struct spdk_fio_ctrlr *fio_ctrlr;
155 	int oldstate;
156 	int rc;
157 
158 	/* Loop until the thread is cancelled */
159 	while (true) {
160 		rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
161 		if (rc != 0) {
162 			SPDK_ERRLOG("Unable to set cancel state disabled on g_init_thread (%d): %s\n",
163 				    rc, spdk_strerror(rc));
164 		}
165 
166 		pthread_mutex_lock(&g_mutex);
167 
168 		TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
169 			spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
170 		}
171 
172 		pthread_mutex_unlock(&g_mutex);
173 
174 		rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
175 		if (rc != 0) {
176 			SPDK_ERRLOG("Unable to set cancel state enabled on g_init_thread (%d): %s\n",
177 				    rc, spdk_strerror(rc));
178 		}
179 
180 		/* This is a pthread cancellation point and cannot be removed. */
181 		sleep(1);
182 	}
183 
184 	return NULL;
185 }
186 
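/*
 * Probe callback, invoked once per discovered controller before attaching.
 * It applies the per-connection options (hostnqn, WRR arbitration weights,
 * TCP header/data digests) to the controller options; returning true
 * proceeds with the attach.
 */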
187 static bool
188 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
189 	 struct spdk_nvme_ctrlr_opts *opts)
190 {
191 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
192 	struct thread_data *td = ctx->td;
193 	struct spdk_fio_options *fio_options = td->eo;
194 
195 	if (ctx->hostnqn[0] != '\0') {
196 		memcpy(opts->hostnqn, ctx->hostnqn, sizeof(opts->hostnqn));
197 	} else if (fio_options->hostnqn) {
198 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
199 	}
200 
201 	if (fio_options->enable_wrr) {
202 		opts->arb_mechanism		= SPDK_NVME_CC_AMS_WRR;
203 		opts->arbitration_burst		= fio_options->arbitration_burst;
204 		opts->low_priority_weight	= fio_options->low_weight;
205 		opts->medium_priority_weight	= fio_options->medium_weight;
206 		opts->high_priority_weight	= fio_options->high_weight;
207 	}
208 
209 	if (fio_options->digest_enable) {
210 		if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
211 			opts->header_digest = true;
212 		} else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
213 			opts->data_digest = true;
214 		} else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
215 			opts->header_digest = true;
216 			opts->data_digest = true;
217 		}
218 	}
219 
220 	return true;
221 }
222 
223 static struct spdk_fio_ctrlr *
224 get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
225 {
226 	struct spdk_fio_ctrlr	*fio_ctrlr;
227 
228 	TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
229 		if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
230 			return fio_ctrlr;
231 		}
232 	}
233 
234 	return NULL;
235 }
236 
237 /**
238  * Returns the fio_qpair that matches the given fio_file and has an associated ns
239  */
240 static struct spdk_fio_qpair *
241 get_fio_qpair(struct spdk_fio_thread *fio_thread, struct fio_file *f)
242 {
243 	struct spdk_fio_qpair	*fio_qpair;
244 
245 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
246 		if ((fio_qpair->f == f) && fio_qpair->ns) {
247 			return fio_qpair;
248 		}
249 	}
250 
251 	return NULL;
252 }
253 
254 #if FIO_HAS_ZBD
255 /**
256  * Completion callback: sets the given completion indicator to 1 on success or -1 on error
257  */
258 static void
259 pcu_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
260 {
261 	int *completed = ctx;
262 
263 	*completed = spdk_nvme_cpl_is_error(cpl) ? -1 : 1;
264 }
265 
266 /**
267  * Process Completions Until the given 'completed' indicator turns non-zero or an error occurs
268  */
269 static int32_t
270 pcu(struct spdk_nvme_qpair *qpair, int *completed)
271 {
272 	int32_t ret;
273 
274 	while (!*completed) {
275 		ret = spdk_nvme_qpair_process_completions(qpair, 1);
276 		if (ret < 0) {
277 			log_err("spdk/nvme: process_compl(): ret: %d\n", ret);
278 			return ret;
279 		}
280 	}
281 
282 	return 0;
283 }
284 #endif
285 
286 static inline uint32_t
287 _nvme_get_host_buffer_sector_size(struct spdk_nvme_ns *ns, uint32_t io_flags)
288 {
289 	bool md_excluded_from_xfer = false;
290 	uint32_t md_size;
291 	uint32_t ns_flags;
292 
293 	ns_flags = spdk_nvme_ns_get_flags(ns);
294 	md_size = spdk_nvme_ns_get_md_size(ns);
295 
296 	/* For the extended LBA format, if the metadata size is 8 bytes and PRACT is
297 	 * enabled (the controller inserts/strips PI), the metadata size should be
298 	 * subtracted from the block size: e.g. a 512 + 8 format transfers 512 bytes per block, not 520.
299 	 */
300 	md_excluded_from_xfer = ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
301 				 (ns_flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
302 				 (ns_flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
303 				 (md_size == 8));
304 
305 	return md_excluded_from_xfer ? spdk_nvme_ns_get_sector_size(ns) :
306 	       spdk_nvme_ns_get_extended_sector_size(ns);
307 }
308 
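/*
 * Attach callback, invoked once per attached controller. It resolves the
 * namespace from the trailing 'ns=' in the file name (defaulting to the
 * first active namespace), registers the controller in g_ctrlrs if it is
 * new, and creates the per-file fio_qpair bookkeeping, validating block
 * sizes and setting up PI, zone append and the reported file size.
 */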
309 static void
310 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
311 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
312 {
313 	struct spdk_fio_probe_ctx *ctx = cb_ctx;
314 	struct thread_data	*td = ctx->td;
315 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
316 	struct spdk_fio_ctrlr	*fio_ctrlr;
317 	struct spdk_fio_qpair	*fio_qpair;
318 	struct spdk_nvme_ns	*ns;
319 	const struct spdk_nvme_ns_data	*nsdata;
320 	struct fio_file		*f = ctx->f;
321 	uint32_t		ns_id;
322 	char			*p;
323 	long int		tmp;
324 	uint32_t		block_size;
325 	struct spdk_fio_options *fio_options = td->eo;
326 
327 	p = strstr(f->file_name, "ns=");
328 	if (p != NULL) {
329 		tmp = spdk_strtol(p + 3, 10);
330 		if (tmp <= 0) {
331 			SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp);
332 			g_error = true;
333 			return;
334 		}
335 		ns_id = (uint32_t)tmp;
336 	} else {
337 		ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
338 		if (ns_id == 0) {
339 			/* The ctrlr has no active namespaces and we didn't specify any so nothing to do. */
340 			return;
341 		}
342 	}
343 
344 	pthread_mutex_lock(&g_mutex);
345 	fio_ctrlr = get_fio_ctrlr(trid);
346 	/* it is a new ctrlr and needs to be added */
347 	if (!fio_ctrlr) {
348 		/* Create an fio_ctrlr and add it to the list */
349 		fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
350 		if (!fio_ctrlr) {
351 			SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
352 			g_error = true;
353 			pthread_mutex_unlock(&g_mutex);
354 			return;
355 		}
356 		fio_ctrlr->opts = *opts;
357 		fio_ctrlr->ctrlr = ctrlr;
358 		fio_ctrlr->tr_id = *trid;
359 		TAILQ_INSERT_TAIL(&g_ctrlrs, fio_ctrlr, link);
360 	}
361 	pthread_mutex_unlock(&g_mutex);
362 
363 	ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
364 	if (ns == NULL) {
365 		SPDK_ERRLOG("Cannot get namespace by ns_id=%d\n", ns_id);
366 		g_error = true;
367 		return;
368 	}
369 
370 	if (!spdk_nvme_ns_is_active(ns)) {
371 		SPDK_ERRLOG("Inactive namespace by ns_id=%d\n", ns_id);
372 		g_error = true;
373 		return;
374 	}
375 	nsdata = spdk_nvme_ns_get_data(ns);
376 
377 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
378 		if ((fio_qpair->f == f) ||
379 		    ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
380 		     (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
381 			/* Not an error case; avoid a duplicate connection. */
382 			return;
383 		}
384 	}
385 
386 	/* create a new qpair */
387 	fio_qpair = calloc(1, sizeof(*fio_qpair));
388 	if (!fio_qpair) {
389 		g_error = true;
390 		SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
391 		return;
392 	}
393 
394 	f->engine_data = fio_qpair;
395 	fio_qpair->ns = ns;
396 	fio_qpair->f = f;
397 	fio_qpair->fio_ctrlr = fio_ctrlr;
398 	TAILQ_INSERT_TAIL(&fio_thread->fio_qpair, fio_qpair, link);
399 
400 	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
401 		assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
402 		fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
403 		fio_qpair->nvme_pi_enabled = true;
404 		fio_qpair->md_start = nsdata->dps.md_start;
405 		fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
406 		fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
407 			fio_qpair->extended_lba ? "extended lba" : "separate metadata");
408 	}
409 
410 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
411 	for_each_rw_ddir(ddir) {
412 		if (td->o.min_bs[ddir] % block_size != 0 || td->o.max_bs[ddir] % block_size != 0) {
413 			if (spdk_nvme_ns_supports_extended_lba(ns)) {
414 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of (LBA data size + Metadata size)\n");
415 			} else {
416 				SPDK_ERRLOG("--bs or other block size related option has to be a multiple of LBA data size\n");
417 			}
418 			g_error = true;
419 			return;
420 		}
421 	}
422 
423 	if (fio_options->zone_append && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
424 		if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED) {
425 			SPDK_DEBUGLOG(fio_nvme, "Using zone appends instead of writes on: '%s'\n",
426 				      f->file_name);
427 			fio_qpair->zone_append_enabled = true;
428 		} else {
429 			SPDK_WARNLOG("Falling back to writes on: '%s' - ns lacks zone append cmd\n",
430 				     f->file_name);
431 		}
432 	}
433 
434 #if FIO_HAS_ZBD
435 	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD) {
436 		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
437 	}
438 #endif
439 
440 	if (fio_options->initial_zone_reset == 1 && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
441 #if FIO_HAS_ZBD
442 		struct spdk_nvme_qpair *tmp_qpair;
443 		int completed = 0, err;
444 
445 		/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
446 		 * Create a temporary qpair in order to perform the initial zone reset.
447 		 */
448 		assert(!fio_qpair->qpair);
449 
450 		tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
451 		if (!tmp_qpair) {
452 			SPDK_ERRLOG("Cannot allocate a temporary qpair\n");
453 			g_error = true;
454 			return;
455 		}
456 
457 		err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
458 		if (err || pcu(tmp_qpair, &completed) || completed < 0) {
459 			log_err("spdk/nvme: warn: initial_zone_reset: err: %d, cpl: %d\n",
460 				err, completed);
461 		}
462 
463 		spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
464 #else
465 		log_err("spdk/nvme: ZBD/ZNS is not supported\n");
466 #endif
467 	}
468 
469 	f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
470 	if (f->real_file_size <= 0) {
471 		g_error = true;
472 		SPDK_ERRLOG("Cannot get namespace size by ns=%p\n", ns);
473 		return;
474 	}
475 
476 	f->filetype = FIO_TYPE_BLOCK;
477 	fio_file_set_size_known(f);
478 }
479 
480 static void
481 parse_prchk_flags(const char *prchk_str)
482 {
483 	if (!prchk_str) {
484 		return;
485 	}
486 
487 	if (strstr(prchk_str, "GUARD") != NULL) {
488 		g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
489 	}
490 	if (strstr(prchk_str, "REFTAG") != NULL) {
491 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
492 	}
493 	if (strstr(prchk_str, "APPTAG") != NULL) {
494 		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
495 	}
496 }
497 
498 static void
499 parse_pract_flag(int pract)
500 {
501 	if (pract == 1) {
502 		g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
503 	} else {
504 		g_spdk_pract_flag = 0;
505 	}
506 }
507 
508 static bool
509 fio_redirected_to_dev_null(void)
510 {
511 	char path[PATH_MAX] = "";
512 	ssize_t ret;
513 
514 	ret = readlink("/proc/self/fd/1", path, sizeof(path));
515 
516 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
517 		return false;
518 	}
519 
520 	ret = readlink("/proc/self/fd/2", path, sizeof(path));
521 
522 	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
523 		return false;
524 	}
525 
526 	return true;
527 }
528 
529 static int
530 spdk_fio_init(struct thread_data *td)
531 {
532 	int ret = 0;
533 	struct spdk_fio_options *fio_options = td->eo;
534 
535 	if (fio_options->spdk_tracing) {
536 		ret = spdk_trace_register_user_thread();
537 	}
538 
539 	return ret;
540 }
541 
542 /* Called once at initialization. This is responsible for gathering the size of
543  * each "file", which in our case are in the form
544  * 'key=value [key=value] ... ns=value'
545  * For example, For local PCIe NVMe device  - 'trtype=PCIe traddr=0000.04.00.0 ns=1'
546  * For remote exported by NVMe-oF target, 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1' */
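/* A minimal, illustrative job file for this engine (the PCI address and the
 * section name are placeholders; note the '.' separators in traddr, since
 * ':' is special in fio file names):
 *
 *   [global]
 *   ioengine=spdk
 *   thread=1
 *
 *   [job0]
 *   filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *   rw=randread
 *   bs=4k
 *   iodepth=32
 */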
547 static int
548 spdk_fio_setup(struct thread_data *td)
549 {
550 	struct spdk_fio_thread *fio_thread;
551 	struct spdk_fio_options *fio_options = td->eo;
552 	struct spdk_fio_probe_ctx ctx;
553 	struct spdk_env_opts opts;
554 	struct fio_file *f;
555 	char *p;
556 	int rc = 0;
557 	struct spdk_nvme_transport_id trid;
558 	struct spdk_fio_ctrlr *fio_ctrlr;
559 	char *trid_info;
560 	unsigned int i;
561 	size_t size;
562 
563 	/*
564 	 * If we're running in a daemonized FIO instance, it's possible
565 	 * fd 1/2 were re-used for something important by FIO. Newer fio
566 	 * versions are careful to redirect those to /dev/null, but if they're
567 	 * not, we abort early so we don't accidentally write messages to
568 	 * an important file, etc.
569 	 */
570 	if (is_backend && !fio_redirected_to_dev_null()) {
571 		char buf[1024];
572 		snprintf(buf, sizeof(buf),
573 			 "SPDK FIO plugin is in daemon mode, but stdout/stderr "
574 			 "aren't redirected to /dev/null. Aborting.");
575 		fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
576 		return -1;
577 	}
578 
579 	if (!td->o.use_thread) {
580 		log_err("spdk: must set thread=1 when using spdk plugin\n");
581 		return 1;
582 	}
583 
584 	if (g_log_flag_error) {
585 		/* The first thread found an error when parsing log flags, so
586 		 * just return error immediately for all of the other threads.
587 		 */
588 		return 1;
589 	}
590 
591 	pthread_mutex_lock(&g_mutex);
592 
593 	fio_thread = calloc(1, sizeof(*fio_thread));
594 	assert(fio_thread != NULL);
595 
596 	td->io_ops_data = fio_thread;
597 	fio_thread->td = td;
598 
599 	fio_thread->iocq_size = td->o.iodepth;
600 	fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
601 	assert(fio_thread->iocq != NULL);
602 
603 	TAILQ_INIT(&fio_thread->fio_qpair);
604 
605 	if (!g_spdk_env_initialized) {
606 		opts.opts_size = sizeof(opts);
607 		spdk_env_opts_init(&opts);
608 		opts.name = "fio";
609 		opts.mem_size = fio_options->mem_size;
610 		opts.shm_id = fio_options->shm_id;
611 		g_spdk_enable_sgl = fio_options->enable_sgl;
612 		g_spdk_sge_size = fio_options->sge_size;
613 		g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
614 		parse_pract_flag(fio_options->pi_act);
615 		g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
616 		g_spdk_apptag = (uint16_t)fio_options->apptag;
617 		g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
618 		parse_prchk_flags(fio_options->pi_chk);
619 		if (spdk_env_init(&opts) < 0) {
620 			SPDK_ERRLOG("Unable to initialize SPDK env\n");
621 			free(fio_thread->iocq);
622 			free(fio_thread);
623 			fio_thread = NULL;
624 			pthread_mutex_unlock(&g_mutex);
625 			return 1;
626 		}
627 
628 		if (fio_options->log_flags) {
629 			char *tok = strtok(fio_options->log_flags, ",");
630 			do {
631 				rc = spdk_log_set_flag(tok);
632 				if (rc < 0) {
633 					SPDK_ERRLOG("unknown log flag %s\n", tok);
634 					g_log_flag_error = true;
635 					return 1;
636 				}
637 			} while ((tok = strtok(NULL, ",")) != NULL);
638 #ifdef DEBUG
639 			spdk_log_set_print_level(SPDK_LOG_DEBUG);
640 #endif
641 		}
642 
643 		g_spdk_env_initialized = true;
644 		spdk_unaffinitize_thread();
645 
646 		if (fio_options->spdk_tracing) {
647 			spdk_trace_init("spdk_fio_tracepoints", 65536, td->o.numjobs);
648 			spdk_trace_enable_tpoint_group("nvme_pcie");
649 			spdk_trace_enable_tpoint_group("nvme_tcp");
650 		}
651 
652 		/* Spawn a thread to continue polling the controllers */
653 		rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
654 		if (rc != 0) {
655 			SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
656 		}
657 
658 		if (fio_options->enable_vmd && spdk_vmd_init()) {
659 			SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices can be unavailable.\n");
660 		}
661 	}
662 	pthread_mutex_unlock(&g_mutex);
663 
664 	for_each_file(td, f, i) {
665 		memset(&trid, 0, sizeof(trid));
666 		memset(&ctx, 0, sizeof(ctx));
667 
668 		trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
669 
670 		p = strstr(f->file_name, " ns=");
671 		if (p != NULL) {
672 			trid_info = strndup(f->file_name, p - f->file_name);
673 		} else {
674 			trid_info = strndup(f->file_name, strlen(f->file_name));
675 		}
676 
677 		if (!trid_info) {
678 			SPDK_ERRLOG("Failed to allocate space for trid_info\n");
679 			continue;
680 		}
681 
682 		rc = spdk_nvme_transport_id_parse(&trid, trid_info);
683 		if (rc < 0) {
684 			SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
685 			free(trid_info);
686 			continue;
687 		}
688 		free(trid_info);
689 
690 		if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
691 			struct spdk_pci_addr pci_addr;
692 			if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
693 				SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
694 				continue;
695 			}
696 			spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
697 		} else {
698 			if (trid.subnqn[0] == '\0') {
699 				snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
700 					 SPDK_NVMF_DISCOVERY_NQN);
701 			}
702 			if ((p = strcasestr(f->file_name, "hostnqn:")) ||
703 			    (p = strcasestr(f->file_name, "hostnqn="))) {
704 				p += strlen("hostnqn:");
705 				size = strcspn(p, " \t\n");
706 				if (size >= sizeof(ctx.hostnqn)) { /* leave room for the NUL terminator */
707 					SPDK_ERRLOG("Invalid hostnqn: too long\n");
708 					continue;
709 				}
710 				memcpy(ctx.hostnqn, p, size);
711 			}
712 		}
713 
714 		ctx.td = td;
715 		ctx.f = f;
716 
717 		pthread_mutex_lock(&g_mutex);
718 		fio_ctrlr = get_fio_ctrlr(&trid);
719 		pthread_mutex_unlock(&g_mutex);
720 		if (fio_ctrlr) {
721 			attach_cb(&ctx, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
722 		} else {
723 			/* Enumerate all of the controllers */
724 			if (spdk_nvme_probe(&trid, &ctx, probe_cb, attach_cb, NULL) != 0) {
725 				SPDK_ERRLOG("spdk_nvme_probe() failed\n");
726 				continue;
727 			}
728 		}
729 
730 		if (g_error) {
731 			log_err("Failed to initialize spdk fio plugin\n");
732 			rc = 1;
733 			break;
734 		}
735 	}
736 
737 	pthread_mutex_lock(&g_mutex);
738 	g_td_count++;
739 	pthread_mutex_unlock(&g_mutex);
740 
741 	return rc;
742 }
743 
744 static int
745 spdk_fio_open(struct thread_data *td, struct fio_file *f)
746 {
747 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
748 	struct spdk_fio_ctrlr *fio_ctrlr = fio_qpair->fio_ctrlr;
749 	struct spdk_fio_options *fio_options = td->eo;
750 	struct spdk_nvme_io_qpair_opts	qpopts;
751 
752 	assert(fio_qpair->qpair == NULL);
753 	spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
754 	qpopts.delay_cmd_submit = true;
755 	if (fio_options->enable_wrr) {
756 		qpopts.qprio = fio_options->wrr_priority;
757 	}
758 	qpopts.disable_pcie_sgl_merge = fio_options->disable_pcie_sgl_merge;
759 
760 	fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
761 	if (!fio_qpair->qpair) {
762 		SPDK_ERRLOG("Cannot allocate nvme io_qpair any more\n");
763 		g_error = true;
764 		free(fio_qpair);
765 		return -1;
766 	}
767 
768 	if (fio_options->print_qid_mappings == 1) {
769 		log_info("job %s: %s qid %d\n", td->o.name, f->file_name,
770 			 spdk_nvme_qpair_get_id(fio_qpair->qpair));
771 	}
772 
773 	return 0;
774 }
775 
776 static int
777 spdk_fio_close(struct thread_data *td, struct fio_file *f)
778 {
779 	struct spdk_fio_qpair *fio_qpair = f->engine_data;
780 
781 	assert(fio_qpair->qpair != NULL);
782 	spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
783 	fio_qpair->qpair = NULL;
784 	return 0;
785 }
786 
787 static int
788 spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
789 {
790 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
791 	struct spdk_fio_qpair	*fio_qpair;
792 	struct spdk_nvme_ctrlr	*ctrlr;
793 	int32_t numa_id = SPDK_ENV_NUMA_ID_ANY, tmp_numa_id;
794 
795 	/* If all ctrlrs used by this fio_thread have the same numa
796 	 * id, allocate from that one. If they come from different numa
797 	 * ids, then don't try to optimize and just use SPDK_ENV_NUMA_ID_ANY.
798 	 */
799 	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
800 		ctrlr = fio_qpair->fio_ctrlr->ctrlr;
801 		tmp_numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
802 		if (numa_id == SPDK_ENV_NUMA_ID_ANY) {
803 			numa_id = tmp_numa_id;
804 		} else if (tmp_numa_id != numa_id &&
805 			   tmp_numa_id != SPDK_ENV_NUMA_ID_ANY) {
806 			numa_id = SPDK_ENV_NUMA_ID_ANY;
807 			break;
808 		}
809 	}
810 
811 	td->orig_buffer = spdk_dma_zmalloc_socket(total_mem, NVME_IO_ALIGN, NULL, numa_id);
812 	return td->orig_buffer == NULL;
813 }
814 
815 static void
816 spdk_fio_iomem_free(struct thread_data *td)
817 {
818 	spdk_dma_free(td->orig_buffer);
819 }
820 
821 static int
822 spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
823 {
824 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
825 	struct spdk_fio_request	*fio_req;
826 	uint32_t dsm_size;
827 
828 	io_u->engine_data = NULL;
829 
830 	fio_req = calloc(1, sizeof(*fio_req));
831 	if (fio_req == NULL) {
832 		return 1;
833 	}
834 
835 	if (!(td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)) {
836 #if FIO_HAS_MRT
837 		/* By default the number of ranges is set to 1 */
838 		dsm_size = td->o.num_range * sizeof(struct spdk_nvme_dsm_range);
839 #else
840 		dsm_size = sizeof(struct spdk_nvme_dsm_range);
841 #endif
842 		fio_req->dsm_range = calloc(1, dsm_size);
843 		if (fio_req->dsm_range == NULL) {
844 			free(fio_req);
845 			return 1;
846 		}
847 	}
848 
849 	fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
850 	if (fio_req->md_buf == NULL) {
851 		fprintf(stderr, "Allocate %u metadata failed\n", g_spdk_md_per_io_size);
852 		free(fio_req->dsm_range);
853 		free(fio_req);
854 		return 1;
855 	}
856 
857 	fio_req->io = io_u;
858 	fio_req->fio_thread = fio_thread;
859 
860 	io_u->engine_data = fio_req;
861 
862 	return 0;
863 }
864 
865 static void
866 spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
867 {
868 	struct spdk_fio_request *fio_req = io_u->engine_data;
869 
870 	if (fio_req) {
871 		assert(fio_req->io == io_u);
872 		spdk_dma_free(fio_req->md_buf);
873 		free(fio_req->dsm_range);
874 		free(fio_req);
875 		io_u->engine_data = NULL;
876 	}
877 }
878 
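/* Convert a byte offset to the starting LBA of its zone. Worked example
 * (assumed geometry): with a 512 MiB zone (spdk_nvme_zns_ns_get_zone_size()
 * == 536870912) and 4 KiB sectors (zone_size_sectors == 131072), a 1 GiB
 * offset yields (1073741824 / 536870912) * 131072 = 262144. */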
879 static inline uint64_t
880 fio_offset_to_zslba(unsigned long long offset, struct spdk_nvme_ns *ns)
881 {
882 	return (offset / spdk_nvme_zns_ns_get_zone_size(ns)) * spdk_nvme_zns_ns_get_zone_size_sectors(ns);
883 }
884 
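/* Prepare end-to-end protection for an extended-LBA (DIF) namespace: build
 * the DIF context for this I/O and, for writes, generate the protection
 * information in place in the data buffer before submission. */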
885 static int
886 fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
887 {
888 	struct spdk_nvme_ns *ns = fio_qpair->ns;
889 	struct spdk_fio_request *fio_req = io_u->engine_data;
890 	uint32_t md_size, extended_lba_size, lba_count;
891 	uint64_t lba;
892 	struct iovec iov;
893 	int rc;
894 	struct spdk_dif_ctx_init_ext_opts dif_opts;
895 
896 	/* Set appmask and apptag when PRACT is enabled */
897 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
898 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
899 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
900 		return 0;
901 	}
902 
903 	extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
904 	md_size = spdk_nvme_ns_get_md_size(ns);
905 	lba = io_u->offset / extended_lba_size;
906 	lba_count = io_u->xfer_buflen / extended_lba_size;
907 
908 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
909 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
910 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
911 			       true, fio_qpair->md_start,
912 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
913 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
914 			       0, 0, &dif_opts);
915 	if (rc != 0) {
916 		fprintf(stderr, "Initialization of DIF context failed\n");
917 		return rc;
918 	}
919 
920 	if (io_u->ddir != DDIR_WRITE) {
921 		return 0;
922 	}
923 
924 	iov.iov_base = io_u->buf;
925 	iov.iov_len = io_u->xfer_buflen;
926 	rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
927 	if (rc != 0) {
928 		fprintf(stderr, "Generation of DIF failed\n");
929 	}
930 
931 	return rc;
932 }
933 
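/* Prepare end-to-end protection for a separate-metadata (DIX) namespace:
 * build the DIF context and, for writes, generate the protection
 * information into the separate metadata buffer before submission. */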
934 static int
935 fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
936 {
937 	struct spdk_nvme_ns *ns = fio_qpair->ns;
938 	struct spdk_fio_request *fio_req = io_u->engine_data;
939 	uint32_t md_size, block_size, lba_count;
940 	uint64_t lba;
941 	struct iovec iov, md_iov;
942 	int rc;
943 	struct spdk_dif_ctx_init_ext_opts dif_opts;
944 
945 	/* Set appmask and apptag when PRACT is enabled */
946 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
947 		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
948 		fio_req->dif_ctx.app_tag = g_spdk_apptag;
949 		return 0;
950 	}
951 
952 	block_size = spdk_nvme_ns_get_sector_size(ns);
953 	md_size = spdk_nvme_ns_get_md_size(ns);
954 	lba = io_u->offset / block_size;
955 	lba_count = io_u->xfer_buflen / block_size;
956 
957 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
958 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
959 	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
960 			       false, fio_qpair->md_start,
961 			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
962 			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
963 			       0, 0, &dif_opts);
964 	if (rc != 0) {
965 		fprintf(stderr, "Initialization of DIF context failed\n");
966 		return rc;
967 	}
968 
969 	if (io_u->ddir != DDIR_WRITE) {
970 		return 0;
971 	}
972 
973 	iov.iov_base = io_u->buf;
974 	iov.iov_len = io_u->xfer_buflen;
975 	md_iov.iov_base = fio_req->md_buf;
976 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
977 	rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
978 	if (rc < 0) {
979 		fprintf(stderr, "Generation of DIX failed\n");
980 	}
981 
982 	return rc;
983 }
984 
985 static int
986 fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
987 {
988 	struct spdk_nvme_ns *ns = fio_qpair->ns;
989 	struct spdk_fio_request *fio_req = io_u->engine_data;
990 	uint32_t lba_count;
991 	struct iovec iov;
992 	struct spdk_dif_error err_blk = {};
993 	int rc;
994 
995 	/* Do nothing when PRACT is enabled */
996 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
997 		return 0;
998 	}
999 
1000 	iov.iov_base = io_u->buf;
1001 	iov.iov_len = io_u->xfer_buflen;
1002 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);
1003 
1004 	rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
1005 	if (rc != 0) {
1006 		fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
1007 			err_blk.err_type, err_blk.err_offset);
1008 	}
1009 
1010 	return rc;
1011 }
1012 
1013 static int
1014 fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
1015 {
1016 	struct spdk_nvme_ns *ns = fio_qpair->ns;
1017 	struct spdk_fio_request *fio_req = io_u->engine_data;
1018 	uint32_t md_size, lba_count;
1019 	struct iovec iov, md_iov;
1020 	struct spdk_dif_error err_blk = {};
1021 	int rc;
1022 
1023 	/* Do nothing when PRACT is enabled */
1024 	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
1025 		return 0;
1026 	}
1027 
1028 	iov.iov_base = io_u->buf;
1029 	iov.iov_len = io_u->xfer_buflen;
1030 	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
1031 	md_size = spdk_nvme_ns_get_md_size(ns);
1032 	md_iov.iov_base = fio_req->md_buf;
1033 	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
1034 
1035 	rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
1036 	if (rc != 0) {
1037 		fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
1038 			err_blk.err_type, err_blk.err_offset);
1039 	}
1040 
1041 	return rc;
1042 }
1043 
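/* Per-I/O completion callback: verifies protection information on reads
 * (when PI is enabled and PRACT is not), maps NVMe completion errors to
 * EIO, and queues the io_u for retrieval via spdk_fio_event(). */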
1044 static void
1045 spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
1046 {
1047 	struct spdk_fio_request		*fio_req = ctx;
1048 	struct spdk_fio_thread		*fio_thread = fio_req->fio_thread;
1049 	struct spdk_fio_qpair		*fio_qpair = fio_req->fio_qpair;
1050 	int				rc;
1051 
1052 	if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
1053 		if (fio_qpair->extended_lba) {
1054 			rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
1055 		} else {
1056 			rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
1057 		}
1058 		if (rc != 0) {
1059 			fio_req->io->error = abs(rc);
1060 		}
1061 	}
1062 
1063 	if (spdk_nvme_cpl_is_error(cpl)) {
1064 		fio_req->io->error = EIO;
1065 	}
1066 
1067 	assert(fio_thread->iocq_count < fio_thread->iocq_size);
1068 	fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
1069 }
1070 
1071 static void
1072 spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
1073 {
1074 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1075 
1076 	fio_req->iov_offset = sgl_offset;
1077 	fio_req->bit_bucket_data_len = 0;
1078 }
1079 
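/* SGL "next SGE" callback: describes the fio buffer to the driver in
 * segments of at most g_spdk_sge_size bytes. For reads, the first
 * g_spdk_bit_bucket_data_len bytes are presented as a Bit Bucket SGE
 * (address UINT64_MAX), telling the controller to discard that data. */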
1080 static int
1081 spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
1082 {
1083 	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
1084 	struct io_u *io_u = fio_req->io;
1085 	uint32_t iov_len;
1086 	uint32_t bit_bucket_len;
1087 
1088 	*address = io_u->buf;
1089 
1090 	if (fio_req->iov_offset) {
1091 		assert(fio_req->iov_offset <= io_u->xfer_buflen);
1092 		*address += fio_req->iov_offset;
1093 	}
1094 
1095 	iov_len = io_u->xfer_buflen - fio_req->iov_offset;
1096 	if (iov_len > g_spdk_sge_size) {
1097 		iov_len = g_spdk_sge_size;
1098 	}
1099 
1100 	if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
1101 		assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
1102 		*address = (void *)UINT64_MAX;
1103 		bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
1104 		if (iov_len > bit_bucket_len) {
1105 			iov_len = bit_bucket_len;
1106 		}
1107 		fio_req->bit_bucket_data_len += iov_len;
1108 	}
1109 
1110 	fio_req->iov_offset += iov_len;
1111 	*length = iov_len;
1112 
1113 	return 0;
1114 }
1115 
1116 #if FIO_IOOPS_VERSION >= 24
1117 typedef enum fio_q_status fio_q_status_t;
1118 #else
1119 typedef int fio_q_status_t;
1120 #endif
1121 
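/* Submit a single io_u. Returns FIO_Q_QUEUED when the command was submitted
 * asynchronously, FIO_Q_COMPLETED when it was handled inline (including the
 * error paths, which set io_u->error), and FIO_Q_BUSY when the qpair has no
 * free requests (-ENOMEM from the submission call). */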
1122 static fio_q_status_t
1123 spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
1124 {
1125 	int rc = 1;
1126 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1127 	struct spdk_fio_request	*fio_req = io_u->engine_data;
1128 	struct spdk_fio_qpair	*fio_qpair;
1129 	struct spdk_nvme_ns	*ns = NULL;
1130 	void			*md_buf = NULL;
1131 	struct spdk_dif_ctx	*dif_ctx = &fio_req->dif_ctx;
1132 #if FIO_HAS_FDP
1133 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
1134 #endif
1135 	struct spdk_nvme_dsm_range *range;
1136 	uint32_t		block_size;
1137 	uint64_t		lba;
1138 	uint32_t		lba_count;
1139 	uint32_t		num_range;
1140 
1141 	fio_qpair = get_fio_qpair(fio_thread, io_u->file);
1142 	if (fio_qpair == NULL) {
1143 		return -ENXIO;
1144 	}
1145 	ns = fio_qpair->ns;
1146 
1147 	if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
1148 		md_buf = fio_req->md_buf;
1149 	}
1150 	fio_req->fio_qpair = fio_qpair;
1151 
1152 	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
1153 	lba = io_u->offset / block_size;
1154 	lba_count = io_u->xfer_buflen / block_size;
1155 
1156 #if FIO_HAS_FDP
1157 	/* Only SGL is supported for write commands with directives */
1158 	if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
1159 		log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
1160 		io_u->error = -EINVAL;
1161 		return FIO_Q_COMPLETED;
1162 	}
1163 #endif
1164 
1165 	/* TODO: consider situations where fio will randomize and verify io_u */
1166 	if (fio_qpair->nvme_pi_enabled) {
1167 		if (fio_qpair->extended_lba) {
1168 			rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
1169 		} else {
1170 			rc = fio_separate_md_setup_pi(fio_qpair, io_u);
1171 		}
1172 		if (rc < 0) {
1173 			io_u->error = -rc;
1174 			return FIO_Q_COMPLETED;
1175 		}
1176 	}
1177 
1178 	switch (io_u->ddir) {
1179 	case DDIR_READ:
1180 		if (!g_spdk_enable_sgl) {
1181 			rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
1182 							   spdk_fio_completion_cb, fio_req,
1183 							   fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1184 		} else {
1185 			rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
1186 							    lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1187 							    spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1188 							    dif_ctx->apptag_mask, dif_ctx->app_tag);
1189 		}
1190 		break;
1191 	case DDIR_WRITE:
1192 		if (!g_spdk_enable_sgl) {
1193 			if (!fio_qpair->zone_append_enabled) {
1194 				rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
1195 								    lba_count,
1196 								    spdk_fio_completion_cb, fio_req,
1197 								    fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1198 			} else {
1199 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1200 				rc = spdk_nvme_zns_zone_append_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, zslba,
1201 								       lba_count,
1202 								       spdk_fio_completion_cb, fio_req,
1203 								       fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
1204 			}
1205 		} else {
1206 			if (!fio_qpair->zone_append_enabled) {
1207 #if FIO_HAS_FDP
1208 				if (spdk_unlikely(io_u->dtype)) {
1209 					ext_opts.size = SPDK_SIZEOF(&ext_opts, cdw13);
1210 					ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
1211 					ext_opts.metadata = md_buf;
1212 					ext_opts.cdw13 = (io_u->dspec << 16);
1213 					ext_opts.apptag = dif_ctx->app_tag;
1214 					ext_opts.apptag_mask = dif_ctx->apptag_mask;
1215 					rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
1216 									 spdk_fio_completion_cb, fio_req,
1217 									 spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
1218 					break;
1219 				}
1220 #endif
1221 				rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
1222 								     lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1223 								     spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1224 								     dif_ctx->apptag_mask, dif_ctx->app_tag);
1225 			} else {
1226 				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
1227 				rc = spdk_nvme_zns_zone_appendv_with_md(ns, fio_qpair->qpair, zslba,
1228 									lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
1229 									spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
1230 									dif_ctx->apptag_mask, dif_ctx->app_tag);
1231 			}
1232 		}
1233 		break;
1234 	case DDIR_TRIM:
1235 		if (td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
1236 			do_io_u_trim(td, io_u);
1237 			io_u_mark_submit(td, 1);
1238 			io_u_mark_complete(td, 1);
1239 			return FIO_Q_COMPLETED;
1240 		}
1241 
1242 		range = fio_req->dsm_range;
1243 #if FIO_HAS_MRT
1244 		if (td->o.num_range == 1) {
1245 			range->attributes.raw = 0;
1246 			range->length = lba_count;
1247 			range->starting_lba = lba;
1248 			num_range = 1;
1249 		} else {
1250 			struct trim_range *tr = (struct trim_range *)io_u->xfer_buf;
1251 			for (uint32_t i = 0; i < io_u->number_trim; i++) {
1252 				range->attributes.raw = 0;
1253 				range->length = tr->len / block_size;
1254 				range->starting_lba = tr->start / block_size;
1255 				range++;
1256 				tr++;
1257 			}
1258 			num_range = io_u->number_trim;
1259 			range = fio_req->dsm_range;
1260 		}
1261 #else
1262 		range->attributes.raw = 0;
1263 		range->length = lba_count;
1264 		range->starting_lba = lba;
1265 		num_range = 1;
1266 #endif
1267 
1268 		rc = spdk_nvme_ns_cmd_dataset_management(ns, fio_qpair->qpair,
1269 				SPDK_NVME_DSM_ATTR_DEALLOCATE, range, num_range,
1270 				spdk_fio_completion_cb, fio_req);
1271 		break;
1272 	default:
1273 		assert(false);
1274 		break;
1275 	}
1276 
1277 	/* NVMe read/write functions return -ENOMEM if there are no free requests. */
1278 	if (rc == -ENOMEM) {
1279 		return FIO_Q_BUSY;
1280 	}
1281 
1282 	if (rc != 0) {
1283 		io_u->error = abs(rc);
1284 		return FIO_Q_COMPLETED;
1285 	}
1286 
1287 	return FIO_Q_QUEUED;
1288 }
1289 
1290 static struct io_u *
1291 spdk_fio_event(struct thread_data *td, int event)
1292 {
1293 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1294 
1295 	assert(event >= 0);
1296 	assert((unsigned)event < fio_thread->iocq_count);
1297 	return fio_thread->iocq[event];
1298 }
1299 
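/* Poll the thread's qpairs round-robin, starting after the qpair that
 * satisfied the previous call, until at least 'min' completions have been
 * gathered or the timeout 't' (if given) expires. */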
1300 static int
1301 spdk_fio_getevents(struct thread_data *td, unsigned int min,
1302 		   unsigned int max, const struct timespec *t)
1303 {
1304 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1305 	struct spdk_fio_qpair *fio_qpair = NULL;
1306 	struct timespec t0, t1;
1307 	uint64_t timeout = 0;
1308 
1309 	if (t) {
1310 		timeout = t->tv_sec * 1000000000L + t->tv_nsec;
1311 		clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
1312 	}
1313 
1314 	fio_thread->iocq_count = 0;
1315 
1316 	/* fetch the next qpair */
1317 	if (fio_thread->fio_qpair_current) {
1318 		fio_qpair = TAILQ_NEXT(fio_thread->fio_qpair_current, link);
1319 	}
1320 
1321 	for (;;) {
1322 		if (fio_qpair == NULL) {
1323 			fio_qpair = TAILQ_FIRST(&fio_thread->fio_qpair);
1324 		}
1325 
1326 		while (fio_qpair != NULL) {
1327 			/*
1328 			 * We can be called while spdk_fio_open()s are still
1329 			 * ongoing, in which case, ->qpair can still be NULL.
1330 			 */
1331 			if (fio_qpair->qpair == NULL) {
1332 				fio_qpair = TAILQ_NEXT(fio_qpair, link);
1333 				continue;
1334 			}
1335 
1336 			spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);
1337 
1338 			if (fio_thread->iocq_count >= min) {
1339 				/* reset the current handling qpair */
1340 				fio_thread->fio_qpair_current = fio_qpair;
1341 				return fio_thread->iocq_count;
1342 			}
1343 
1344 			fio_qpair = TAILQ_NEXT(fio_qpair, link);
1345 		}
1346 
1347 		if (t) {
1348 			uint64_t elapse;
1349 
1350 			clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
1351 			elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
1352 				 + t1.tv_nsec - t0.tv_nsec;
1353 			if (elapse > timeout) {
1354 				break;
1355 			}
1356 		}
1357 	}
1358 
1359 	/* reset the current handling qpair */
1360 	fio_thread->fio_qpair_current = fio_qpair;
1361 	return fio_thread->iocq_count;
1362 }
1363 
1364 static int
1365 spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
1366 {
1367 	/* TODO: This should probably send a flush to the device, but for now just return success. */
1368 	return 0;
1369 }
1370 
1371 #if FIO_HAS_ZBD
1372 static int
1373 spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model)
1374 {
1375 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1376 	struct spdk_fio_qpair *fio_qpair = NULL;
1377 	const struct spdk_nvme_zns_ns_data *zns_data = NULL;
1378 
1379 	if (f->filetype != FIO_TYPE_BLOCK) {
1380 		log_info("spdk/nvme: unsupported filetype: %d\n", f->filetype);
1381 		return -EINVAL;
1382 	}
1383 
1384 	fio_qpair = get_fio_qpair(fio_thread, f);
1385 	if (!fio_qpair) {
1386 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1387 		return -ENODEV;
1388 	}
1389 
1390 	switch (spdk_nvme_ns_get_csi(fio_qpair->ns)) {
1391 	case SPDK_NVME_CSI_NVM:
1392 		*model = ZBD_NONE;
1393 		return 0;
1394 
1395 	case SPDK_NVME_CSI_KV:
1396 		log_err("spdk/nvme: KV namespace is currently not supported\n");
1397 		return -ENOSYS;
1398 
1399 	case SPDK_NVME_CSI_ZNS:
1400 		zns_data = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1401 		if (!zns_data) {
1402 			log_err("spdk/nvme: file_name: '%s', ZNS is not enabled\n", f->file_name);
1403 			return -EINVAL;
1404 		}
1405 
1406 		*model = ZBD_HOST_MANAGED;
1407 
1408 		return 0;
1409 	}
1410 
1411 	return -EINVAL;
1412 }
1413 
1414 static int
1415 spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
1416 		      struct zbd_zone *zbdz, unsigned int nr_zones)
1417 {
1418 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1419 	struct spdk_fio_qpair *fio_qpair = NULL;
1420 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1421 	struct spdk_nvme_zns_zone_report *report;
1422 	struct spdk_nvme_qpair *tmp_qpair;
1423 	uint32_t report_nzones = 0, report_nzones_max, report_nbytes, mdts_nbytes;
1424 	uint64_t zsze_nbytes, ns_nzones, lba_nbytes;
1425 	int completed = 0, err;
1426 
1427 	fio_qpair = get_fio_qpair(fio_thread, f);
1428 	if (!fio_qpair) {
1429 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1430 		return -ENODEV;
1431 	}
1432 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1433 	if (!zns) {
1434 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1435 		return -EINVAL;
1436 	}
1437 
1438 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1439 	 * Create a temporary qpair in order to perform report zones.
1440 	 */
1441 	assert(!fio_qpair->qpair);
1442 
1443 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1444 	if (!tmp_qpair) {
1445 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1446 		return -EIO;
1447 	}
1448 
1449 	/** Retrieve device parameters */
1450 	mdts_nbytes = spdk_nvme_ns_get_max_io_xfer_size(fio_qpair->ns);
1451 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1452 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1453 	ns_nzones = spdk_nvme_zns_ns_get_num_zones(fio_qpair->ns);
1454 
1455 	/** Allocate a report buffer sized to fit within the MDTS, the zbdz storage, and what is actually needed */
1456 	report_nzones_max = (mdts_nbytes - sizeof(*report)) / sizeof(report->descs[0]);
1457 	report_nzones_max = spdk_min(spdk_min(report_nzones_max, nr_zones), ns_nzones);
1458 	report_nbytes = sizeof(report->descs[0]) * report_nzones_max + sizeof(*report);
1459 	report = calloc(1, report_nbytes);
1460 	if (!report) {
1461 		log_err("spdk/nvme: failed report_zones(): ENOMEM\n");
1462 		err = -ENOMEM;
1463 		goto exit;
1464 	}
1465 
1466 	err = spdk_nvme_zns_report_zones(fio_qpair->ns, tmp_qpair, report, report_nbytes,
1467 					 offset / lba_nbytes, SPDK_NVME_ZRA_LIST_ALL, true, pcu_cb,
1468 					 &completed);
1469 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1470 		log_err("spdk/nvme: report_zones(): err: %d, cpl: %d\n", err, completed);
1471 		err = err ? err : -EIO;
1472 		goto exit;
1473 	}
1474 	assert(report->nr_zones <= report_nzones_max);
1475 	report_nzones = report->nr_zones;
1476 
1477 	for (uint64_t idx = 0; idx < report->nr_zones; ++idx) {
1478 		struct spdk_nvme_zns_zone_desc *zdesc = &report->descs[idx];
1479 
1480 		zbdz[idx].start = zdesc->zslba * lba_nbytes;
1481 		zbdz[idx].len = zsze_nbytes;
1482 		zbdz[idx].capacity = zdesc->zcap * lba_nbytes;
1483 		zbdz[idx].wp = zdesc->wp * lba_nbytes;
1484 
1485 		switch (zdesc->zt) {
1486 		case SPDK_NVME_ZONE_TYPE_SEQWR:
1487 			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
1488 			break;
1489 
1490 		default:
1491 			log_err("spdk/nvme: %s: inv. zone-type: 0x%x\n", f->file_name, zdesc->zt);
1492 			err = -EIO;
1493 			goto exit;
1494 		}
1495 
1496 		switch (zdesc->zs) {
1497 		case SPDK_NVME_ZONE_STATE_EMPTY:
1498 			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
1499 			break;
1500 		case SPDK_NVME_ZONE_STATE_IOPEN:
1501 			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
1502 			break;
1503 		case SPDK_NVME_ZONE_STATE_EOPEN:
1504 			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
1505 			break;
1506 		case SPDK_NVME_ZONE_STATE_CLOSED:
1507 			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
1508 			break;
1509 		case SPDK_NVME_ZONE_STATE_RONLY:
1510 			zbdz[idx].cond = ZBD_ZONE_COND_READONLY;
1511 			break;
1512 		case SPDK_NVME_ZONE_STATE_FULL:
1513 			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
1514 			break;
1515 		case SPDK_NVME_ZONE_STATE_OFFLINE:
1516 			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
1517 			break;
1518 
1519 		default:
1520 			log_err("spdk/nvme: %s: inv. zone-state: 0x%x\n", f->file_name, zdesc->zs);
1521 			err = -EIO;
1522 			goto exit;
1523 		}
1524 	}
1525 
1526 exit:
1527 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1528 	free(report);
1529 
1530 	return err ? err : (int)report_nzones;
1531 }
1532 
1533 static int
1534 spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length)
1535 {
1536 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1537 	struct spdk_fio_qpair *fio_qpair = NULL;
1538 	const struct spdk_nvme_zns_ns_data *zns = NULL;
1539 	uint64_t zsze_nbytes, lba_nbytes;
1540 	int err = 0;
1541 
1542 	fio_qpair = get_fio_qpair(fio_thread, f);
1543 	if (!fio_qpair) {
1544 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1545 		return -ENODEV;
1546 	}
1547 	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
1548 	if (!zns) {
1549 		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
1550 		return -EINVAL;
1551 	}
1552 	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
1553 	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
1554 
1555 	/** check the assumption that offset is valid zone-start lba */
1556 	if (offset % zsze_nbytes) {
1557 		log_err("spdk/nvme: offset: %zu is not a valid zslba\n", offset);
1558 		return -EINVAL;
1559 	}
1560 
1561 	for (uint64_t cur = offset; cur < offset + length; cur += zsze_nbytes) {
1562 		int completed = 0;
1563 
1564 		err = spdk_nvme_zns_reset_zone(fio_qpair->ns, fio_qpair->qpair, cur / lba_nbytes,
1565 					       false, pcu_cb, &completed);
1566 		if (err || pcu(fio_qpair->qpair, &completed) || completed < 0) {
1567 			log_err("spdk/nvme: zns_reset_zone(): err: %d, cpl: %d\n", err, completed);
1568 			err = err ? err : -EIO;
1569 			break;
1570 		}
1571 	}
1572 
1573 	return err;
1574 }
1575 #endif
1576 
1577 #if FIO_IOOPS_VERSION >= 30
1578 static int
1579 spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
1580 			    unsigned int *max_open_zones)
1581 {
1582 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1583 	struct spdk_fio_qpair *fio_qpair = NULL;
1584 
1585 	fio_qpair = get_fio_qpair(fio_thread, f);
1586 	if (!fio_qpair) {
1587 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1588 		return -ENODEV;
1589 	}
1590 
1591 	*max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(fio_qpair->ns);
1592 
1593 	return 0;
1594 }
1595 #endif
1596 
1597 #if FIO_HAS_FDP
1598 /**
1599  * This is called twice, as the number of ruhs descriptors is unknown.
1600  * In the first call fio only sends a buffer to fetch the number of ruhs
1601  * descriptors. In the second call fio will send a buffer to fetch all the
1602  * ruhs descriptors.
1603  */
1604 static int
1605 spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1606 			struct fio_ruhs_info *fruhs_info)
1607 {
1608 	struct spdk_fio_thread *fio_thread = td->io_ops_data;
1609 	struct spdk_fio_qpair *fio_qpair = NULL;
1610 	struct spdk_nvme_qpair *tmp_qpair;
1611 	struct spdk_nvme_fdp_ruhs *fdp_ruhs;
1612 	uint32_t ruhs_nbytes;
1613 	uint16_t idx, nruhsd;
1614 	int completed = 0, err;
1615 
1616 	fio_qpair = get_fio_qpair(fio_thread, f);
1617 	if (!fio_qpair) {
1618 		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
1619 		return -ENODEV;
1620 	}
1621 
1622 	/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
1623 	 * Create a temporary qpair in order to fetch the RUHS descriptors.
1624 	 */
1625 	assert(!fio_qpair->qpair);
1626 
1627 	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
1628 	if (!tmp_qpair) {
1629 		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
1630 		return -EIO;
1631 	}
1632 
1633 	nruhsd = fruhs_info->nr_ruhs;
1634 	ruhs_nbytes = sizeof(*fdp_ruhs) + nruhsd * sizeof(struct spdk_nvme_fdp_ruhs_desc);
1635 	fdp_ruhs = calloc(1, ruhs_nbytes);
1636 	if (!fdp_ruhs) {
1637 		log_err("spdk/nvme: failed fdp_fetch_ruhs(): ENOMEM\n");
1638 		err = -ENOMEM;
1639 		goto exit;
1640 	}
1641 
1642 	err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, fdp_ruhs, ruhs_nbytes,
1643 					    SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
1644 	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
1645 		log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
1646 		err = err ? err : -EIO;
1647 		goto exit;
1648 	}
1649 
1650 	fruhs_info->nr_ruhs = fdp_ruhs->nruhsd;
1651 	for (idx = 0; idx < nruhsd; idx++) {
1652 		fruhs_info->plis[idx] = fdp_ruhs->ruhs_desc[idx].pid;
1653 	}
1654 
1655 exit:
1656 	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
1657 	free(fdp_ruhs);
1658 
1659 	return err;
1660 }
1661 #endif
1662 
1663 static void
1664 spdk_fio_cleanup(struct thread_data *td)
1665 {
1666 	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
1667 	struct spdk_fio_qpair	*fio_qpair, *fio_qpair_tmp;
1668 	struct spdk_fio_options *fio_options = td->eo;
1669 
1670 	if (fio_options->spdk_tracing) {
1671 		spdk_trace_unregister_user_thread();
1672 	}
1673 
1674 	TAILQ_FOREACH_SAFE(fio_qpair, &fio_thread->fio_qpair, link, fio_qpair_tmp) {
1675 		TAILQ_REMOVE(&fio_thread->fio_qpair, fio_qpair, link);
1676 		free(fio_qpair);
1677 	}
1678 
1679 	free(fio_thread->iocq);
1680 	free(fio_thread);
1681 
1682 	pthread_mutex_lock(&g_mutex);
1683 	g_td_count--;
1684 	if (g_td_count == 0) {
1685 		struct spdk_fio_ctrlr	*fio_ctrlr, *fio_ctrlr_tmp;
1686 		struct spdk_nvme_detach_ctx	*detach_ctx = NULL;
1687 
1688 		TAILQ_FOREACH_SAFE(fio_ctrlr, &g_ctrlrs, link, fio_ctrlr_tmp) {
1689 			TAILQ_REMOVE(&g_ctrlrs, fio_ctrlr, link);
1690 			spdk_nvme_detach_async(fio_ctrlr->ctrlr, &detach_ctx);
1691 			free(fio_ctrlr);
1692 		}
1693 
1694 		if (detach_ctx) {
1695 			spdk_nvme_detach_poll(detach_ctx);
1696 		}
1697 
1698 		if (fio_options->enable_vmd) {
1699 			spdk_vmd_fini();
1700 		}
1701 	}
1702 	pthread_mutex_unlock(&g_mutex);
1703 	if (TAILQ_EMPTY(&g_ctrlrs)) {
1704 		if (pthread_cancel(g_ctrlr_thread_id) == 0) {
1705 			pthread_join(g_ctrlr_thread_id, NULL);
1706 		}
1707 	}
1708 }
1709 
1710 /* This table enables addition of SPDK parameters to the fio config.
1711  * Add new parameters by defining them here and defining a callback
1712  * function to read the parameter value. */
1713 static struct fio_option options[] = {
1714 	{
1715 		.name           = "enable_wrr",
1716 		.lname          = "Enable weighted round robin (WRR) for IO submission queues",
1717 		.type           = FIO_OPT_INT,
1718 		.off1           = offsetof(struct spdk_fio_options, enable_wrr),
1719 		.def            = "0",
1720 		.help           = "Enable weighted round robin (WRR) for IO submission queues",
1721 		.category       = FIO_OPT_C_ENGINE,
1722 		.group          = FIO_OPT_G_INVALID,
1723 	},
1724 	{
1725 		.name           = "arbitration_burst",
1726 		.lname          = "Arbitration Burst",
1727 		.type           = FIO_OPT_INT,
1728 		.off1           = offsetof(struct spdk_fio_options, arbitration_burst),
1729 		.def            = "0",
1730 		.help           = "Arbitration Burst used for WRR (valid range from 0 - 7)",
1731 		.category       = FIO_OPT_C_ENGINE,
1732 		.group          = FIO_OPT_G_INVALID,
1733 	},
1734 	{
1735 		.name           = "low_weight",
1736 		.lname          = "low_weight for WRR",
1737 		.type           = FIO_OPT_INT,
1738 		.off1           = offsetof(struct spdk_fio_options, low_weight),
1739 		.def            = "0",
1740 		.help           = "low_weight used for WRR (valid range from 0 - 255)",
1741 		.category       = FIO_OPT_C_ENGINE,
1742 		.group          = FIO_OPT_G_INVALID,
1743 	},
1744 	{
1745 		.name           = "medium_weight",
1746 		.lname          = "medium_weight for WRR",
1747 		.type           = FIO_OPT_INT,
1748 		.off1           = offsetof(struct spdk_fio_options, medium_weight),
1749 		.def            = "0",
1750 		.help           = "medium weight used for WRR (valid range from 0 - 255)",
1751 		.category       = FIO_OPT_C_ENGINE,
1752 		.group          = FIO_OPT_G_INVALID,
1753 	},
1754 	{
1755 		.name           = "high_weight",
1756 		.lname          = "high_weight for WRR",
1757 		.type           = FIO_OPT_INT,
1758 		.off1           = offsetof(struct spdk_fio_options, high_weight),
1759 		.def            = "0",
1760 		.help           = "high weight used for WRR (valid range 0-255)",
1761 		.category       = FIO_OPT_C_ENGINE,
1762 		.group          = FIO_OPT_G_INVALID,
1763 	},
1764 	{
1765 		.name           = "wrr_priority",
1766 		.lname          = "priority used for WRR",
1767 		.type           = FIO_OPT_INT,
1768 		.off1           = offsetof(struct spdk_fio_options, wrr_priority),
1769 		.def            = "0",
1770 		.help           = "priority used for WRR (valid range 0-3)",
1771 		.category       = FIO_OPT_C_ENGINE,
1772 		.group          = FIO_OPT_G_INVALID,
1773 	},
1774 	{
1775 		.name		= "mem_size_mb",
1776 		.lname		= "Memory size in MB",
1777 		.type		= FIO_OPT_INT,
1778 		.off1		= offsetof(struct spdk_fio_options, mem_size),
1779 		.def		= "0",
1780 		.help		= "Memory Size for SPDK (MB)",
1781 		.category	= FIO_OPT_C_ENGINE,
1782 		.group		= FIO_OPT_G_INVALID,
1783 	},
1784 	{
1785 		.name		= "shm_id",
1786 		.lname		= "shared memory ID",
1787 		.type		= FIO_OPT_INT,
1788 		.off1		= offsetof(struct spdk_fio_options, shm_id),
1789 		.def		= "-1",
1790 		.help		= "Shared Memory ID",
1791 		.category	= FIO_OPT_C_ENGINE,
1792 		.group		= FIO_OPT_G_INVALID,
1793 	},
1794 	{
1795 		.name		= "enable_sgl",
1796 		.lname		= "SGL used for I/O commands",
1797 		.type		= FIO_OPT_INT,
1798 		.off1		= offsetof(struct spdk_fio_options, enable_sgl),
1799 		.def		= "0",
1800 		.help		= "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
1801 		.category	= FIO_OPT_C_ENGINE,
1802 		.group		= FIO_OPT_G_INVALID,
1803 	},
1804 	{
1805 		.name		= "sge_size",
1806 		.lname		= "SGL size used for I/O commands",
1807 		.type		= FIO_OPT_INT,
1808 		.off1		= offsetof(struct spdk_fio_options, sge_size),
1809 		.def		= "4096",
1810 		.help		= "SGL size in bytes for I/O Commands (default 4096)",
1811 		.category	= FIO_OPT_C_ENGINE,
1812 		.group		= FIO_OPT_G_INVALID,
1813 	},
1814 	{
1815 		.name		= "disable_pcie_sgl_merge",
1816 		.lname		= "Disable merging of physically contiguous SGL elements",
1817 		.type		= FIO_OPT_INT,
1818 		.off1		= offsetof(struct spdk_fio_options, disable_pcie_sgl_merge),
1819 		.def		= "0",
1820 		.help		= "Disable SGL element merging (0=allow merging, 1=disable merging)",
1821 		.category	= FIO_OPT_C_ENGINE,
1822 		.group		= FIO_OPT_G_INVALID,
1823 	},
1824 	{
1825 		.name		= "bit_bucket_data_len",
1826 		.lname		= "Amount of data used for Bit Bucket",
1827 		.type		= FIO_OPT_INT,
1828 		.off1		= offsetof(struct spdk_fio_options, bit_bucket_data_len),
1829 		.def		= "0",
1830 		.help		= "Bit Bucket Data Length for READ commands (disabled by default)",
1831 		.category	= FIO_OPT_C_ENGINE,
1832 		.group		= FIO_OPT_G_INVALID,
1833 	},
1834 	{
1835 		.name		= "hostnqn",
1836 		.lname		= "Host NQN to use when connecting to controllers",
1837 		.type		= FIO_OPT_STR_STORE,
1838 		.off1		= offsetof(struct spdk_fio_options, hostnqn),
1839 		.help		= "Host NQN",
1840 		.category	= FIO_OPT_C_ENGINE,
1841 		.group		= FIO_OPT_G_INVALID,
1842 	},
1843 	{
1844 		.name		= "pi_act",
1845 		.lname		= "Protection Information Action",
1846 		.type		= FIO_OPT_INT,
1847 		.off1		= offsetof(struct spdk_fio_options, pi_act),
1848 		.def		= "1",
1849 		.help		= "Protection Information Action bit (pi_act=1 or pi_act=0)",
1850 		.category	= FIO_OPT_C_ENGINE,
1851 		.group		= FIO_OPT_G_INVALID,
1852 	},
1853 	{
1854 		.name		= "pi_chk",
1855 		.lname		= "Protection Information Check (GUARD|REFTAG|APPTAG)",
1856 		.type		= FIO_OPT_STR_STORE,
1857 		.off1		= offsetof(struct spdk_fio_options, pi_chk),
1858 		.def		= NULL,
1859 		.help		= "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
1860 		.category	= FIO_OPT_C_ENGINE,
1861 		.group		= FIO_OPT_G_INVALID,
1862 	},
1863 	{
1864 		.name		= "md_per_io_size",
1865 		.lname		= "Separate Metadata Buffer Size per I/O",
1866 		.type		= FIO_OPT_INT,
1867 		.off1		= offsetof(struct spdk_fio_options, md_per_io_size),
1868 		.def		= "4096",
1869 		.help		= "Size of separate metadata buffer per I/O (Default: 4096)",
1870 		.category	= FIO_OPT_C_ENGINE,
1871 		.group		= FIO_OPT_G_INVALID,
1872 	},
1873 	{
1874 		.name		= "apptag",
1875 		.lname		= "Application Tag used in Protection Information",
1876 		.type		= FIO_OPT_INT,
1877 		.off1		= offsetof(struct spdk_fio_options, apptag),
1878 		.def		= "0x1234",
1879 		.help		= "Application Tag used in Protection Information field (Default: 0x1234)",
1880 		.category	= FIO_OPT_C_ENGINE,
1881 		.group		= FIO_OPT_G_INVALID,
1882 	},
1883 	{
1884 		.name		= "apptag_mask",
1885 		.lname		= "Application Tag Mask",
1886 		.type		= FIO_OPT_INT,
1887 		.off1		= offsetof(struct spdk_fio_options, apptag_mask),
1888 		.def		= "0xffff",
1889 		.help		= "Application Tag Mask used with Application Tag (Default: 0xffff)",
1890 		.category	= FIO_OPT_C_ENGINE,
1891 		.group		= FIO_OPT_G_INVALID,
1892 	},
1893 	{
1894 		.name		= "digest_enable",
1895 		.lname		= "PDU digest choice for NVMe/TCP Transport (NONE|HEADER|DATA|BOTH)",
1896 		.type		= FIO_OPT_STR_STORE,
1897 		.off1		= offsetof(struct spdk_fio_options, digest_enable),
1898 		.def		= NULL,
1899 		.help		= "Control NVMe/TCP PDU digests (digest_enable=NONE|HEADER|DATA|BOTH)",
1900 		.category	= FIO_OPT_C_ENGINE,
1901 		.group		= FIO_OPT_G_INVALID,
1902 	},
1903 	{
1904 		.name		= "enable_vmd",
1905 		.lname		= "Enable VMD enumeration",
1906 		.type		= FIO_OPT_INT,
1907 		.off1		= offsetof(struct spdk_fio_options, enable_vmd),
1908 		.def		= "0",
1909 		.help		= "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
1910 		.category	= FIO_OPT_C_ENGINE,
1911 		.group		= FIO_OPT_G_INVALID,
1912 	},
1913 	{
1914 		.name		= "initial_zone_reset",
1915 		.lname		= "Reset Zones on initialization",
1916 		.type		= FIO_OPT_INT,
1917 		.off1		= offsetof(struct spdk_fio_options, initial_zone_reset),
1918 		.def		= "0",
1919 		.help		= "Reset Zones on initialization (0=disable, 1=Reset All Zones)",
1920 		.category	= FIO_OPT_C_ENGINE,
1921 		.group		= FIO_OPT_G_INVALID,
1922 	},
1923 	{
1924 		.name		= "zone_append",
1925 		.lname		= "Use zone append instead of write",
1926 		.type		= FIO_OPT_INT,
1927 		.off1		= offsetof(struct spdk_fio_options, zone_append),
1928 		.def		= "0",
1929 		.help		= "Use zone append instead of write (1=zone append, 0=write)",
1930 		.category	= FIO_OPT_C_ENGINE,
1931 		.group		= FIO_OPT_G_INVALID,
1932 	},
1933 	{
1934 		.name		= "print_qid_mappings",
1935 		.lname		= "Print job-to-qid mappings",
1936 		.type		= FIO_OPT_INT,
1937 		.off1		= offsetof(struct spdk_fio_options, print_qid_mappings),
1938 		.def		= "0",
1939 		.help		= "Print job-to-qid mappings (0=disable, 1=enable)",
1940 		.category	= FIO_OPT_C_ENGINE,
1941 		.group		= FIO_OPT_G_INVALID,
1942 	},
1943 	{
1944 		.name		= "log_flags",
1945 		.lname		= "Log flags",
1946 		.type		= FIO_OPT_STR_STORE,
1947 		.off1		= offsetof(struct spdk_fio_options, log_flags),
1948 		.help		= "Enable log flags (comma-separated list)",
1949 		.category	= FIO_OPT_C_ENGINE,
1950 		.group		= FIO_OPT_G_INVALID,
1951 	},
1952 	{
1953 		.name		= "spdk_tracing",
1954 		.lname		= "Enable SPDK Tracing",
1955 		.type		= FIO_OPT_INT,
1956 		.off1		= offsetof(struct spdk_fio_options, spdk_tracing),
1957 		.def		= "0",
1958 		.help		= "SPDK Tracing (0=disable, 1=enable)",
1959 		.category	= FIO_OPT_C_ENGINE,
1960 		.group		= FIO_OPT_G_INVALID,
1961 	},
1962 	{
1963 		.name		= NULL,
1964 	},
1965 };
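
/*
 * Hypothetical example of extending the table above: add a field (say,
 * "my_flag") to struct spdk_fio_options, then append an entry like the
 * following before the terminating NULL-name element:
 *
 *	{
 *		.name		= "my_flag",
 *		.lname		= "My example flag",
 *		.type		= FIO_OPT_INT,
 *		.off1		= offsetof(struct spdk_fio_options, my_flag),
 *		.def		= "0",
 *		.help		= "Example flag (0=disable, 1=enable)",
 *		.category	= FIO_OPT_C_ENGINE,
 *		.group		= FIO_OPT_G_INVALID,
 *	},
 */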
1966 
1967 /* FIO imports this structure using dlsym */
1968 struct ioengine_ops ioengine = {
1969 	.name			= "spdk",
1970 	.version		= FIO_IOOPS_VERSION,
1971 	.queue			= spdk_fio_queue,
1972 	.getevents		= spdk_fio_getevents,
1973 	.event			= spdk_fio_event,
1974 	.cleanup		= spdk_fio_cleanup,
1975 	.open_file		= spdk_fio_open,
1976 	.close_file		= spdk_fio_close,
1977 	.invalidate		= spdk_fio_invalidate,
1978 	.iomem_alloc		= spdk_fio_iomem_alloc,
1979 	.iomem_free		= spdk_fio_iomem_free,
1980 	.setup			= spdk_fio_setup,
1981 	.init			= spdk_fio_init,
1982 	.io_u_init		= spdk_fio_io_u_init,
1983 	.io_u_free		= spdk_fio_io_u_free,
1984 #if FIO_HAS_ZBD
1985 	.get_zoned_model	= spdk_fio_get_zoned_model,
1986 	.report_zones		= spdk_fio_report_zones,
1987 	.reset_wp		= spdk_fio_reset_wp,
1988 #endif
1989 #if FIO_IOOPS_VERSION >= 30
1990 	.get_max_open_zones	= spdk_fio_get_max_open_zones,
1991 #endif
1992 #if FIO_HAS_FDP
1993 	.fdp_fetch_ruhs		= spdk_fio_fdp_fetch_ruhs,
1994 #endif
1995 #if FIO_HAS_MRT
1996 	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO | FIO_MULTI_RANGE_TRIM,
1997 #else
1998 	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
1999 #endif
2000 	.options		= options,
2001 	.option_struct_size	= sizeof(struct spdk_fio_options),
2002 };
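
/*
 * Usage sketch, with illustrative paths and addresses: load the plugin via
 * LD_PRELOAD and select the engine by the name registered above:
 *
 *	LD_PRELOAD=<build_dir>/fio/spdk_nvme fio --name=t --thread=1 \
 *		--ioengine=spdk --direct=1 --rw=randread --bs=4k \
 *		'--filename=trtype=PCIe traddr=0000.04.00.0 ns=1'
 */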
2003 
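/* fio_init and fio_exit are fio-provided constructor/destructor attribute
 * macros, so the engine registers itself automatically when this shared
 * object is loaded and unregisters when it is unloaded. */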
2004 static void fio_init
2005 fio_spdk_register(void)
2006 {
2007 	register_ioengine(&ioengine);
2008 }
2009 
2010 static void fio_exit
2011 fio_spdk_unregister(void)
2012 {
2013 	if (g_spdk_env_initialized) {
2014 		spdk_trace_cleanup();
2015 		spdk_env_fini();
2016 	}
2017 
2018 	unregister_ioengine(&ioengine);
2019 }
2020 
2021 SPDK_LOG_REGISTER_COMPONENT(fio_nvme)
2022