/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/nvme.h"
#include "spdk/nvme_zns.h"
#include "spdk/vmd.h"
#include "spdk/env.h"
#include "spdk/string.h"
#include "spdk/log.h"
#include "spdk/likely.h"
#include "spdk/endian.h"
#include "spdk/dif.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "config-host.h"
#include "fio.h"
#include "optgroup.h"

#ifdef for_each_rw_ddir
#define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
#define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 35)
#define FIO_HAS_MRT (FIO_IOOPS_VERSION >= 34)
#else
#define FIO_HAS_ZBD (0)
#define FIO_HAS_FDP (0)
#define FIO_HAS_MRT (0)
#endif

/* FreeBSD is missing CLOCK_MONOTONIC_RAW,
 * so an alternative is provided. */
#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif

#define NVME_IO_ALIGN		4096

static bool g_spdk_env_initialized;
static bool g_log_flag_error;
static int g_spdk_enable_sgl = 0;
static uint32_t g_spdk_sge_size = 4096;
static uint32_t g_spdk_bit_bucket_data_len = 0;
static uint32_t g_spdk_pract_flag;
static uint32_t g_spdk_prchk_flags;
static uint32_t g_spdk_md_per_io_size = 4096;
static uint16_t g_spdk_apptag;
static uint16_t g_spdk_apptag_mask;

struct spdk_fio_options {
	void	*pad;	/* off1 used in the option descriptions below must not be 0 */
	int	enable_wrr;
	int	arbitration_burst;
	int	low_weight;
	int	medium_weight;
	int	high_weight;
	int	wrr_priority;
	int	mem_size;
	int	shm_id;
	int	enable_sgl;
	int	sge_size;
	int	bit_bucket_data_len;
	char	*hostnqn;
	int	pi_act;
	char	*pi_chk;
	int	md_per_io_size;
	int	apptag;
	int	apptag_mask;
	char	*digest_enable;
	int	enable_vmd;
	int	initial_zone_reset;
	int	zone_append;
	int	print_qid_mappings;
	int	spdk_tracing;
	char	*log_flags;
	int	disable_pcie_sgl_merge;
};

struct spdk_fio_request {
	struct io_u		*io;
	/** Offset in current iovec, fio only uses 1 vector */
	uint32_t		iov_offset;

	/** Amount of data used for Bit Bucket SGL */
	uint32_t		bit_bucket_data_len;

	/** Context for NVMe PI */
	struct spdk_dif_ctx	dif_ctx;
	/** Separate metadata buffer pointer */
	void			*md_buf;

	/** Dataset management range information */
	struct spdk_nvme_dsm_range *dsm_range;

	struct spdk_fio_thread	*fio_thread;
	struct spdk_fio_qpair	*fio_qpair;
};

struct spdk_fio_ctrlr {
	struct spdk_nvme_transport_id	tr_id;
	struct spdk_nvme_ctrlr_opts	opts;
	struct spdk_nvme_ctrlr		*ctrlr;
	TAILQ_ENTRY(spdk_fio_ctrlr)	link;
};

static TAILQ_HEAD(, spdk_fio_ctrlr) g_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ctrlrs);
static int g_td_count;
static pthread_t g_ctrlr_thread_id = 0;
static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool g_error;

struct spdk_fio_qpair {
	struct fio_file			*f;
	struct spdk_nvme_qpair		*qpair;
	struct spdk_nvme_ns		*ns;
	uint32_t			io_flags;
	bool				zone_append_enabled;
	bool				nvme_pi_enabled;
	/* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
	bool				extended_lba;
	/* True for protection info transferred at start of metadata,
	 * false for protection info transferred at end of metadata, and
	 * this is valid only if nvme_pi_enabled is true.
	 */
	bool				md_start;
	TAILQ_ENTRY(spdk_fio_qpair)	link;
	struct spdk_fio_ctrlr		*fio_ctrlr;
};

struct spdk_fio_thread {
	struct thread_data		*td;

	TAILQ_HEAD(, spdk_fio_qpair)	fio_qpair;
	struct spdk_fio_qpair		*fio_qpair_current;	/* the current fio_qpair to be handled. */

	struct io_u			**iocq;		/* io completion queue */
	unsigned int			iocq_count;	/* number of iocq entries filled by last getevents */
	unsigned int			iocq_size;	/* number of iocq entries allocated */

};

struct spdk_fio_probe_ctx {
	struct thread_data	*td;
	char			hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
	struct fio_file		*f; /* fio_file given by user */
};

static void *
spdk_fio_poll_ctrlrs(void *arg)
{
	struct spdk_fio_ctrlr *fio_ctrlr;
	int oldstate;
	int rc;

	/* Loop until the thread is cancelled */
	while (true) {
		rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to set cancel state disabled on g_init_thread (%d): %s\n",
				    rc, spdk_strerror(rc));
		}

		pthread_mutex_lock(&g_mutex);

		TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
			spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
		}

		pthread_mutex_unlock(&g_mutex);

		rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to set cancel state enabled on g_init_thread (%d): %s\n",
				    rc, spdk_strerror(rc));
		}

		/* This is a pthread cancellation point and cannot be removed. */
		sleep(1);
	}

	return NULL;
}

static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_fio_probe_ctx *ctx = cb_ctx;
	struct thread_data *td = ctx->td;
	struct spdk_fio_options *fio_options = td->eo;

	if (ctx->hostnqn[0] != '\0') {
		memcpy(opts->hostnqn, ctx->hostnqn, sizeof(opts->hostnqn));
	} else if (fio_options->hostnqn) {
		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
	}

	if (fio_options->enable_wrr) {
		opts->arb_mechanism		= SPDK_NVME_CC_AMS_WRR;
		opts->arbitration_burst		= fio_options->arbitration_burst;
		opts->low_priority_weight	= fio_options->low_weight;
		opts->medium_priority_weight	= fio_options->medium_weight;
		opts->high_priority_weight	= fio_options->high_weight;
	}

	if (fio_options->digest_enable) {
		if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
			opts->header_digest = true;
		} else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
			opts->data_digest = true;
		} else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
			opts->header_digest = true;
			opts->data_digest = true;
		}
	}

	return true;
}
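
/* Example (hypothetical job option): digest_enable=BOTH enables both header
 * and data digests; the values are matched case-insensitively via strcasecmp.
 * Digests only take effect on transports that support them (e.g. NVMe/TCP).
 */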

static struct spdk_fio_ctrlr *
get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
{
	struct spdk_fio_ctrlr	*fio_ctrlr;

	TAILQ_FOREACH(fio_ctrlr, &g_ctrlrs, link) {
		if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
			return fio_ctrlr;
		}
	}

	return NULL;
}

/**
 * Returns the fio_qpair that matches the given fio_file and has an associated ns
 */
static struct spdk_fio_qpair *
get_fio_qpair(struct spdk_fio_thread *fio_thread, struct fio_file *f)
{
	struct spdk_fio_qpair	*fio_qpair;

	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
		if ((fio_qpair->f == f) && fio_qpair->ns) {
			return fio_qpair;
		}
	}

	return NULL;
}

#if FIO_HAS_ZBD
/**
 * Callback function to use while processing completions until completion-indicator turns non-zero
 */
static void
pcu_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	int *completed = ctx;

	*completed = spdk_nvme_cpl_is_error(cpl) ? -1 : 1;
}

/**
 * Process Completions Until the given 'completed' indicator turns non-zero or an error occurs
 */
static int32_t
pcu(struct spdk_nvme_qpair *qpair, int *completed)
{
	int32_t ret;

	while (!*completed) {
		ret = spdk_nvme_qpair_process_completions(qpair, 1);
		if (ret < 0) {
			log_err("spdk/nvme: process_compl(): ret: %d\n", ret);
			return ret;
		}
	}

	return 0;
}
#endif
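
/* A sketch of the synchronous-completion idiom built on pcu()/pcu_cb(), as
 * used by the callers later in this file (ns and tmp_qpair are placeholders):
 *
 *	int completed = 0, err;
 *
 *	err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
 *	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
 *		// submission failed, polling failed, or the command completed with an error
 *	}
 */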

static inline uint32_t
_nvme_get_host_buffer_sector_size(struct spdk_nvme_ns *ns, uint32_t io_flags)
{
	bool md_excluded_from_xfer = false;
	uint32_t md_size;
	uint32_t ns_flags;

	ns_flags = spdk_nvme_ns_get_flags(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);

	/* For the extended LBA format, if the metadata size is 8 bytes and PRACT is
	 * enabled (the controller inserts/strips PI), the metadata is not transferred
	 * to/from the host, so subtract the metadata size from the block size.
	 */
	md_excluded_from_xfer = ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
				 (ns_flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
				 (ns_flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
				 (md_size == 8));

	return md_excluded_from_xfer ? spdk_nvme_ns_get_sector_size(ns) :
	       spdk_nvme_ns_get_extended_sector_size(ns);
}
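
/* Worked example for the helper above (illustrative format): a namespace with
 * 512-byte data blocks and 8 bytes of PI metadata in extended-LBA mode has an
 * extended sector size of 520 bytes. With SPDK_NVME_IO_FLAGS_PRACT set, the
 * controller inserts/strips the 8-byte PI, so the host buffer needs only 512
 * bytes per block; without PRACT, the host transfers the full 520 bytes.
 */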

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_fio_probe_ctx *ctx = cb_ctx;
	struct thread_data	*td = ctx->td;
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_ctrlr	*fio_ctrlr;
	struct spdk_fio_qpair	*fio_qpair;
	struct spdk_nvme_ns	*ns;
	const struct spdk_nvme_ns_data	*nsdata;
	struct fio_file		*f = ctx->f;
	uint32_t		ns_id;
	char			*p;
	long int		tmp;
	uint32_t		block_size;
	struct spdk_fio_options *fio_options = td->eo;

	p = strstr(f->file_name, "ns=");
	if (p != NULL) {
		tmp = spdk_strtol(p + 3, 10);
		if (tmp <= 0) {
			SPDK_ERRLOG("namespace id should be >= 1, but was invalid: %ld\n", tmp);
			g_error = true;
			return;
		}
		ns_id = (uint32_t)tmp;
	} else {
		ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
		if (ns_id == 0) {
			/* The ctrlr has no active namespaces, and we didn't specify one, so there is nothing to do. */
			return;
		}
	}

	pthread_mutex_lock(&g_mutex);
	fio_ctrlr = get_fio_ctrlr(trid);
	/* It is a new ctrlr and needs to be added */
	if (!fio_ctrlr) {
		/* Create a fio_ctrlr and add it to the list */
		fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
		if (!fio_ctrlr) {
			SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
			g_error = true;
			pthread_mutex_unlock(&g_mutex);
			return;
		}
		fio_ctrlr->opts = *opts;
		fio_ctrlr->ctrlr = ctrlr;
		fio_ctrlr->tr_id = *trid;
		TAILQ_INSERT_TAIL(&g_ctrlrs, fio_ctrlr, link);
	}
	pthread_mutex_unlock(&g_mutex);

	ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
	if (ns == NULL) {
		SPDK_ERRLOG("Cannot get namespace by ns_id=%u\n", ns_id);
		g_error = true;
		return;
	}

	if (!spdk_nvme_ns_is_active(ns)) {
		SPDK_ERRLOG("Namespace with ns_id=%u is inactive\n", ns_id);
		g_error = true;
		return;
	}
	nsdata = spdk_nvme_ns_get_data(ns);

	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
		if ((fio_qpair->f == f) ||
		    ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
		     (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
			/* Not an error; just avoid a duplicate connection */
			return;
		}
	}

	/* create a new qpair */
	fio_qpair = calloc(1, sizeof(*fio_qpair));
	if (!fio_qpair) {
		g_error = true;
		SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
		return;
	}

	f->engine_data = fio_qpair;
	fio_qpair->ns = ns;
	fio_qpair->f = f;
	fio_qpair->fio_ctrlr = fio_ctrlr;
	TAILQ_INSERT_TAIL(&fio_thread->fio_qpair, fio_qpair, link);

	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
		assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
		fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
		fio_qpair->nvme_pi_enabled = true;
		fio_qpair->md_start = nsdata->dps.md_start;
		fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
		fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
			fio_qpair->extended_lba ? "extended lba" : "separate metadata");
	}

	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
	for_each_rw_ddir(ddir) {
		if (td->o.min_bs[ddir] % block_size != 0 || td->o.max_bs[ddir] % block_size != 0) {
			if (spdk_nvme_ns_supports_extended_lba(ns)) {
				SPDK_ERRLOG("--bs and other block size related options have to be a multiple of (LBA data size + metadata size)\n");
			} else {
				SPDK_ERRLOG("--bs and other block size related options have to be a multiple of LBA data size\n");
			}
			g_error = true;
			return;
		}
	}

	if (fio_options->zone_append && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
		if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED) {
			SPDK_DEBUGLOG(fio_nvme, "Using zone appends instead of writes on: '%s'\n",
				      f->file_name);
			fio_qpair->zone_append_enabled = true;
		} else {
			SPDK_WARNLOG("Falling back to writes on: '%s' - ns lacks zone append cmd\n",
				     f->file_name);
		}
	}

#if FIO_HAS_ZBD
	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD) {
		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
	}
#endif

	if (fio_options->initial_zone_reset == 1 && spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS) {
#if FIO_HAS_ZBD
		struct spdk_nvme_qpair *tmp_qpair;
		int completed = 0, err;

		/* The qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
		 * Create a temporary qpair in order to perform the initial zone reset.
		 */
		assert(!fio_qpair->qpair);

		tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
		if (!tmp_qpair) {
			SPDK_ERRLOG("Cannot allocate a temporary qpair\n");
			g_error = true;
			return;
		}

		err = spdk_nvme_zns_reset_zone(ns, tmp_qpair, 0x0, true, pcu_cb, &completed);
		if (err || pcu(tmp_qpair, &completed) || completed < 0) {
			log_err("spdk/nvme: warn: initial_zone_reset: err: %d, cpl: %d\n",
				err, completed);
		}

		spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
#else
		log_err("spdk/nvme: ZBD/ZNS is not supported\n");
#endif
	}

	f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
	if (f->real_file_size == 0) {
		g_error = true;
		SPDK_ERRLOG("Cannot get size of namespace %p\n", ns);
		return;
	}

	f->filetype = FIO_TYPE_BLOCK;
	fio_file_set_size_known(f);
}

static void
parse_prchk_flags(const char *prchk_str)
{
	if (!prchk_str) {
		return;
	}

	if (strstr(prchk_str, "GUARD") != NULL) {
		g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
	}
	if (strstr(prchk_str, "REFTAG") != NULL) {
		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
	}
	if (strstr(prchk_str, "APPTAG") != NULL) {
		g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
	}
}
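
/* Example (hypothetical job option): pi_chk=GUARD,REFTAG results in
 * g_spdk_prchk_flags == (SPDK_NVME_IO_FLAGS_PRCHK_GUARD |
 * SPDK_NVME_IO_FLAGS_PRCHK_REFTAG). The match is a simple substring search,
 * so the token order does not matter.
 */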

static void
parse_pract_flag(int pract)
{
	if (pract == 1) {
		g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
	} else {
		g_spdk_pract_flag = 0;
	}
}

static bool
fio_redirected_to_dev_null(void)
{
	char path[PATH_MAX] = "";
	ssize_t ret;

	ret = readlink("/proc/self/fd/1", path, sizeof(path));

	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
		return false;
	}

	ret = readlink("/proc/self/fd/2", path, sizeof(path));

	if (ret == -1 || strcmp(path, "/dev/null") != 0) {
		return false;
	}

	return true;
}

static int
spdk_fio_init(struct thread_data *td)
{
	int ret = 0;
	struct spdk_fio_options *fio_options = td->eo;

	if (fio_options->spdk_tracing) {
		ret = spdk_trace_register_user_thread();
	}

	return ret;
}

/* Called once at initialization. This is responsible for gathering the size of
 * each "file", which in our case is given in the form
 * 'key=value [key=value] ... ns=value'.
 * For example, for a local PCIe NVMe device: 'trtype=PCIe traddr=0000.04.00.0 ns=1'.
 * For a namespace exported by an NVMe-oF target: 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1'. */
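/* A minimal illustrative job file for this engine (the traddr and ns values
 * are placeholders for a real device):
 *
 *	[global]
 *	ioengine=spdk
 *	thread=1
 *
 *	[job0]
 *	filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *
 * Note the '.' separators in traddr instead of ':', since fio treats ':' in
 * a filename as a separator.
 */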
static int
spdk_fio_setup(struct thread_data *td)
{
	struct spdk_fio_thread *fio_thread;
	struct spdk_fio_options *fio_options = td->eo;
	struct spdk_fio_probe_ctx ctx;
	struct spdk_env_opts opts;
	struct fio_file *f;
	char *p;
	int rc = 0;
	struct spdk_nvme_transport_id trid;
	struct spdk_fio_ctrlr *fio_ctrlr;
	char *trid_info;
	unsigned int i;
	size_t size;

	/*
	 * If we're running in a daemonized FIO instance, it's possible
	 * fd 1/2 were re-used for something important by FIO. Newer fio
	 * versions are careful to redirect those to /dev/null, but if they're
	 * not, we abort early so we don't accidentally write messages to
	 * an important file, etc.
	 */
	if (is_backend && !fio_redirected_to_dev_null()) {
		char buf[1024];
		snprintf(buf, sizeof(buf),
			 "SPDK FIO plugin is in daemon mode, but stdout/stderr "
			 "aren't redirected to /dev/null. Aborting.");
		fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
		return -1;
	}

	if (!td->o.use_thread) {
		log_err("spdk: must set thread=1 when using spdk plugin\n");
		return 1;
	}

	if (g_log_flag_error) {
		/* The first thread found an error when parsing log flags, so
		 * just return an error immediately for all of the other threads.
		 */
		return 1;
	}

	pthread_mutex_lock(&g_mutex);

	fio_thread = calloc(1, sizeof(*fio_thread));
	assert(fio_thread != NULL);

	td->io_ops_data = fio_thread;
	fio_thread->td = td;

	fio_thread->iocq_size = td->o.iodepth;
	fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
	assert(fio_thread->iocq != NULL);

	TAILQ_INIT(&fio_thread->fio_qpair);

	if (!g_spdk_env_initialized) {
		opts.opts_size = sizeof(opts);
		spdk_env_opts_init(&opts);
		opts.name = "fio";
		opts.mem_size = fio_options->mem_size;
		opts.shm_id = fio_options->shm_id;
		g_spdk_enable_sgl = fio_options->enable_sgl;
		g_spdk_sge_size = fio_options->sge_size;
		g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
		parse_pract_flag(fio_options->pi_act);
		g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
		g_spdk_apptag = (uint16_t)fio_options->apptag;
		g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
		parse_prchk_flags(fio_options->pi_chk);
		if (spdk_env_init(&opts) < 0) {
			SPDK_ERRLOG("Unable to initialize SPDK env\n");
			free(fio_thread->iocq);
			free(fio_thread);
			fio_thread = NULL;
			pthread_mutex_unlock(&g_mutex);
			return 1;
		}

		if (fio_options->log_flags) {
			char *sp = NULL;
			char *tok = strtok_r(fio_options->log_flags, ",", &sp);
			do {
				rc = spdk_log_set_flag(tok);
				if (rc < 0) {
					SPDK_ERRLOG("unknown log flag %s\n", tok);
					g_log_flag_error = true;
					/* Drop the mutex before bailing out; otherwise the
					 * other setup threads would block on it forever. */
					pthread_mutex_unlock(&g_mutex);
					return 1;
				}
			} while ((tok = strtok_r(NULL, ",", &sp)) != NULL);
#ifdef DEBUG
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#endif
		}

		g_spdk_env_initialized = true;
		spdk_unaffinitize_thread();

		if (fio_options->spdk_tracing) {
			spdk_trace_init("spdk_fio_tracepoints", 65536, td->o.numjobs);
			spdk_trace_enable_tpoint_group("nvme_pcie");
			spdk_trace_enable_tpoint_group("nvme_tcp");
		}

		/* Spawn a thread to continue polling the controllers */
		rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
		if (rc != 0) {
			SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
		}

		if (fio_options->enable_vmd && spdk_vmd_init()) {
			SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices may be unavailable.\n");
		}
	}
	pthread_mutex_unlock(&g_mutex);

	for_each_file(td, f, i) {
		memset(&trid, 0, sizeof(trid));
		memset(&ctx, 0, sizeof(ctx));

		trid.trtype = SPDK_NVME_TRANSPORT_PCIE;

		p = strstr(f->file_name, " ns=");
		if (p != NULL) {
			trid_info = strndup(f->file_name, p - f->file_name);
		} else {
			trid_info = strndup(f->file_name, strlen(f->file_name));
		}

		if (!trid_info) {
			SPDK_ERRLOG("Failed to allocate space for trid_info\n");
			continue;
		}

		rc = spdk_nvme_transport_id_parse(&trid, trid_info);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
			free(trid_info);
			continue;
		}
		free(trid_info);

		if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			struct spdk_pci_addr pci_addr;
			if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
				SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
				continue;
			}
			spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
		} else {
			if (trid.subnqn[0] == '\0') {
				snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
					 SPDK_NVMF_DISCOVERY_NQN);
			}
			if ((p = strcasestr(f->file_name, "hostnqn:")) ||
			    (p = strcasestr(f->file_name, "hostnqn="))) {
				p += strlen("hostnqn:");
				size = strcspn(p, " \t\n");
				/* Leave room for the NUL terminator */
				if (size >= sizeof(ctx.hostnqn)) {
					SPDK_ERRLOG("Invalid hostnqn: too long\n");
					continue;
				}
				memcpy(ctx.hostnqn, p, size);
			}
		}

		ctx.td = td;
		ctx.f = f;

		pthread_mutex_lock(&g_mutex);
		fio_ctrlr = get_fio_ctrlr(&trid);
		pthread_mutex_unlock(&g_mutex);
		if (fio_ctrlr) {
			attach_cb(&ctx, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
		} else {
			/* Enumerate all of the controllers */
			if (spdk_nvme_probe(&trid, &ctx, probe_cb, attach_cb, NULL) != 0) {
				SPDK_ERRLOG("spdk_nvme_probe() failed\n");
				continue;
			}
		}

		if (g_error) {
			log_err("Failed to initialize spdk fio plugin\n");
			rc = 1;
			break;
		}
	}

	pthread_mutex_lock(&g_mutex);
	g_td_count++;
	pthread_mutex_unlock(&g_mutex);

	return rc;
}

static int
spdk_fio_open(struct thread_data *td, struct fio_file *f)
{
	struct spdk_fio_qpair *fio_qpair = f->engine_data;
	struct spdk_fio_ctrlr *fio_ctrlr = fio_qpair->fio_ctrlr;
	struct spdk_fio_options *fio_options = td->eo;
	struct spdk_nvme_io_qpair_opts	qpopts;

	assert(fio_qpair->qpair == NULL);
	spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
	qpopts.delay_cmd_submit = true;
	if (fio_options->enable_wrr) {
		qpopts.qprio = fio_options->wrr_priority;
	}
	qpopts.disable_pcie_sgl_merge = fio_options->disable_pcie_sgl_merge;

	fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
	if (!fio_qpair->qpair) {
		SPDK_ERRLOG("Cannot allocate any more NVMe io_qpairs\n");
		g_error = true;
		free(fio_qpair);
		return -1;
	}

	if (fio_options->print_qid_mappings == 1) {
		log_info("job %s: %s qid %d\n", td->o.name, f->file_name,
			 spdk_nvme_qpair_get_id(fio_qpair->qpair));
	}

	return 0;
}

static int
spdk_fio_close(struct thread_data *td, struct fio_file *f)
{
	struct spdk_fio_qpair *fio_qpair = f->engine_data;

	assert(fio_qpair->qpair != NULL);
	spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
	fio_qpair->qpair = NULL;
	return 0;
}

static int
spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_qpair	*fio_qpair;
	struct spdk_nvme_ctrlr	*ctrlr;
	int32_t numa_id = SPDK_ENV_NUMA_ID_ANY, tmp_numa_id;

	/* If all ctrlrs used by this fio_thread have the same numa
	 * id, allocate from that one. If they come from different numa
	 * ids, then don't try to optimize and just use SPDK_ENV_NUMA_ID_ANY.
	 */
	TAILQ_FOREACH(fio_qpair, &fio_thread->fio_qpair, link) {
		ctrlr = fio_qpair->fio_ctrlr->ctrlr;
		tmp_numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
		if (numa_id == SPDK_ENV_NUMA_ID_ANY) {
			numa_id = tmp_numa_id;
		} else if (tmp_numa_id != numa_id &&
			   tmp_numa_id != SPDK_ENV_NUMA_ID_ANY) {
			numa_id = SPDK_ENV_NUMA_ID_ANY;
			break;
		}
	}

	td->orig_buffer = spdk_dma_zmalloc_socket(total_mem, NVME_IO_ALIGN, NULL, numa_id);
	return td->orig_buffer == NULL;
}

static void
spdk_fio_iomem_free(struct thread_data *td)
{
	spdk_dma_free(td->orig_buffer);
}

static int
spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
{
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_request	*fio_req;
	uint32_t dsm_size;

	io_u->engine_data = NULL;

	fio_req = calloc(1, sizeof(*fio_req));
	if (fio_req == NULL) {
		return 1;
	}

	if (!(td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)) {
#if FIO_HAS_MRT
		/* By default, the number of ranges is set to 1 */
		dsm_size = td->o.num_range * sizeof(struct spdk_nvme_dsm_range);
#else
		dsm_size = sizeof(struct spdk_nvme_dsm_range);
#endif
		fio_req->dsm_range = calloc(1, dsm_size);
		if (fio_req->dsm_range == NULL) {
			free(fio_req);
			return 1;
		}
	}

	fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
	if (fio_req->md_buf == NULL) {
		fprintf(stderr, "Allocating %u bytes of metadata buffer failed\n", g_spdk_md_per_io_size);
		free(fio_req->dsm_range);
		free(fio_req);
		return 1;
	}

	fio_req->io = io_u;
	fio_req->fio_thread = fio_thread;

	io_u->engine_data = fio_req;

	return 0;
}

static void
spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
{
	struct spdk_fio_request *fio_req = io_u->engine_data;

	if (fio_req) {
		assert(fio_req->io == io_u);
		spdk_dma_free(fio_req->md_buf);
		free(fio_req->dsm_range);
		free(fio_req);
		io_u->engine_data = NULL;
	}
}

static inline uint64_t
fio_offset_to_zslba(unsigned long long offset, struct spdk_nvme_ns *ns)
{
	return (offset / spdk_nvme_zns_ns_get_zone_size(ns)) * spdk_nvme_zns_ns_get_zone_size_sectors(ns);
}
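
/* Example of the conversion above (illustrative geometry): with 4 KiB LBAs
 * and a 64 MiB zone size, spdk_nvme_zns_ns_get_zone_size_sectors() returns
 * 16384. An offset of 128 MiB lands in zone index 2, so the zslba is
 * 2 * 16384 = 32768.
 */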

static int
fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, extended_lba_size, lba_count;
	uint64_t lba;
	struct iovec iov;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	/* Set appmask and apptag when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
		fio_req->dif_ctx.app_tag = g_spdk_apptag;
		return 0;
	}

	extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	lba = io_u->offset / extended_lba_size;
	lba_count = io_u->xfer_buflen / extended_lba_size;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
			       true, fio_qpair->md_start,
			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
			       0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (io_u->ddir != DDIR_WRITE) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
	if (rc != 0) {
		fprintf(stderr, "Generation of DIF failed\n");
	}

	return rc;
}

static int
fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, block_size, lba_count;
	uint64_t lba;
	struct iovec iov, md_iov;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	/* Set appmask and apptag when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
		fio_req->dif_ctx.app_tag = g_spdk_apptag;
		return 0;
	}

	block_size = spdk_nvme_ns_get_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	lba = io_u->offset / block_size;
	lba_count = io_u->xfer_buflen / block_size;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
			       false, fio_qpair->md_start,
			       (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
			       fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag,
			       0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (io_u->ddir != DDIR_WRITE) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	md_iov.iov_base = fio_req->md_buf;
	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
	rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
	if (rc < 0) {
		fprintf(stderr, "Generation of DIX failed\n");
	}

	return rc;
}

static int
fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t lba_count;
	struct iovec iov;
	struct spdk_dif_error err_blk = {};
	int rc;

	/* Do nothing when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);

	rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
	if (rc != 0) {
		fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}

static int
fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
{
	struct spdk_nvme_ns *ns = fio_qpair->ns;
	struct spdk_fio_request *fio_req = io_u->engine_data;
	uint32_t md_size, lba_count;
	struct iovec iov, md_iov;
	struct spdk_dif_error err_blk = {};
	int rc;

	/* Do nothing when PRACT is enabled */
	if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
		return 0;
	}

	iov.iov_base = io_u->buf;
	iov.iov_len = io_u->xfer_buflen;
	lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
	md_size = spdk_nvme_ns_get_md_size(ns);
	md_iov.iov_base = fio_req->md_buf;
	md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);

	rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
	if (rc != 0) {
		fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}

static void
spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct spdk_fio_request		*fio_req = ctx;
	struct spdk_fio_thread		*fio_thread = fio_req->fio_thread;
	struct spdk_fio_qpair		*fio_qpair = fio_req->fio_qpair;
	int				rc;

	if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
		if (fio_qpair->extended_lba) {
			rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
		} else {
			rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
		}
		if (rc != 0) {
			fio_req->io->error = abs(rc);
		}
	}

	if (spdk_nvme_cpl_is_error(cpl)) {
		fio_req->io->error = EIO;
	}

	assert(fio_thread->iocq_count < fio_thread->iocq_size);
	fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
}

static void
spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
{
	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;

	fio_req->iov_offset = sgl_offset;
	fio_req->bit_bucket_data_len = 0;
}

static int
spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
{
	struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
	struct io_u *io_u = fio_req->io;
	uint32_t iov_len;
	uint32_t bit_bucket_len;

	*address = io_u->buf;

	if (fio_req->iov_offset) {
		assert(fio_req->iov_offset <= io_u->xfer_buflen);
		*address += fio_req->iov_offset;
	}

	iov_len = io_u->xfer_buflen - fio_req->iov_offset;
	if (iov_len > g_spdk_sge_size) {
		iov_len = g_spdk_sge_size;
	}

	if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
		assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
		*address = (void *)UINT64_MAX;
		bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
		if (iov_len > bit_bucket_len) {
			iov_len = bit_bucket_len;
		}
		fio_req->bit_bucket_data_len += iov_len;
	}

	fio_req->iov_offset += iov_len;
	*length = iov_len;

	return 0;
}
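
/* Example of how the callbacks above slice a request into SGEs (illustrative
 * sizes): with sge_size=4096 and bit_bucket_data_len=4096, an 8192-byte READ
 * produces two SGEs: first a 4096-byte bit bucket (address UINT64_MAX; the
 * controller discards that data), then the 4096 bytes starting at
 * io_u->buf + 4096. Data routed to the bit bucket never reaches io_u->buf.
 */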

#if FIO_IOOPS_VERSION >= 24
typedef enum fio_q_status fio_q_status_t;
#else
typedef int fio_q_status_t;
#endif

static fio_q_status_t
spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
{
	int rc = 1;
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_request	*fio_req = io_u->engine_data;
	struct spdk_fio_qpair	*fio_qpair;
	struct spdk_nvme_ns	*ns = NULL;
	void			*md_buf = NULL;
	struct spdk_dif_ctx	*dif_ctx = &fio_req->dif_ctx;
#if FIO_HAS_FDP
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
#endif
	struct spdk_nvme_dsm_range *range;
	uint32_t		block_size;
	uint64_t		lba;
	uint32_t		lba_count;
	uint32_t		num_range;

	fio_qpair = get_fio_qpair(fio_thread, io_u->file);
	if (fio_qpair == NULL) {
		return -ENXIO;
	}
	ns = fio_qpair->ns;

	if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
		md_buf = fio_req->md_buf;
	}
	fio_req->fio_qpair = fio_qpair;

	block_size = _nvme_get_host_buffer_sector_size(ns, fio_qpair->io_flags);
	lba = io_u->offset / block_size;
	lba_count = io_u->xfer_buflen / block_size;

#if FIO_HAS_FDP
	/* Write commands with directives are only supported when SGL is enabled */
	if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
		log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
		io_u->error = -EINVAL;
		return FIO_Q_COMPLETED;
	}
#endif

	/* TODO: consider situations where fio randomizes and verifies io_u */
	if (fio_qpair->nvme_pi_enabled) {
		if (fio_qpair->extended_lba) {
			rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
		} else {
			rc = fio_separate_md_setup_pi(fio_qpair, io_u);
		}
		if (rc < 0) {
			io_u->error = -rc;
			return FIO_Q_COMPLETED;
		}
	}

	switch (io_u->ddir) {
	case DDIR_READ:
		if (!g_spdk_enable_sgl) {
			rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
							   spdk_fio_completion_cb, fio_req,
							   fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
		} else {
			rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
							    lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
							    spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
							    dif_ctx->apptag_mask, dif_ctx->app_tag);
		}
		break;
	case DDIR_WRITE:
		if (!g_spdk_enable_sgl) {
			if (!fio_qpair->zone_append_enabled) {
				rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
								    lba_count,
								    spdk_fio_completion_cb, fio_req,
								    fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
			} else {
				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
				rc = spdk_nvme_zns_zone_append_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, zslba,
								       lba_count,
								       spdk_fio_completion_cb, fio_req,
								       fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
			}
		} else {
			if (!fio_qpair->zone_append_enabled) {
#if FIO_HAS_FDP
				if (spdk_unlikely(io_u->dtype)) {
					ext_opts.size = SPDK_SIZEOF(&ext_opts, cdw13);
					ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
					ext_opts.metadata = md_buf;
					ext_opts.cdw13 = (io_u->dspec << 16);
					ext_opts.apptag = dif_ctx->app_tag;
					ext_opts.apptag_mask = dif_ctx->apptag_mask;
					rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
									 spdk_fio_completion_cb, fio_req,
									 spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
					break;
				}
#endif
				rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
								     lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
								     spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
								     dif_ctx->apptag_mask, dif_ctx->app_tag);
			} else {
				uint64_t zslba = fio_offset_to_zslba(io_u->offset, ns);
				rc = spdk_nvme_zns_zone_appendv_with_md(ns, fio_qpair->qpair, zslba,
									lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
									spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
									dif_ctx->apptag_mask, dif_ctx->app_tag);
			}
		}
		break;
	case DDIR_TRIM:
		if (td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
			do_io_u_trim(td, io_u);
			io_u_mark_submit(td, 1);
			io_u_mark_complete(td, 1);
			return FIO_Q_COMPLETED;
		}

		range = fio_req->dsm_range;
#if FIO_HAS_MRT
		if (td->o.num_range == 1) {
			range->attributes.raw = 0;
			range->length = lba_count;
			range->starting_lba = lba;
			num_range = 1;
		} else {
			struct trim_range *tr = (struct trim_range *)io_u->xfer_buf;
			for (uint32_t i = 0; i < io_u->number_trim; i++) {
				range->attributes.raw = 0;
				range->length = tr->len / block_size;
				range->starting_lba = tr->start / block_size;
				range++;
				tr++;
			}
			num_range = io_u->number_trim;
			range = fio_req->dsm_range;
		}
#else
		range->attributes.raw = 0;
		range->length = lba_count;
		range->starting_lba = lba;
		num_range = 1;
#endif

		rc = spdk_nvme_ns_cmd_dataset_management(ns, fio_qpair->qpair,
				SPDK_NVME_DSM_ATTR_DEALLOCATE, range, num_range,
				spdk_fio_completion_cb, fio_req);
		break;
	default:
		assert(false);
		break;
	}

	/* NVMe read/write functions return -ENOMEM if there are no free requests. */
	if (rc == -ENOMEM) {
		return FIO_Q_BUSY;
	}

	if (rc != 0) {
		io_u->error = abs(rc);
		return FIO_Q_COMPLETED;
	}

	return FIO_Q_QUEUED;
}

static struct io_u *
spdk_fio_event(struct thread_data *td, int event)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;

	assert(event >= 0);
	assert((unsigned)event < fio_thread->iocq_count);
	return fio_thread->iocq[event];
}

static int
spdk_fio_getevents(struct thread_data *td, unsigned int min,
		   unsigned int max, const struct timespec *t)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	struct timespec t0, t1;
	uint64_t timeout = 0;

	if (t) {
		timeout = t->tv_sec * 1000000000L + t->tv_nsec;
		clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
	}

	fio_thread->iocq_count = 0;

	/* fetch the next qpair */
	if (fio_thread->fio_qpair_current) {
		fio_qpair = TAILQ_NEXT(fio_thread->fio_qpair_current, link);
	}

	for (;;) {
		if (fio_qpair == NULL) {
			fio_qpair = TAILQ_FIRST(&fio_thread->fio_qpair);
		}

		while (fio_qpair != NULL) {
			/*
			 * We can be called while spdk_fio_open()s are still
			 * ongoing, in which case, ->qpair can still be NULL.
			 */
			if (fio_qpair->qpair == NULL) {
				fio_qpair = TAILQ_NEXT(fio_qpair, link);
				continue;
			}

			spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);

			if (fio_thread->iocq_count >= min) {
				/* reset the current handling qpair */
				fio_thread->fio_qpair_current = fio_qpair;
				return fio_thread->iocq_count;
			}

			fio_qpair = TAILQ_NEXT(fio_qpair, link);
		}

		if (t) {
			uint64_t elapse;

			clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
			elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
				 + t1.tv_nsec - t0.tv_nsec;
			if (elapse > timeout) {
				break;
			}
		}
	}

	/* reset the current handling qpair */
	fio_thread->fio_qpair_current = fio_qpair;
	return fio_thread->iocq_count;
}
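
/* Note on the loop above: fio passes a [min, max] completion range; we
 * round-robin over all qpairs, resuming from the qpair after the one that
 * satisfied the previous call (fio_qpair_current), so a single busy qpair
 * cannot starve completions on the others.
 */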

static int
spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
{
	/* TODO: This should probably send a flush to the device, but for now just return success. */
	return 0;
}

#if FIO_HAS_ZBD
static int
spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns_data = NULL;

	if (f->filetype != FIO_TYPE_BLOCK) {
		log_info("spdk/nvme: unsupported filetype: %d\n", f->filetype);
		return -EINVAL;
	}

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	switch (spdk_nvme_ns_get_csi(fio_qpair->ns)) {
	case SPDK_NVME_CSI_NVM:
		*model = ZBD_NONE;
		return 0;

	case SPDK_NVME_CSI_KV:
		log_err("spdk/nvme: KV namespace is currently not supported\n");
		return -ENOSYS;

	case SPDK_NVME_CSI_ZNS:
		zns_data = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
		if (!zns_data) {
			log_err("spdk/nvme: file_name: '%s', ZNS is not enabled\n", f->file_name);
			return -EINVAL;
		}

		*model = ZBD_HOST_MANAGED;

		return 0;
	}

	return -EINVAL;
}

static int
spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
		      struct zbd_zone *zbdz, unsigned int nr_zones)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns = NULL;
	struct spdk_nvme_zns_zone_report *report;
	struct spdk_nvme_qpair *tmp_qpair;
	uint32_t report_nzones = 0, report_nzones_max, report_nbytes, mdts_nbytes;
	uint64_t zsze_nbytes, ns_nzones, lba_nbytes;
	int completed = 0, err;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}
	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
	if (!zns) {
		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
		return -EINVAL;
	}

	/* The qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
	 * Create a temporary qpair in order to perform report zones.
	 */
	assert(!fio_qpair->qpair);

	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
	if (!tmp_qpair) {
		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
		return -EIO;
	}

	/** Retrieve device parameters */
	mdts_nbytes = spdk_nvme_ns_get_max_io_xfer_size(fio_qpair->ns);
	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);
	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
	ns_nzones = spdk_nvme_zns_ns_get_num_zones(fio_qpair->ns);

	/** Allocate a report buffer no larger than MDTS, the given zbdz storage, or what is actually needed */
	report_nzones_max = (mdts_nbytes - sizeof(*report)) / sizeof(report->descs[0]);
	report_nzones_max = spdk_min(spdk_min(report_nzones_max, nr_zones), ns_nzones);
	report_nbytes = sizeof(report->descs[0]) * report_nzones_max + sizeof(*report);
	report = calloc(1, report_nbytes);
	if (!report) {
		log_err("spdk/nvme: failed report_zones(): ENOMEM\n");
		err = -ENOMEM;
		goto exit;
	}

	err = spdk_nvme_zns_report_zones(fio_qpair->ns, tmp_qpair, report, report_nbytes,
					 offset / lba_nbytes, SPDK_NVME_ZRA_LIST_ALL, true, pcu_cb,
					 &completed);
	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
		log_err("spdk/nvme: report_zones(): err: %d, cpl: %d\n", err, completed);
		err = err ? err : -EIO;
		goto exit;
	}
	assert(report->nr_zones <= report_nzones_max);
	report_nzones = report->nr_zones;

	for (uint64_t idx = 0; idx < report->nr_zones; ++idx) {
		struct spdk_nvme_zns_zone_desc *zdesc = &report->descs[idx];

		zbdz[idx].start = zdesc->zslba * lba_nbytes;
		zbdz[idx].len = zsze_nbytes;
		zbdz[idx].capacity = zdesc->zcap * lba_nbytes;
		zbdz[idx].wp = zdesc->wp * lba_nbytes;

		switch (zdesc->zt) {
		case SPDK_NVME_ZONE_TYPE_SEQWR:
			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
			break;

		default:
			log_err("spdk/nvme: %s: inv. zone-type: 0x%x\n", f->file_name, zdesc->zt);
			err = -EIO;
			goto exit;
		}

		switch (zdesc->zs) {
		case SPDK_NVME_ZONE_STATE_EMPTY:
			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
			break;
		case SPDK_NVME_ZONE_STATE_IOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
			break;
		case SPDK_NVME_ZONE_STATE_EOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
			break;
		case SPDK_NVME_ZONE_STATE_CLOSED:
			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
			break;
		case SPDK_NVME_ZONE_STATE_RONLY:
			zbdz[idx].cond = ZBD_ZONE_COND_READONLY;
			break;
		case SPDK_NVME_ZONE_STATE_FULL:
			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
			break;
		case SPDK_NVME_ZONE_STATE_OFFLINE:
			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
			break;

		default:
			log_err("spdk/nvme: %s: inv. zone-state: 0x%x\n", f->file_name, zdesc->zs);
			err = -EIO;
			goto exit;
		}
	}

exit:
	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
	free(report);

	return err ? err : (int)report_nzones;
}

static int
spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	const struct spdk_nvme_zns_ns_data *zns = NULL;
	uint64_t zsze_nbytes, lba_nbytes;
	int err = 0;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}
	zns = spdk_nvme_zns_ns_get_data(fio_qpair->ns);
	if (!zns) {
		log_err("spdk/nvme: file_name: '%s', zns is not enabled\n", f->file_name);
		return -EINVAL;
	}
	zsze_nbytes = spdk_nvme_zns_ns_get_zone_size(fio_qpair->ns);
	lba_nbytes = spdk_nvme_ns_get_sector_size(fio_qpair->ns);

	/** Check the assumption that offset is a valid zone-start LBA */
	if (offset % zsze_nbytes) {
		log_err("spdk/nvme: offset: %" PRIu64 " is not a valid zslba\n", offset);
		return -EINVAL;
	}

	for (uint64_t cur = offset; cur < offset + length; cur += zsze_nbytes) {
		int completed = 0;

		err = spdk_nvme_zns_reset_zone(fio_qpair->ns, fio_qpair->qpair, cur / lba_nbytes,
					       false, pcu_cb, &completed);
		if (err || pcu(fio_qpair->qpair, &completed) || completed < 0) {
			log_err("spdk/nvme: zns_reset_zone(): err: %d, cpl: %d\n", err, completed);
			err = err ? err : -EIO;
			break;
		}
	}

	return err;
}
#endif

#if FIO_IOOPS_VERSION >= 30
static int
spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
			    unsigned int *max_open_zones)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	*max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(fio_qpair->ns);

	return 0;
}
#endif

#if FIO_HAS_FDP
/**
 * This is called twice because the number of RUHS descriptors is unknown.
 * In the first call, fio only sends a buffer to fetch the number of RUHS
 * descriptors. In the second call, fio sends a buffer large enough to fetch
 * all of the RUHS descriptors.
 */
static int
spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
			struct fio_ruhs_info *fruhs_info)
{
	struct spdk_fio_thread *fio_thread = td->io_ops_data;
	struct spdk_fio_qpair *fio_qpair = NULL;
	struct spdk_nvme_qpair *tmp_qpair;
	struct spdk_nvme_fdp_ruhs *fdp_ruhs;
	uint32_t ruhs_nbytes;
	uint16_t idx, nruhsd;
	int completed = 0, err;

	fio_qpair = get_fio_qpair(fio_thread, f);
	if (!fio_qpair) {
		log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
		return -ENODEV;
	}

	/* The qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
	 * Create a temporary qpair in order to fetch the reclaim unit handle status.
	 */
	assert(!fio_qpair->qpair);

	tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
	if (!tmp_qpair) {
		log_err("spdk/nvme: cannot allocate a temporary qpair\n");
		return -EIO;
	}

	nruhsd = fruhs_info->nr_ruhs;
	ruhs_nbytes = sizeof(*fdp_ruhs) + nruhsd * sizeof(struct spdk_nvme_fdp_ruhs_desc);
	fdp_ruhs = calloc(1, ruhs_nbytes);
	if (!fdp_ruhs) {
		log_err("spdk/nvme: failed fdp_fetch_ruhs(): ENOMEM\n");
		err = -ENOMEM;
		goto exit;
	}

	err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, fdp_ruhs, ruhs_nbytes,
					    SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
	if (err || pcu(tmp_qpair, &completed) || completed < 0) {
		log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
		err = err ? err : -EIO;
		goto exit;
	}

	fruhs_info->nr_ruhs = fdp_ruhs->nruhsd;
	for (idx = 0; idx < nruhsd; idx++) {
		fruhs_info->plis[idx] = fdp_ruhs->ruhs_desc[idx].pid;
	}

exit:
	spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
	free(fdp_ruhs);

	return err;
}
#endif

static void
spdk_fio_cleanup(struct thread_data *td)
{
	struct spdk_fio_thread	*fio_thread = td->io_ops_data;
	struct spdk_fio_qpair	*fio_qpair, *fio_qpair_tmp;
	struct spdk_fio_options *fio_options = td->eo;

	if (fio_options->spdk_tracing) {
		spdk_trace_unregister_user_thread();
	}

	TAILQ_FOREACH_SAFE(fio_qpair, &fio_thread->fio_qpair, link, fio_qpair_tmp) {
		TAILQ_REMOVE(&fio_thread->fio_qpair, fio_qpair, link);
		free(fio_qpair);
	}

	free(fio_thread->iocq);
	free(fio_thread);

	pthread_mutex_lock(&g_mutex);
	g_td_count--;
	if (g_td_count == 0) {
		struct spdk_fio_ctrlr	*fio_ctrlr, *fio_ctrlr_tmp;
		struct spdk_nvme_detach_ctx	*detach_ctx = NULL;

		TAILQ_FOREACH_SAFE(fio_ctrlr, &g_ctrlrs, link, fio_ctrlr_tmp) {
			TAILQ_REMOVE(&g_ctrlrs, fio_ctrlr, link);
			spdk_nvme_detach_async(fio_ctrlr->ctrlr, &detach_ctx);
			free(fio_ctrlr);
		}

		if (detach_ctx) {
			spdk_nvme_detach_poll(detach_ctx);
		}

		if (fio_options->enable_vmd) {
			spdk_vmd_fini();
		}
	}
	pthread_mutex_unlock(&g_mutex);
	if (TAILQ_EMPTY(&g_ctrlrs)) {
		if (pthread_cancel(g_ctrlr_thread_id) == 0) {
			pthread_join(g_ctrlr_thread_id, NULL);
		}
	}
}

/* This table enables the addition of SPDK parameters to the fio config.
 * Add new parameters by defining them here and defining a callback
 * function to read the parameter value. */
static struct fio_option options[] = {
	{
		.name           = "enable_wrr",
		.lname          = "Enable weighted round robin (WRR) for IO submission queues",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, enable_wrr),
		.def            = "0",
		.help           = "Enable weighted round robin (WRR) for IO submission queues",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "arbitration_burst",
		.lname          = "Arbitration Burst",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, arbitration_burst),
		.def            = "0",
		.help           = "Arbitration Burst used for WRR (valid range from 0 - 7)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "low_weight",
		.lname          = "low_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, low_weight),
		.def            = "0",
		.help           = "low_weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "medium_weight",
		.lname          = "medium_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, medium_weight),
		.def            = "0",
		.help           = "medium weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "high_weight",
		.lname          = "high_weight for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, high_weight),
		.def            = "0",
		.help           = "high weight used for WRR (valid range from 0 - 255)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name           = "wrr_priority",
		.lname          = "priority used for WRR",
		.type           = FIO_OPT_INT,
		.off1           = offsetof(struct spdk_fio_options, wrr_priority),
		.def            = "0",
		.help           = "priority used for WRR (valid range from 0-3)",
		.category       = FIO_OPT_C_ENGINE,
		.group          = FIO_OPT_G_INVALID,
	},
	{
		.name		= "mem_size_mb",
		.lname		= "Memory size in MB",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, mem_size),
		.def		= "0",
		.help		= "Memory Size for SPDK (MB)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "shm_id",
		.lname		= "shared memory ID",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, shm_id),
		.def		= "-1",
		.help		= "Shared Memory ID",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "enable_sgl",
		.lname		= "SGL used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_sgl),
		.def		= "0",
		.help		= "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "sge_size",
		.lname		= "SGL size used for I/O commands",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, sge_size),
		.def		= "4096",
		.help		= "SGL size in bytes for I/O Commands (default 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "disable_pcie_sgl_merge",
		.lname		= "Disable merging of physically contiguous SGL elements",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, disable_pcie_sgl_merge),
		.def		= "0",
		.help		= "Disable SGL element merging (0=merging, 1=no merging)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "bit_bucket_data_len",
		.lname		= "Amount of data used for Bit Bucket",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, bit_bucket_data_len),
		.def		= "0",
		.help		= "Bit Bucket Data Length for READ commands (disabled by default)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "hostnqn",
		.lname		= "Host NQN to use when connecting to controllers.",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, hostnqn),
		.help		= "Host NQN",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_act",
		.lname		= "Protection Information Action",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, pi_act),
		.def		= "1",
		.help		= "Protection Information Action bit (pi_act=1 or pi_act=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "pi_chk",
		.lname		= "Protection Information Check (GUARD|REFTAG|APPTAG)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, pi_chk),
		.def		= NULL,
		.help		= "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "md_per_io_size",
		.lname		= "Separate Metadata Buffer Size per I/O",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, md_per_io_size),
		.def		= "4096",
		.help		= "Size of separate metadata buffer per I/O (Default: 4096)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag",
		.lname		= "Application Tag used in Protection Information",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag),
		.def		= "0x1234",
		.help		= "Application Tag used in Protection Information field (Default: 0x1234)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "apptag_mask",
		.lname		= "Application Tag Mask",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, apptag_mask),
		.def		= "0xffff",
		.help		= "Application Tag Mask used with Application Tag (Default: 0xffff)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "digest_enable",
		.lname		= "PDU digest choice for NVMe/TCP transport (NONE|HEADER|DATA|BOTH)",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, digest_enable),
		.def		= NULL,
		.help		= "Control NVMe/TCP PDU digests (digest_enable=NONE|HEADER|DATA|BOTH)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "enable_vmd",
		.lname		= "Enable VMD enumeration",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, enable_vmd),
		.def		= "0",
		.help		= "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "initial_zone_reset",
		.lname		= "Reset Zones on initialization",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, initial_zone_reset),
		.def		= "0",
		.help		= "Reset Zones on initialization (0=disable, 1=Reset All Zones)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "zone_append",
		.lname		= "Use zone append instead of write",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, zone_append),
		.def		= "0",
		.help		= "Use zone append instead of write (1=zone append, 0=write)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "print_qid_mappings",
		.lname		= "Print job-to-qid mappings",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, print_qid_mappings),
		.def		= "0",
		.help		= "Print job-to-qid mappings (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "log_flags",
		.lname		= "log_flags",
		.type		= FIO_OPT_STR_STORE,
		.off1		= offsetof(struct spdk_fio_options, log_flags),
		.help		= "Enable log flags (comma-separated list)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= "spdk_tracing",
		.lname		= "Enable SPDK Tracing",
		.type		= FIO_OPT_INT,
		.off1		= offsetof(struct spdk_fio_options, spdk_tracing),
		.def		= "0",
		.help		= "SPDK Tracing (0=disable, 1=enable)",
		.category	= FIO_OPT_C_ENGINE,
		.group		= FIO_OPT_G_INVALID,
	},
	{
		.name		= NULL,
	},
};
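
/* Illustrative job-file sketch showing how the options above are consumed
 * by fio; all values, including the PCIe address, are placeholders rather
 * than tuning advice:
 *
 *	[global]
 *	ioengine=spdk
 *	thread=1
 *	enable_sgl=1
 *	sge_size=8192
 *
 *	[job0]
 *	filename=trtype=PCIe traddr=0000.04.00.0 ns=1
 *	rw=randread
 *	bs=4096
 */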

/* FIO imports this structure using dlsym */
struct ioengine_ops ioengine = {
	.name			= "spdk",
	.version		= FIO_IOOPS_VERSION,
	.queue			= spdk_fio_queue,
	.getevents		= spdk_fio_getevents,
	.event			= spdk_fio_event,
	.cleanup		= spdk_fio_cleanup,
	.open_file		= spdk_fio_open,
	.close_file		= spdk_fio_close,
	.invalidate		= spdk_fio_invalidate,
	.iomem_alloc		= spdk_fio_iomem_alloc,
	.iomem_free		= spdk_fio_iomem_free,
	.setup			= spdk_fio_setup,
	.init			= spdk_fio_init,
	.io_u_init		= spdk_fio_io_u_init,
	.io_u_free		= spdk_fio_io_u_free,
#if FIO_HAS_ZBD
	.get_zoned_model	= spdk_fio_get_zoned_model,
	.report_zones		= spdk_fio_report_zones,
	.reset_wp		= spdk_fio_reset_wp,
#endif
#if FIO_IOOPS_VERSION >= 30
	.get_max_open_zones	= spdk_fio_get_max_open_zones,
#endif
#if FIO_HAS_FDP
	.fdp_fetch_ruhs		= spdk_fio_fdp_fetch_ruhs,
#endif
#if FIO_HAS_MRT
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO | FIO_MULTI_RANGE_TRIM,
#else
	.flags			= FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
#endif
	.options		= options,
	.option_struct_size	= sizeof(struct spdk_fio_options),
};
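
/* Illustrative invocation (the plugin path and PCI address are placeholders):
 * fio resolves the "ioengine" symbol from this shared object once it is
 * preloaded, e.g.
 *
 *	LD_PRELOAD=<spdk>/build/fio/spdk_nvme fio --name=job0 --ioengine=spdk \
 *		--filename='trtype=PCIe traddr=0000.04.00.0 ns=1' \
 *		--thread=1 --direct=1 --rw=randread --bs=4096
 */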

static void fio_init
fio_spdk_register(void)
{
	register_ioengine(&ioengine);
}

static void fio_exit
fio_spdk_unregister(void)
{
	if (g_spdk_env_initialized) {
		spdk_trace_cleanup();
		spdk_env_fini();
	}

	unregister_ioengine(&ioengine);
}

SPDK_LOG_REGISTER_COMPONENT(fio_nvme)