xref: /spdk/lib/nvme/nvme_pcie_common.c (revision 1efa1b16d579b0c09bcbf26a84140cbbcf88d9df)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2021 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 /*
8  * NVMe over PCIe common library
9  */
10 
11 #include "spdk/stdinc.h"
12 #include "spdk/likely.h"
13 #include "spdk/string.h"
14 #include "nvme_internal.h"
15 #include "nvme_pcie_internal.h"
16 #include "spdk/trace.h"
17 
18 #include "spdk_internal/trace_defs.h"
19 
20 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
21 
22 static struct spdk_nvme_pcie_stat g_dummy_stat = {};
23 
24 static void nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair,
25 		struct nvme_tracker *tr);
26 
27 static inline uint64_t
28 nvme_pcie_vtophys(struct spdk_nvme_ctrlr *ctrlr, const void *buf, uint64_t *size)
29 {
30 	if (spdk_likely(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) {
31 		return spdk_vtophys(buf, size);
32 	} else {
33 		/* vfio-user address translation with IOVA=VA mode */
34 		return (uint64_t)(uintptr_t)buf;
35 	}
36 }
37 
38 int
39 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
40 {
41 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
42 	uint32_t i;
43 
44 	/* All head and tail values are reset to 0. */
45 	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;
46 
47 	/*
48 	 * First time through the completion queue, HW will set phase
49 	 *  bit on completions to 1.  So set this to 1 here, indicating
50 	 *  we're looking for a 1 to know which entries have completed.
51 	 *  We'll toggle the bit each time the completion queue
52 	 *  rolls over.
53 	 */
54 	pqpair->flags.phase = 1;
55 	for (i = 0; i < pqpair->num_entries; i++) {
56 		pqpair->cpl[i].status.p = 0;
57 	}
58 
59 	return 0;
60 }
61 
62 int
63 nvme_pcie_qpair_get_fd(struct spdk_nvme_qpair *qpair, struct spdk_event_handler_opts *opts)
64 {
65 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
66 	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
67 
68 	assert(devhandle != NULL);
69 	if (!ctrlr->opts.enable_interrupts) {
70 		return -1;
71 	}
72 
73 	if (!opts) {
74 		return spdk_pci_device_get_interrupt_efd_by_index(devhandle, qpair->id);
75 	}
76 
77 	if (!SPDK_FIELD_VALID(opts, fd_type, opts->opts_size)) {
78 		return -EINVAL;
79 	}
80 
81 	spdk_fd_group_get_default_event_handler_opts(opts, opts->opts_size);
82 	opts->fd_type = SPDK_FD_TYPE_EVENTFD;
83 
84 	return spdk_pci_device_get_interrupt_efd_by_index(devhandle, qpair->id);
85 }
86 
87 static void
88 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
89 {
90 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
91 	tr->cid = cid;
92 	tr->req = NULL;
93 }
94 
95 static void *
96 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
97 			  uint64_t *phys_addr)
98 {
99 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
100 	uintptr_t addr;
101 
102 	if (pctrlr->cmb.mem_register_addr != NULL) {
103 		/* BAR is mapped for data */
104 		return NULL;
105 	}
106 
107 	addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
108 	addr = (addr + (alignment - 1)) & ~(alignment - 1);
109 
110 	/* The CMB may consume only part of the BAR; bound the allocation accordingly. */
111 	if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
112 		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
113 		return NULL;
114 	}
115 	*phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;
116 
117 	pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;
118 
119 	return (void *)addr;
120 }
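/*
 * Illustrative sketch of the round-up above (hypothetical numbers): for a power-of-two
 * alignment, the expression (addr + (alignment - 1)) & ~(alignment - 1) rounds up to the
 * next boundary.  With alignment = 0x1000, an offset of 0x1234 becomes
 * (0x1234 + 0xFFF) & ~0xFFF = 0x2000, while an already-aligned 0x2000 is left unchanged.
 */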
121 
122 int
123 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
124 			  const struct spdk_nvme_io_qpair_opts *opts)
125 {
126 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
127 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
128 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
129 	struct nvme_tracker	*tr;
130 	uint16_t		i;
131 	uint16_t		num_trackers;
132 	size_t			page_align = sysconf(_SC_PAGESIZE);
133 	size_t			queue_align, queue_len;
134 	uint32_t                flags = SPDK_MALLOC_DMA;
135 	int32_t			numa_id;
136 	uint64_t		sq_paddr = 0;
137 	uint64_t		cq_paddr = 0;
138 
139 	if (opts) {
140 		pqpair->sq_vaddr = opts->sq.vaddr;
141 		pqpair->cq_vaddr = opts->cq.vaddr;
142 		pqpair->flags.disable_pcie_sgl_merge = opts->disable_pcie_sgl_merge;
143 		sq_paddr = opts->sq.paddr;
144 		cq_paddr = opts->cq.paddr;
145 	}
146 
147 	pqpair->retry_count = ctrlr->opts.transport_retry_count;
148 
149 	/*
150 	 * Limit the maximum number of completions to return per call to prevent wraparound,
151 	 * and calculate how many trackers can be submitted at once without overflowing the
152 	 * completion queue.
153 	 */
154 	pqpair->max_completions_cap = pqpair->num_entries / 4;
155 	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
156 	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
157 	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
158 
159 	SPDK_INFOLOG(nvme, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
160 		     pqpair->max_completions_cap, num_trackers);
161 
162 	assert(num_trackers != 0);
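	/*
	 * Worked example (hypothetical values, for illustration only): with
	 * num_entries = 256, num_entries / 4 = 64; assuming 64 falls within
	 * [NVME_MIN_COMPLETIONS, NVME_MAX_COMPLETIONS], max_completions_cap = 64 and
	 * num_trackers = 256 - 64 = 192, i.e. at most 192 commands may be outstanding
	 * while still leaving room to reap a full completion batch without overflowing
	 * the completion queue.
	 */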
163 
164 	pqpair->sq_in_cmb = false;
165 
166 	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
167 		flags |= SPDK_MALLOC_SHARE;
168 	}
169 
170 	/* cmd and cpl rings must be aligned on page size boundaries. */
171 	if (ctrlr->opts.use_cmb_sqs) {
172 		pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
173 							page_align, &pqpair->cmd_bus_addr);
174 		if (pqpair->cmd != NULL) {
175 			pqpair->sq_in_cmb = true;
176 		}
177 	}
178 
179 	if (pqpair->sq_in_cmb == false) {
180 		if (pqpair->sq_vaddr) {
181 			pqpair->cmd = pqpair->sq_vaddr;
182 		} else {
183 			/* To ensure physical address contiguity we make each ring occupy
184 			 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
185 			 */
186 			queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
187 			queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
188 			pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_NUMA_ID_ANY, flags);
189 			if (pqpair->cmd == NULL) {
190 				SPDK_ERRLOG("alloc qpair_cmd failed\n");
191 				return -ENOMEM;
192 			}
193 		}
194 		if (sq_paddr) {
195 			assert(pqpair->sq_vaddr != NULL);
196 			pqpair->cmd_bus_addr = sq_paddr;
197 		} else {
198 			pqpair->cmd_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cmd, NULL);
199 			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
200 				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
201 				return -EFAULT;
202 			}
203 		}
204 	}
205 
206 	if (pqpair->cq_vaddr) {
207 		pqpair->cpl = pqpair->cq_vaddr;
208 	} else {
209 		queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
210 		queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
211 		numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
212 		pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, numa_id, flags);
213 		if (pqpair->cpl == NULL) {
214 			SPDK_ERRLOG("alloc qpair_cpl failed\n");
215 			return -ENOMEM;
216 		}
217 	}
218 	if (cq_paddr) {
219 		assert(pqpair->cq_vaddr != NULL);
220 		pqpair->cpl_bus_addr = cq_paddr;
221 	} else {
222 		pqpair->cpl_bus_addr =  nvme_pcie_vtophys(ctrlr, pqpair->cpl, NULL);
223 		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
224 			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
225 			return -EFAULT;
226 		}
227 	}
228 
229 	pqpair->sq_tdbl = pctrlr->doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
230 	pqpair->cq_hdbl = pctrlr->doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
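	/*
	 * Illustrative note (assumes the minimum doorbell stride, CAP.DSTRD = 0, so
	 * doorbell_stride_u32 = 1): qpair 1 would use doorbell_base[2] as its SQ tail
	 * doorbell and doorbell_base[3] as its CQ head doorbell, matching the
	 * SQyTDBL/CQyHDBL layout of 0x1000 + (2y + 0/1) * (4 << DSTRD) in the NVMe
	 * specification.
	 */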
231 
232 	/*
233 	 * Reserve space for all of the trackers in a single allocation.
234 	 *   struct nvme_tracker must be padded so that its size is already a power of 2.
235 	 *   This ensures the PRP list embedded in the nvme_tracker object will not span a
236 	 *   4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
237 	 */
238 	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
239 				  SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
240 	if (pqpair->tr == NULL) {
241 		SPDK_ERRLOG("nvme_tr failed\n");
242 		return -ENOMEM;
243 	}
244 
245 	TAILQ_INIT(&pqpair->free_tr);
246 	TAILQ_INIT(&pqpair->outstanding_tr);
247 	pqpair->qpair.queue_depth = 0;
248 
249 	for (i = 0; i < num_trackers; i++) {
250 		tr = &pqpair->tr[i];
251 		nvme_qpair_construct_tracker(tr, i, nvme_pcie_vtophys(ctrlr, tr, NULL));
252 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
253 	}
254 
255 	nvme_pcie_qpair_reset(qpair);
256 
257 	return 0;
258 }
259 
260 int
261 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
262 {
263 	struct nvme_pcie_qpair *pqpair;
264 	int rc;
265 
266 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
267 	if (pqpair == NULL) {
268 		return -ENOMEM;
269 	}
270 
271 	pqpair->num_entries = num_entries;
272 	pqpair->flags.delay_cmd_submit = 0;
273 	pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
274 
275 	ctrlr->adminq = &pqpair->qpair;
276 
277 	rc = nvme_qpair_init(ctrlr->adminq,
278 			     0, /* qpair ID */
279 			     ctrlr,
280 			     SPDK_NVME_QPRIO_URGENT,
281 			     num_entries,
282 			     false);
283 	if (rc != 0) {
284 		return rc;
285 	}
286 
287 	pqpair->stat = spdk_zmalloc(sizeof(*pqpair->stat), 64, NULL, SPDK_ENV_NUMA_ID_ANY,
288 				    SPDK_MALLOC_SHARE);
289 	if (!pqpair->stat) {
290 		SPDK_ERRLOG("Failed to allocate admin qpair statistics\n");
291 		return -ENOMEM;
292 	}
293 
294 	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
295 }
296 
297 /**
298  * Note: the ctrlr_lock must be held when calling this function.
299  */
300 void
301 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
302 		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
303 {
304 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
305 	struct nvme_request		*active_req = req;
306 	struct spdk_nvme_ctrlr_process	*active_proc;
307 
308 	/*
309 	 * The admin request is from another process. Move it to that
310 	 *  process's per-process list so it can be handled later.
311 	 */
312 	assert(nvme_qpair_is_admin_queue(qpair));
313 	assert(active_req->pid != getpid());
314 
315 	active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid);
316 	if (active_proc) {
317 		/* Save the original completion information */
318 		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
319 		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
320 	} else {
321 		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
322 			    active_req->pid);
323 		nvme_cleanup_user_req(active_req);
324 		nvme_free_request(active_req);
325 	}
326 }
327 
328 /**
329  * Note: the ctrlr_lock must be held when calling this function.
330  */
331 void
332 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
333 {
334 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
335 	struct nvme_request		*req, *tmp_req;
336 	pid_t				pid = getpid();
337 	struct spdk_nvme_ctrlr_process	*proc;
338 
339 	/*
340 	 * Check whether there is any pending admin request from
341 	 * other active processes.
342 	 */
343 	assert(nvme_qpair_is_admin_queue(qpair));
344 
345 	proc = nvme_ctrlr_get_current_process(ctrlr);
346 	if (!proc) {
347 		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
348 		assert(proc);
349 		return;
350 	}
351 
352 	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
353 		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
354 
355 		assert(req->pid == pid);
356 
357 		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
358 	}
359 }
360 
361 int
362 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
363 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
364 				 void *cb_arg)
365 {
366 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
367 	struct nvme_request *req;
368 	struct spdk_nvme_cmd *cmd;
369 	bool ien = ctrlr->opts.enable_interrupts;
370 
371 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
372 	if (req == NULL) {
373 		return -ENOMEM;
374 	}
375 
376 	cmd = &req->cmd;
377 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
378 
379 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
380 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
381 
382 	cmd->cdw11_bits.create_io_cq.pc = 1;
383 	if (ien) {
384 		cmd->cdw11_bits.create_io_cq.ien = 1;
385 		/* I/O queue interrupt vectors start at 1, so we map the queue id
386 		 * directly to the interrupt vector.
387 		 */
388 		cmd->cdw11_bits.create_io_cq.iv = io_que->id;
389 	}
390 
391 	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
392 
393 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
394 }
395 
396 int
397 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
398 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
399 {
400 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
401 	struct nvme_request *req;
402 	struct spdk_nvme_cmd *cmd;
403 
404 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
405 	if (req == NULL) {
406 		return -ENOMEM;
407 	}
408 
409 	cmd = &req->cmd;
410 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
411 
412 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
413 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
414 	cmd->cdw11_bits.create_io_sq.pc = 1;
415 	cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
416 	cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
417 	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
418 
419 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
420 }
421 
422 int
423 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
424 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
425 {
426 	struct nvme_request *req;
427 	struct spdk_nvme_cmd *cmd;
428 
429 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
430 	if (req == NULL) {
431 		return -ENOMEM;
432 	}
433 
434 	cmd = &req->cmd;
435 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
436 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
437 
438 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
439 }
440 
441 int
442 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
443 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
444 {
445 	struct nvme_request *req;
446 	struct spdk_nvme_cmd *cmd;
447 
448 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
449 	if (req == NULL) {
450 		return -ENOMEM;
451 	}
452 
453 	cmd = &req->cmd;
454 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
455 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
456 
457 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
458 }
459 
460 static void
461 nvme_completion_sq_error_delete_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
462 {
463 	struct spdk_nvme_qpair *qpair = arg;
464 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
465 
466 	if (spdk_nvme_cpl_is_error(cpl)) {
467 		SPDK_ERRLOG("delete_io_cq failed!\n");
468 	}
469 
470 	pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
471 }
472 
473 static void
474 nvme_completion_create_sq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
475 {
476 	struct spdk_nvme_qpair *qpair = arg;
477 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
478 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
479 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
480 	int rc;
481 
482 	if (pqpair->flags.defer_destruction) {
483 		/* This qpair was deleted by the application while the
484 		 * connection was still in progress.  We had to wait
485 		 * to free the qpair resources until this outstanding
486 		 * command was completed.  Now that we have the completion,
487 		 * free the qpair.
488 		 */
489 		nvme_pcie_qpair_destroy(qpair);
490 		return;
491 	}
492 
493 	if (spdk_nvme_cpl_is_error(cpl)) {
494 		SPDK_ERRLOG("nvme_create_io_sq failed, deleting cq!\n");
495 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
496 						      qpair);
497 		if (rc != 0) {
498 			SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
499 			pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
500 		}
501 		return;
502 	}
503 	pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
504 	if (ctrlr->shadow_doorbell) {
505 		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
506 						  pctrlr->doorbell_stride_u32;
507 		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
508 						  pctrlr->doorbell_stride_u32;
509 		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
510 						      pctrlr->doorbell_stride_u32;
511 		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
512 						      pctrlr->doorbell_stride_u32;
513 		pqpair->flags.has_shadow_doorbell = 1;
514 	} else {
515 		pqpair->flags.has_shadow_doorbell = 0;
516 	}
517 	nvme_pcie_qpair_reset(qpair);
518 
519 }
520 
521 static void
522 nvme_completion_create_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
523 {
524 	struct spdk_nvme_qpair *qpair = arg;
525 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
526 	int rc;
527 
528 	if (pqpair->flags.defer_destruction) {
529 		/* This qpair was deleted by the application while the
530 		 * connection was still in progress.  We had to wait
531 		 * to free the qpair resources until this outstanding
532 		 * command was completed.  Now that we have the completion,
533 		 * free the qpair.
534 		 */
535 		nvme_pcie_qpair_destroy(qpair);
536 		return;
537 	}
538 
539 	if (spdk_nvme_cpl_is_error(cpl)) {
540 		pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
541 		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
542 		return;
543 	}
544 
545 	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_create_sq_cb, qpair);
546 
547 	if (rc != 0) {
548 		SPDK_ERRLOG("Failed to send request to create_io_sq, deleting cq!\n");
549 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
550 						      qpair);
551 		if (rc != 0) {
552 			SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
553 			pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
554 		}
555 		return;
556 	}
557 	pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_SQ;
558 }
559 
560 static int
561 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
562 				 uint16_t qid)
563 {
564 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
565 	int	rc;
566 
567 	/* Statistics may already be allocated in the case of controller reset */
568 	if (qpair->poll_group) {
569 		struct nvme_pcie_poll_group *group = SPDK_CONTAINEROF(qpair->poll_group,
570 						     struct nvme_pcie_poll_group, group);
571 
572 		pqpair->stat = &group->stats;
573 		pqpair->shared_stats = true;
574 	} else {
575 		if (pqpair->stat == NULL) {
576 			pqpair->stat = calloc(1, sizeof(*pqpair->stat));
577 			if (!pqpair->stat) {
578 				SPDK_ERRLOG("Failed to allocate qpair statistics\n");
579 				nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
580 				return -ENOMEM;
581 			}
582 		}
583 	}
584 
585 	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_create_cq_cb, qpair);
586 
587 	if (rc != 0) {
588 		SPDK_ERRLOG("Failed to send request to create_io_cq\n");
589 		nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
590 		return rc;
591 	}
592 	pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_CQ;
593 	return 0;
594 }
595 
596 int
597 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
598 {
599 	int rc = 0;
600 
601 	if (!nvme_qpair_is_admin_queue(qpair)) {
602 		rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
603 	} else {
604 		nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
605 	}
606 
607 	return rc;
608 }
609 
610 void
611 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
612 {
613 	if (!nvme_qpair_is_admin_queue(qpair) || !ctrlr->is_disconnecting) {
614 		nvme_transport_ctrlr_disconnect_qpair_done(qpair);
615 	} else {
616 		/* If this function is called for the admin qpair via spdk_nvme_ctrlr_reset()
617 		 * or spdk_nvme_ctrlr_disconnect(), initiate a Controller Level Reset.
618 		 * Then we can abort trackers safely because the Controller Level Reset deletes
619 		 * all I/O SQ/CQs.
620 		 */
621 		nvme_ctrlr_disable(ctrlr);
622 	}
623 }
624 
625 /* Used when dst points to MMIO (i.e., the CMB) in a virtual machine. In that case we must
626  * not use wide instructions because QEMU will not emulate such instructions to MMIO space.
627  * So this function ensures we only copy 8 bytes at a time.
628  */
629 static inline void
630 nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
631 {
632 	uint64_t *dst64 = (uint64_t *)dst;
633 	const uint64_t *src64 = (const uint64_t *)src;
634 	uint32_t i;
635 
636 	for (i = 0; i < sizeof(*dst) / 8; i++) {
637 		dst64[i] = src64[i];
638 	}
639 }
640 
641 static inline void
642 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
643 {
644 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
645 #if defined(__SSE2__)
646 	__m128i *d128 = (__m128i *)dst;
647 	const __m128i *s128 = (const __m128i *)src;
648 
649 	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
650 	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
651 	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
652 	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
653 #else
654 	*dst = *src;
655 #endif
656 }
657 
658 void
659 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
660 {
661 	struct nvme_request	*req;
662 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
663 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
664 
665 	req = tr->req;
666 	assert(req != NULL);
667 
668 	spdk_trace_record(TRACE_NVME_PCIE_SUBMIT, qpair->id, 0, (uintptr_t)req, req->cb_arg,
669 			  (uint32_t)req->cmd.cid, (uint32_t)req->cmd.opc,
670 			  req->cmd.cdw10, req->cmd.cdw11, req->cmd.cdw12,
671 			  pqpair->qpair.queue_depth);
672 
673 	if (req->cmd.fuse) {
674 		/*
675 		 * Keep track of the fuse operation sequence so that we ring the doorbell only
676 		 * after the second fuse is submitted.
677 		 */
678 		qpair->last_fuse = req->cmd.fuse;
679 	}
680 
681 	/* Don't use wide instructions to copy the NVMe command; the QEMU virtual
682 	 * NVMe controller limits the maximum access width to 8 bytes at a time.
683 	 */
684 	if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
685 		nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
686 	} else {
687 		/* Copy the command from the tracker to the submission queue. */
688 		nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
689 	}
690 
691 	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
692 		pqpair->sq_tail = 0;
693 	}
694 
695 	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
696 		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
697 	}
698 
699 	if (!pqpair->flags.delay_cmd_submit) {
700 		nvme_pcie_qpair_ring_sq_doorbell(qpair);
701 	}
702 }
703 
704 void
705 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
706 				 struct spdk_nvme_cpl *cpl, bool print_on_error)
707 {
708 	struct nvme_pcie_qpair		*pqpair = nvme_pcie_qpair(qpair);
709 	struct nvme_request		*req;
710 	bool				retry, error;
711 	bool				print_error;
712 
713 	req = tr->req;
714 
715 	spdk_trace_record(TRACE_NVME_PCIE_COMPLETE, qpair->id, 0, (uintptr_t)req, req->cb_arg,
716 			  (uint32_t)req->cmd.cid, (uint32_t)cpl->status_raw, pqpair->qpair.queue_depth);
717 
718 	assert(req != NULL);
719 
720 	error = spdk_nvme_cpl_is_error(cpl);
721 	retry = error && nvme_completion_is_retry(cpl) &&
722 		req->retries < pqpair->retry_count;
723 	print_error = error && print_on_error && !qpair->ctrlr->opts.disable_error_logging;
724 
725 	if (print_error) {
726 		spdk_nvme_qpair_print_command(qpair, &req->cmd);
727 	}
728 
729 	if (print_error || SPDK_DEBUGLOG_FLAG_ENABLED("nvme")) {
730 		spdk_nvme_qpair_print_completion(qpair, cpl);
731 	}
732 
733 	assert(cpl->cid == req->cmd.cid);
734 
735 	if (retry) {
736 		req->retries++;
737 		nvme_pcie_qpair_submit_tracker(qpair, tr);
738 	} else {
739 		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
740 		pqpair->qpair.queue_depth--;
741 
742 		/* Only check admin requests from different processes. */
743 		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
744 			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
745 		} else {
746 			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
747 		}
748 
749 		tr->req = NULL;
750 
751 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
752 	}
753 }
754 
755 void
756 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
757 					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
758 					bool print_on_error)
759 {
760 	struct spdk_nvme_cpl	cpl;
761 
762 	memset(&cpl, 0, sizeof(cpl));
763 	cpl.sqid = qpair->id;
764 	cpl.cid = tr->cid;
765 	cpl.status.sct = sct;
766 	cpl.status.sc = sc;
767 	cpl.status.dnr = dnr;
768 	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
769 }
770 
771 void
772 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
773 {
774 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
775 	struct nvme_tracker *tr, *temp, *last;
776 
777 	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);
778 
779 	/* Abort previously submitted (outstanding) trackers. */
780 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
781 		if (!qpair->ctrlr->opts.disable_error_logging) {
782 			SPDK_ERRLOG("aborting outstanding command\n");
783 		}
784 		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
785 							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
786 
787 		if (tr == last) {
788 			break;
789 		}
790 	}
791 }
792 
793 void
794 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
795 {
796 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
797 	struct nvme_tracker	*tr;
798 
799 	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
800 	while (tr != NULL) {
801 		assert(tr->req != NULL);
802 		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
803 			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
804 								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
805 								false);
806 			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
807 		} else {
808 			tr = TAILQ_NEXT(tr, tq_list);
809 		}
810 	}
811 }
812 
813 void
814 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
815 {
816 	nvme_pcie_admin_qpair_abort_aers(qpair);
817 }
818 
819 void
820 nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
821 {
822 	nvme_pcie_qpair_abort_trackers(qpair, dnr);
823 }
824 
825 static void
826 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
827 {
828 	uint64_t t02;
829 	struct nvme_tracker *tr, *tmp;
830 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
831 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
832 	struct spdk_nvme_ctrlr_process *active_proc;
833 
834 	/* Don't check timeouts during controller initialization. */
835 	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
836 		return;
837 	}
838 
839 	if (nvme_qpair_is_admin_queue(qpair)) {
840 		active_proc = nvme_ctrlr_get_current_process(ctrlr);
841 	} else {
842 		active_proc = qpair->active_proc;
843 	}
844 
845 	/* Only check timeouts if the current process has a timeout callback. */
846 	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
847 		return;
848 	}
849 
850 	t02 = spdk_get_ticks();
851 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
852 		assert(tr->req != NULL);
853 
854 		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
855 			/*
856 			 * The requests are in order, so as soon as one has not timed out,
857 			 * stop iterating.
858 			 */
859 			break;
860 		}
861 	}
862 }
863 
864 int32_t
865 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
866 {
867 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
868 	struct nvme_tracker	*tr;
869 	struct spdk_nvme_cpl	*cpl, *next_cpl;
870 	uint32_t		 num_completions = 0;
871 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
872 	uint16_t		 next_cq_head;
873 	uint8_t			 next_phase;
874 	bool			 next_is_valid = false;
875 	int			 rc;
876 
877 	if (spdk_unlikely(pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED)) {
878 		return -ENXIO;
879 	}
880 
881 	if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) {
882 		if (pqpair->pcie_state == NVME_PCIE_QPAIR_READY) {
883 			/* It is possible that another thread set the pcie_state to
884 			 * QPAIR_READY, if it polled the adminq and processed the SQ
885 			 * completion for this qpair.  So check for that condition
886 			 * here and then update the qpair's state to CONNECTED, since
887 			 * we can only set the qpair state from the qpair's thread.
888 			 * (Note: this fixed issue #2157.)
889 			 */
890 			nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
891 		} else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
892 			nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
893 			return -ENXIO;
894 		} else {
895 			rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
896 			if (rc < 0) {
897 				return rc;
898 			} else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
899 				nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
900 				return -ENXIO;
901 			}
902 		}
903 		return 0;
904 	}
905 
906 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
907 		nvme_ctrlr_lock(ctrlr);
908 	}
909 
910 	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
911 		/*
912 		 * max_completions == 0 means unlimited, but complete at most one
913 		 * batch of max_completions_cap completions at a time so that the
914 		 * completion queue doorbell doesn't wrap around.
915 		 */
916 		max_completions = pqpair->max_completions_cap;
917 	}
918 
919 	pqpair->stat->polls++;
920 
921 	while (1) {
922 		cpl = &pqpair->cpl[pqpair->cq_head];
923 
924 		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
925 			break;
926 		}
927 
928 		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
929 			next_cq_head = pqpair->cq_head + 1;
930 			next_phase = pqpair->flags.phase;
931 		} else {
932 			next_cq_head = 0;
933 			next_phase = !pqpair->flags.phase;
934 		}
935 		next_cpl = &pqpair->cpl[next_cq_head];
936 		next_is_valid = (next_cpl->status.p == next_phase);
937 		if (next_is_valid) {
938 			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
939 		}
940 
941 #if defined(__PPC64__) || defined(__riscv) || defined(__loongarch__)
942 		/*
943 		 * This memory barrier prevents reordering of:
944 		 * - load after store from/to tr
945 		 * - load after load cpl phase and cpl cid
946 		 */
947 		spdk_mb();
948 #elif defined(__aarch64__)
949 		__asm volatile("dmb oshld" ::: "memory");
950 #endif
951 
952 		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
953 			pqpair->cq_head = 0;
954 			pqpair->flags.phase = !pqpair->flags.phase;
955 		}
956 
957 		tr = &pqpair->tr[cpl->cid];
958 		pqpair->sq_head = cpl->sqhd;
959 
960 		if (tr->req) {
961 			/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
962 			 * as part of putting the req back on the qpair's free list.
963 			 */
964 			__builtin_prefetch(&tr->req->stailq);
965 			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
966 		} else {
967 			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
968 			spdk_nvme_qpair_print_completion(qpair, cpl);
969 			assert(0);
970 		}
971 
972 		if (++num_completions == max_completions) {
973 			break;
974 		}
975 	}
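	/*
	 * Phase-bit sketch (illustrative): on the first pass the controller posts
	 * completions with P = 1, matching flags.phase = 1 set in
	 * nvme_pcie_qpair_reset().  When cq_head wraps to 0 above, flags.phase is
	 * toggled to 0, and on its next pass the controller posts completions with
	 * P = 0, so stale entries from the previous pass are never mistaken for new ones.
	 */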
976 
977 	if (num_completions > 0) {
978 		pqpair->stat->completions += num_completions;
979 		nvme_pcie_qpair_ring_cq_doorbell(qpair);
980 	} else {
981 		pqpair->stat->idle_polls++;
982 	}
983 
984 	if (pqpair->flags.delay_cmd_submit) {
985 		if (pqpair->last_sq_tail != pqpair->sq_tail) {
986 			nvme_pcie_qpair_ring_sq_doorbell(qpair);
987 			pqpair->last_sq_tail = pqpair->sq_tail;
988 		}
989 	}
990 
991 	if (spdk_unlikely(ctrlr->timeout_enabled)) {
992 		/*
993 		 * User registered for timeout callback
994 		 */
995 		nvme_pcie_qpair_check_timeout(qpair);
996 	}
997 
998 	/* Before returning, complete any pending admin request or
999 	 * process the admin qpair disconnection.
1000 	 */
1001 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1002 		nvme_pcie_qpair_complete_pending_admin_request(qpair);
1003 
1004 		if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) {
1005 			rc = nvme_ctrlr_disable_poll(qpair->ctrlr);
1006 			if (rc != -EAGAIN) {
1007 				nvme_transport_ctrlr_disconnect_qpair_done(qpair);
1008 			}
1009 		}
1010 
1011 		nvme_ctrlr_unlock(ctrlr);
1012 	}
1013 
1014 	if (spdk_unlikely(pqpair->flags.has_pending_vtophys_failures)) {
1015 		struct nvme_tracker *tr, *tmp;
1016 
1017 		TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
1018 			if (tr->bad_vtophys) {
1019 				tr->bad_vtophys = 0;
1020 				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1021 			}
1022 		}
1023 		pqpair->flags.has_pending_vtophys_failures = 0;
1024 	}
1025 
1026 	return num_completions;
1027 }
1028 
1029 int
1030 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1031 {
1032 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1033 
1034 	if (nvme_qpair_is_admin_queue(qpair)) {
1035 		nvme_pcie_admin_qpair_destroy(qpair);
1036 	}
1037 	/*
1038 	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
1039 	 * buffers when creating the I/O queue.
1040 	 * If the user specified them, we cannot free that memory.
1041 	 * Nor do we free it if it's in the CMB.
1042 	 */
1043 	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
1044 		spdk_free(pqpair->cmd);
1045 	}
1046 	if (!pqpair->cq_vaddr && pqpair->cpl) {
1047 		spdk_free(pqpair->cpl);
1048 	}
1049 	if (pqpair->tr) {
1050 		spdk_free(pqpair->tr);
1051 	}
1052 
1053 	nvme_qpair_deinit(qpair);
1054 
1055 	if (!pqpair->shared_stats && (!qpair->active_proc ||
1056 				      qpair->active_proc == nvme_ctrlr_get_current_process(qpair->ctrlr))) {
1057 		if (qpair->id) {
1058 			free(pqpair->stat);
1059 		} else {
1060 			/* Admin qpair statistics are allocated from huge pages because
1061 			 * the admin qpair is shared across processes. */
1062 			spdk_free(pqpair->stat);
1063 		}
1064 
1065 	}
1066 
1067 	spdk_free(pqpair);
1068 
1069 	return 0;
1070 }
1071 
1072 struct spdk_nvme_qpair *
1073 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1074 				const struct spdk_nvme_io_qpair_opts *opts)
1075 {
1076 	struct nvme_pcie_qpair *pqpair;
1077 	struct spdk_nvme_qpair *qpair;
1078 	int rc;
1079 
1080 	assert(ctrlr != NULL);
1081 
1082 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1083 			      SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
1084 	if (pqpair == NULL) {
1085 		return NULL;
1086 	}
1087 
1088 	pqpair->num_entries = opts->io_queue_size;
1089 	pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;
1090 
1091 	qpair = &pqpair->qpair;
1092 
1093 	rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests, opts->async_mode);
1094 	if (rc != 0) {
1095 		nvme_pcie_qpair_destroy(qpair);
1096 		return NULL;
1097 	}
1098 
1099 	rc = nvme_pcie_qpair_construct(qpair, opts);
1100 
1101 	if (rc != 0) {
1102 		nvme_pcie_qpair_destroy(qpair);
1103 		return NULL;
1104 	}
1105 
1106 	return qpair;
1107 }
1108 
1109 int
1110 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1111 {
1112 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1113 	struct nvme_completion_poll_status *status;
1114 	int rc;
1115 
1116 	assert(ctrlr != NULL);
1117 
1118 	if (ctrlr->is_removed) {
1119 		goto free;
1120 	}
1121 
1122 	if (ctrlr->prepare_for_reset) {
1123 		if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING) {
1124 			pqpair->flags.defer_destruction = true;
1125 		}
1126 		goto clear_shadow_doorbells;
1127 	}
1128 
1129 	/* If attempting to delete a qpair that's still being connected, we have to wait until it's
1130 	 * finished, so that we don't free it while it's waiting for the create cq/sq callbacks.
1131 	 */
1132 	while (pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_CQ ||
1133 	       pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_SQ) {
1134 		rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1135 		if (rc < 0) {
1136 			break;
1137 		}
1138 	}
1139 
1140 	status = calloc(1, sizeof(*status));
1141 	if (!status) {
1142 		SPDK_ERRLOG("Failed to allocate status tracker\n");
1143 		goto free;
1144 	}
1145 
1146 	/* Delete the I/O submission queue */
1147 	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
1148 	if (rc != 0) {
1149 		SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1150 		free(status);
1151 		goto free;
1152 	}
1153 	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1154 		if (!status->timed_out) {
1155 			free(status);
1156 		}
1157 		goto free;
1158 	}
1159 
1160 	/* Now that the submission queue is deleted, the device is supposed to have
1161 	 * completed any outstanding I/O. Try to complete them. If they don't complete,
1162 	 * they'll be marked as aborted and completed below. */
1163 	if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) {
1164 		nvme_pcie_qpair_process_completions(qpair, 0);
1165 	}
1166 
1167 	memset(status, 0, sizeof(*status));
1168 	/* Delete the completion queue */
1169 	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
1170 	if (rc != 0) {
1171 		SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1172 		free(status);
1173 		goto free;
1174 	}
1175 	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1176 		if (!status->timed_out) {
1177 			free(status);
1178 		}
1179 		goto free;
1180 	}
1181 	free(status);
1182 
1183 clear_shadow_doorbells:
1184 	if (pqpair->flags.has_shadow_doorbell && ctrlr->shadow_doorbell) {
1185 		*pqpair->shadow_doorbell.sq_tdbl = 0;
1186 		*pqpair->shadow_doorbell.cq_hdbl = 0;
1187 		*pqpair->shadow_doorbell.sq_eventidx = 0;
1188 		*pqpair->shadow_doorbell.cq_eventidx = 0;
1189 	}
1190 free:
1191 	if (qpair->no_deletion_notification_needed == 0) {
1192 		/* Abort the rest of the I/O */
1193 		nvme_pcie_qpair_abort_trackers(qpair, 1);
1194 	}
1195 
1196 	if (!pqpair->flags.defer_destruction) {
1197 		nvme_pcie_qpair_destroy(qpair);
1198 	}
1199 	return 0;
1200 }
1201 
1202 static void
1203 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1204 {
1205 	if (!qpair->in_completion_context) {
1206 		struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1207 
1208 		tr->bad_vtophys = 1;
1209 		pqpair->flags.has_pending_vtophys_failures = 1;
1210 		return;
1211 	}
1212 
1213 	/*
1214 	 * Bad vtophys translation, so abort this request and return
1215 	 *  immediately.
1216 	 */
1217 	SPDK_ERRLOG("vtophys or other payload buffer related error\n");
1218 	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1219 						SPDK_NVME_SC_INVALID_FIELD,
1220 						1 /* do not retry */, true);
1221 }
1222 
1223 /*
1224  * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1225  *
1226  * *prp_index will be updated to account for the number of PRP entries used.
1227  */
1228 static inline int
1229 nvme_pcie_prp_list_append(struct spdk_nvme_ctrlr *ctrlr, struct nvme_tracker *tr,
1230 			  uint32_t *prp_index, void *virt_addr, size_t len,
1231 			  uint32_t page_size)
1232 {
1233 	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1234 	uintptr_t page_mask = page_size - 1;
1235 	uint64_t phys_addr;
1236 	uint32_t i;
1237 
1238 	SPDK_DEBUGLOG(nvme, "prp_index:%u virt_addr:%p len:%u\n",
1239 		      *prp_index, virt_addr, (uint32_t)len);
1240 
1241 	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1242 		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1243 		return -EFAULT;
1244 	}
1245 
1246 	i = *prp_index;
1247 	while (len) {
1248 		uint32_t seg_len;
1249 
1250 		/*
1251 		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1252 		 * so an index equal to SPDK_COUNTOF(tr->u.prp) is still valid.
1253 		 */
1254 		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1255 			SPDK_ERRLOG("out of PRP entries\n");
1256 			return -EFAULT;
1257 		}
1258 
1259 		phys_addr = nvme_pcie_vtophys(ctrlr, virt_addr, NULL);
1260 		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1261 			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1262 			return -EFAULT;
1263 		}
1264 
1265 		if (i == 0) {
1266 			SPDK_DEBUGLOG(nvme, "prp1 = %p\n", (void *)phys_addr);
1267 			cmd->dptr.prp.prp1 = phys_addr;
1268 			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1269 		} else {
1270 			if ((phys_addr & page_mask) != 0) {
1271 				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1272 				return -EFAULT;
1273 			}
1274 
1275 			SPDK_DEBUGLOG(nvme, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1276 			tr->u.prp[i - 1] = phys_addr;
1277 			seg_len = page_size;
1278 		}
1279 
1280 		seg_len = spdk_min(seg_len, len);
1281 		virt_addr = (uint8_t *)virt_addr + seg_len;
1282 		len -= seg_len;
1283 		i++;
1284 	}
1285 
1286 	cmd->psdt = SPDK_NVME_PSDT_PRP;
1287 	if (i <= 1) {
1288 		cmd->dptr.prp.prp2 = 0;
1289 	} else if (i == 2) {
1290 		cmd->dptr.prp.prp2 = tr->u.prp[0];
1291 		SPDK_DEBUGLOG(nvme, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1292 	} else {
1293 		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1294 		SPDK_DEBUGLOG(nvme, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1295 	}
1296 
1297 	*prp_index = i;
1298 	return 0;
1299 }
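/*
 * Worked PRP example (hypothetical buffer, for illustration only): a 12 KiB transfer
 * starting 0x200 bytes into a 4 KiB page is described by PRP1 = <first page> + 0x200
 * covering 0xE00 bytes, followed by three page-aligned entries in tr->u.prp[] for the
 * remaining 0x2200 bytes.  Because more than two entries are needed, prp2 is set to
 * tr->prp_sgl_bus_addr so the controller fetches the PRP list from the tracker.
 */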
1300 
1301 static int
1302 nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
1303 				      struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
1304 {
1305 	assert(0);
1306 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1307 	return -EINVAL;
1308 }
1309 
1310 /**
1311  * Build PRP list describing physically contiguous payload buffer.
1312  */
1313 static int
1314 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1315 				     struct nvme_tracker *tr, bool dword_aligned)
1316 {
1317 	uint32_t prp_index = 0;
1318 	int rc;
1319 
1320 	rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index,
1321 				       (uint8_t *)req->payload.contig_or_cb_arg + req->payload_offset,
1322 				       req->payload_size, qpair->ctrlr->page_size);
1323 	if (rc) {
1324 		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1325 	} else {
1326 		SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1327 	}
1328 
1329 	return rc;
1330 }
1331 
1332 /**
1333  * Build an SGL describing a physically contiguous payload buffer.
1334  *
1335  * This is more efficient than using PRP because large buffers can be
1336  * described this way.
1337  */
1338 static int
1339 nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1340 		struct nvme_tracker *tr, bool dword_aligned)
1341 {
1342 	uint8_t *virt_addr;
1343 	uint64_t phys_addr, mapping_length;
1344 	uint32_t length;
1345 	struct spdk_nvme_sgl_descriptor *sgl;
1346 	uint32_t nseg = 0;
1347 
1348 	assert(req->payload_size != 0);
1349 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
1350 
1351 	sgl = tr->u.sgl;
1352 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1353 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1354 
1355 	length = req->payload_size;
1356 	/* UBSan complains about applying a zero offset to a null pointer if contig_or_cb_arg
1357 	 * is NULL, so cast through uintptr_t to silence the warning. */
1358 	virt_addr = (uint8_t *)((uintptr_t)req->payload.contig_or_cb_arg + req->payload_offset);
1359 
1360 	while (length > 0) {
1361 		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1362 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1363 			return -EFAULT;
1364 		}
1365 
1366 		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1367 			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1368 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1369 			return -EFAULT;
1370 		}
1371 
1372 		mapping_length = length;
1373 		phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1374 		if (phys_addr == SPDK_VTOPHYS_ERROR) {
1375 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1376 			return -EFAULT;
1377 		}
1378 
1379 		mapping_length = spdk_min(length, mapping_length);
1380 
1381 		length -= mapping_length;
1382 		virt_addr += mapping_length;
1383 
1384 		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1385 		sgl->unkeyed.length = mapping_length;
1386 		sgl->address = phys_addr;
1387 		sgl->unkeyed.subtype = 0;
1388 
1389 		sgl++;
1390 		nseg++;
1391 	}
1392 
1393 	if (nseg == 1) {
1394 		/*
1395 		 * The whole transfer can be described by a single SGL descriptor.
1396 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1397 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1398 		 *  SGL element into SGL1.
1399 		 */
1400 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1401 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1402 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1403 	} else {
1404 		/* The SPDK NVMe driver supports only one SGL segment for now; this is enough
1405 		 *  because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1406 		 */
1407 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1408 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1409 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1410 	}
1411 
1412 	SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1413 	return 0;
1414 }
1415 
1416 /**
1417  * Build SGL list describing scattered payload buffer.
1418  */
1419 static int
1420 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1421 				     struct nvme_tracker *tr, bool dword_aligned)
1422 {
1423 	int rc;
1424 	void *virt_addr;
1425 	uint64_t phys_addr, mapping_length;
1426 	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1427 	struct spdk_nvme_sgl_descriptor *sgl;
1428 	uint32_t nseg = 0;
1429 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1430 
1431 	/*
1432 	 * Build scattered payloads.
1433 	 */
1434 	assert(req->payload_size != 0);
1435 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1436 	assert(req->payload.reset_sgl_fn != NULL);
1437 	assert(req->payload.next_sge_fn != NULL);
1438 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1439 
1440 	sgl = tr->u.sgl;
1441 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1442 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1443 
1444 	remaining_transfer_len = req->payload_size;
1445 
1446 	while (remaining_transfer_len > 0) {
1447 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
1448 					      &virt_addr, &remaining_user_sge_len);
1449 		if (rc) {
1450 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1451 			return -EFAULT;
1452 		}
1453 
1454 		/* Bit Bucket SGL descriptor */
1455 		if ((uint64_t)virt_addr == UINT64_MAX) {
1456 			/* TODO: enable WRITE and COMPARE when necessary */
1457 			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
1458 				SPDK_ERRLOG("Only READ command can be supported\n");
1459 				goto exit;
1460 			}
1461 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1462 				SPDK_ERRLOG("Too many SGL entries\n");
1463 				goto exit;
1464 			}
1465 
1466 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
1467 			/* If the SGL describes a destination data buffer, the data in that
1468 			 * buffer shall be discarded by the controller, and its length is included
1469 			 * in the Number of Logical Blocks (NLB) parameter. Otherwise, the length
1470 			 * is not included in the NLB parameter.
1471 			 */
1472 			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1473 			remaining_transfer_len -= remaining_user_sge_len;
1474 
1475 			sgl->unkeyed.length = remaining_user_sge_len;
1476 			sgl->address = 0;
1477 			sgl->unkeyed.subtype = 0;
1478 
1479 			sgl++;
1480 			nseg++;
1481 
1482 			continue;
1483 		}
1484 
1485 		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1486 		remaining_transfer_len -= remaining_user_sge_len;
1487 		while (remaining_user_sge_len > 0) {
1488 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1489 				SPDK_ERRLOG("Too many SGL entries\n");
1490 				goto exit;
1491 			}
1492 
1493 			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1494 				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1495 				goto exit;
1496 			}
1497 
1498 			mapping_length = remaining_user_sge_len;
1499 			phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1500 			if (phys_addr == SPDK_VTOPHYS_ERROR) {
1501 				goto exit;
1502 			}
1503 
1504 			length = spdk_min(remaining_user_sge_len, mapping_length);
1505 			remaining_user_sge_len -= length;
1506 			virt_addr = (uint8_t *)virt_addr + length;
1507 
1508 			if (!pqpair->flags.disable_pcie_sgl_merge && nseg > 0 &&
1509 			    phys_addr == (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
1510 				/* extend previous entry */
1511 				(*(sgl - 1)).unkeyed.length += length;
1512 				continue;
1513 			}
1514 
1515 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1516 			sgl->unkeyed.length = length;
1517 			sgl->address = phys_addr;
1518 			sgl->unkeyed.subtype = 0;
1519 
1520 			sgl++;
1521 			nseg++;
1522 		}
1523 	}
1524 
1525 	if (nseg == 1) {
1526 		/*
1527 		 * The whole transfer can be described by a single SGL descriptor.
1528 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1529 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1530 		 *  SGL element into SGL1.
1531 		 */
1532 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1533 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1534 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1535 	} else {
1536 		/* The SPDK NVMe driver supports only one SGL segment for now; this is enough
1537 		 *  because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1538 		 */
1539 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1540 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1541 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1542 	}
1543 
1544 	SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1545 	return 0;
1546 
1547 exit:
1548 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1549 	return -EFAULT;
1550 }
1551 
1552 /**
1553  * Build PRP list describing scattered payload buffer.
1554  */
1555 static int
1556 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1557 				       struct nvme_tracker *tr, bool dword_aligned)
1558 {
1559 	int rc;
1560 	void *virt_addr;
1561 	uint32_t remaining_transfer_len, length;
1562 	uint32_t prp_index = 0;
1563 	uint32_t page_size = qpair->ctrlr->page_size;
1564 
1565 	/*
1566 	 * Build scattered payloads.
1567 	 */
1568 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1569 	assert(req->payload.reset_sgl_fn != NULL);
1570 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1571 
1572 	remaining_transfer_len = req->payload_size;
1573 	while (remaining_transfer_len > 0) {
1574 		assert(req->payload.next_sge_fn != NULL);
1575 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
1576 		if (rc) {
1577 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1578 			return -EFAULT;
1579 		}
1580 
1581 		length = spdk_min(remaining_transfer_len, length);
1582 
1583 		/*
1584 		 * Any incompatible SGEs should have been handled in the splitting routine
1585 		 *  above, but assert here as an additional check.
1586 		 *
1587 		 * All SGEs except the last must end on a page boundary.
1588 		 */
1589 		assert((length == remaining_transfer_len) ||
1590 		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));
1591 
1592 		rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index, virt_addr, length, page_size);
1593 		if (rc) {
1594 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1595 			return rc;
1596 		}
1597 
1598 		remaining_transfer_len -= length;
1599 	}
1600 
1601 	SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1602 	return 0;
1603 }
1604 
1605 typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
1606 			   bool);
1607 
1608 static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
1609 	[NVME_PAYLOAD_TYPE_INVALID] = {
1610 		nvme_pcie_qpair_build_request_invalid,			/* PRP */
1611 		nvme_pcie_qpair_build_request_invalid			/* SGL */
1612 	},
1613 	[NVME_PAYLOAD_TYPE_CONTIG] = {
1614 		nvme_pcie_qpair_build_contig_request,			/* PRP */
1615 		nvme_pcie_qpair_build_contig_hw_sgl_request		/* SGL */
1616 	},
1617 	[NVME_PAYLOAD_TYPE_SGL] = {
1618 		nvme_pcie_qpair_build_prps_sgl_request,			/* PRP */
1619 		nvme_pcie_qpair_build_hw_sgl_request			/* SGL */
1620 	}
1621 };
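/*
 * Illustrative note: nvme_pcie_qpair_submit_request() below indexes this table as
 * g_nvme_pcie_build_req_table[payload_type][sgl_supported].  For example, a contiguous
 * payload on an SGL-capable I/O qpair selects nvme_pcie_qpair_build_contig_hw_sgl_request,
 * while the same payload on the admin qpair (where PRPs are required) selects
 * nvme_pcie_qpair_build_contig_request.
 */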
1622 
1623 static int
1624 nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1625 			       bool sgl_supported, bool mptr_sgl_supported, bool dword_aligned)
1626 {
1627 	void *md_payload;
1628 	struct nvme_request *req = tr->req;
1629 	uint64_t mapping_length;
1630 
1631 	if (req->payload.md) {
1632 		md_payload = (uint8_t *)req->payload.md + req->md_offset;
1633 		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
1634 			SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
1635 			goto exit;
1636 		}
1637 
1638 		mapping_length = req->md_size;
1639 		if (sgl_supported && mptr_sgl_supported && dword_aligned) {
1640 			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
1641 			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
1642 
1643 			tr->meta_sgl.address = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1644 			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1645 				goto exit;
1646 			}
1647 			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1648 			tr->meta_sgl.unkeyed.length = req->md_size;
1649 			tr->meta_sgl.unkeyed.subtype = 0;
1650 			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
1651 		} else {
1652 			req->cmd.mptr = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1653 			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1654 				goto exit;
1655 			}
1656 		}
1657 	}
1658 
1659 	return 0;
1660 
1661 exit:
1662 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1663 	return -EINVAL;
1664 }
1665 
1666 int
1667 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1668 {
1669 	struct nvme_tracker	*tr;
1670 	int			rc = 0;
1671 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
1672 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1673 	enum nvme_payload_type	payload_type;
1674 	bool			sgl_supported;
1675 	bool			mptr_sgl_supported;
1676 	bool			dword_aligned = true;
1677 
1678 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1679 		nvme_ctrlr_lock(ctrlr);
1680 	}
1681 
1682 	tr = TAILQ_FIRST(&pqpair->free_tr);
1683 
1684 	if (tr == NULL) {
1685 		pqpair->stat->queued_requests++;
1686 		/* Inform the upper layer to try again later. */
1687 		rc = -EAGAIN;
1688 		goto exit;
1689 	}
1690 
1691 	pqpair->stat->submitted_requests++;
1692 	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1693 	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1694 	pqpair->qpair.queue_depth++;
1695 	tr->req = req;
1696 	tr->cb_fn = req->cb_fn;
1697 	tr->cb_arg = req->cb_arg;
1698 	req->cmd.cid = tr->cid;
1699 	/* Use PRP by default. This bit will be overridden below if needed. */
1700 	req->cmd.psdt = SPDK_NVME_PSDT_PRP;
1701 
1702 	if (req->payload_size != 0) {
1703 		payload_type = nvme_payload_type(&req->payload);
1704 		/* According to the specification, PRPs shall be used for all
1705 		 *  Admin commands for NVMe over PCIe implementations.
1706 		 */
1707 		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
1708 				!nvme_qpair_is_admin_queue(qpair);
1709 		mptr_sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_MPTR_SGL_SUPPORTED) != 0 &&
1710 				     !nvme_qpair_is_admin_queue(qpair);
1711 
1712 		if (sgl_supported) {
1713 			/* Don't use SGL for DSM command */
1714 			if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_NO_SGL_FOR_DSM) &&
1715 					  (req->cmd.opc == SPDK_NVME_OPC_DATASET_MANAGEMENT))) {
1716 				sgl_supported = false;
1717 			}
1718 		}
1719 
1720 		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
1721 			dword_aligned = false;
1722 		}
1723 
1724 		/* If we fail to build the request or the metadata, do not return the -EFAULT back up
1725 		 * the stack.  This ensures that we always fail these types of requests via a
1726 		 * completion callback, and never in the context of the submission.
1727 		 */
1728 		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
1729 		if (rc < 0) {
1730 			assert(rc == -EFAULT);
1731 			rc = 0;
1732 			goto exit;
1733 		}
1734 
1735 		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, mptr_sgl_supported, dword_aligned);
1736 		if (rc < 0) {
1737 			assert(rc == -EFAULT);
1738 			rc = 0;
1739 			goto exit;
1740 		}
1741 	}
1742 
1743 	nvme_pcie_qpair_submit_tracker(qpair, tr);
1744 
1745 exit:
1746 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1747 		nvme_ctrlr_unlock(ctrlr);
1748 	}
1749 
1750 	return rc;
1751 }
1752 
1753 struct spdk_nvme_transport_poll_group *
1754 nvme_pcie_poll_group_create(void)
1755 {
1756 	struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));
1757 
1758 	if (group == NULL) {
1759 		SPDK_ERRLOG("Unable to allocate poll group.\n");
1760 		return NULL;
1761 	}
1762 
1763 	return &group->group;
1764 }
1765 
1766 int
1767 nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
1768 {
1769 	return 0;
1770 }
1771 
1772 int
1773 nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
1774 {
1775 	return 0;
1776 }
1777 
1778 int
1779 nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
1780 			 struct spdk_nvme_qpair *qpair)
1781 {
1782 	return 0;
1783 }
1784 
1785 int
1786 nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
1787 			    struct spdk_nvme_qpair *qpair)
1788 {
1789 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1790 
1791 	pqpair->stat = &g_dummy_stat;
1792 	return 0;
1793 }
1794 
1795 int64_t
1796 nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
1797 		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
1798 {
1799 	struct spdk_nvme_qpair *qpair, *tmp_qpair;
1800 	int32_t local_completions = 0;
1801 	int64_t total_completions = 0;
1802 
1803 	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
1804 		disconnected_qpair_cb(qpair, tgroup->group->ctx);
1805 	}
1806 
1807 	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
1808 		local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
1809 		if (spdk_unlikely(local_completions < 0)) {
1810 			disconnected_qpair_cb(qpair, tgroup->group->ctx);
1811 			total_completions = -ENXIO;
1812 		} else if (spdk_likely(total_completions >= 0)) {
1813 			total_completions += local_completions;
1814 		}
1815 	}
1816 
1817 	return total_completions;
1818 }
1819 
1820 void
1821 nvme_pcie_poll_group_check_disconnected_qpairs(struct spdk_nvme_transport_poll_group *tgroup,
1822 		spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
1823 {
1824 	struct spdk_nvme_qpair *qpair, *tmp_qpair;
1825 
1826 	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
1827 		disconnected_qpair_cb(qpair, tgroup->group->ctx);
1828 	}
1829 }
1830 
1831 int
1832 nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
1833 {
1834 	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
1835 		return -EBUSY;
1836 	}
1837 
1838 	free(tgroup);
1839 
1840 	return 0;
1841 }
1842 
1843 int
1844 nvme_pcie_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup,
1845 			       struct spdk_nvme_transport_poll_group_stat **_stats)
1846 {
1847 	struct nvme_pcie_poll_group *group;
1848 	struct spdk_nvme_transport_poll_group_stat *stats;
1849 
1850 	if (tgroup == NULL || _stats == NULL) {
1851 		SPDK_ERRLOG("Invalid stats or group pointer\n");
1852 		return -EINVAL;
1853 	}
1854 
1855 	stats = calloc(1, sizeof(*stats));
1856 	if (!stats) {
1857 		SPDK_ERRLOG("Can't allocate memory for stats\n");
1858 		return -ENOMEM;
1859 	}
1860 	stats->trtype = SPDK_NVME_TRANSPORT_PCIE;
1861 	group = SPDK_CONTAINEROF(tgroup, struct nvme_pcie_poll_group, group);
1862 	memcpy(&stats->pcie, &group->stats, sizeof(group->stats));
1863 
1864 	*_stats = stats;
1865 
1866 	return 0;
1867 }
1868 
1869 void
1870 nvme_pcie_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup,
1871 				struct spdk_nvme_transport_poll_group_stat *stats)
1872 {
1873 	free(stats);
1874 }
1875 
1876 static void
1877 nvme_pcie_trace(void)
1878 {
1879 	struct spdk_trace_tpoint_opts opts[] = {
1880 		{
1881 			"NVME_PCIE_SUBMIT", TRACE_NVME_PCIE_SUBMIT,
1882 			OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 1,
1883 			{	{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1884 				{ "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1885 				{ "opc", SPDK_TRACE_ARG_TYPE_INT, 4 },
1886 				{ "dw10", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1887 				{ "dw11", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1888 				{ "dw12", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1889 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1890 			}
1891 		},
1892 		{
1893 			"NVME_PCIE_COMPLETE", TRACE_NVME_PCIE_COMPLETE,
1894 			OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 0,
1895 			{	{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1896 				{ "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1897 				{ "cpl", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1898 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1899 			}
1900 		},
1901 	};
1902 
1903 	spdk_trace_register_object(OBJECT_NVME_PCIE_REQ, 'p');
1904 	spdk_trace_register_owner_type(OWNER_TYPE_NVME_PCIE_QP, 'q');
1905 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
1906 }
1907 SPDK_TRACE_REGISTER_FN(nvme_pcie_trace, "nvme_pcie", TRACE_GROUP_NVME_PCIE)
1908