1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2021 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 /*
8  * NVMe over PCIe common library
9  */
10 
11 #include "spdk/stdinc.h"
12 #include "spdk/likely.h"
13 #include "spdk/string.h"
14 #include "nvme_internal.h"
15 #include "nvme_pcie_internal.h"
16 #include "spdk/trace.h"
17 
18 #include "spdk_internal/trace_defs.h"
19 
20 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
21 
22 static struct spdk_nvme_pcie_stat g_dummy_stat = {};
23 
24 static void nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair,
25 		struct nvme_tracker *tr);
26 
27 static inline uint64_t
28 nvme_pcie_vtophys(struct spdk_nvme_ctrlr *ctrlr, const void *buf, uint64_t *size)
29 {
30 	if (spdk_likely(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) {
31 		return spdk_vtophys(buf, size);
32 	} else {
33 		/* vfio-user address translation with IOVA=VA mode */
34 		return (uint64_t)(uintptr_t)buf;
35 	}
36 }
37 
38 int
39 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
40 {
41 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
42 	uint32_t i;
43 
44 	/* all head/tail vals are set to 0 */
45 	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;
46 
47 	/*
48 	 * The first time through the completion queue, the HW will set the
49 	 *  phase bit on completions to 1.  So set it to 1 here as well,
50 	 *  indicating that we're looking for a 1 to know which entries have
51 	 *  completed.  We'll toggle the bit each time the completion queue
52 	 *  rolls over.
53 	 */
54 	pqpair->flags.phase = 1;
55 	for (i = 0; i < pqpair->num_entries; i++) {
56 		pqpair->cpl[i].status.p = 0;
57 	}
58 
59 	return 0;
60 }
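
/*
 * Illustrative sketch (not part of this driver): how the phase bit set up in
 * nvme_pcie_qpair_reset() is consumed.  A completion entry is only valid when
 * its phase bit matches the phase the host expects, and the expectation flips
 * each time the queue wraps:
 *
 *	struct spdk_nvme_cpl *cpl = &pqpair->cpl[pqpair->cq_head];
 *
 *	if (cpl->status.p != pqpair->flags.phase) {
 *		return;	// entry not yet written by the controller
 *	}
 *	if (++pqpair->cq_head == pqpair->num_entries) {
 *		pqpair->cq_head = 0;
 *		pqpair->flags.phase = !pqpair->flags.phase;	// toggle on wrap
 *	}
 */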
61 
62 static void
63 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
64 {
65 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
66 	tr->cid = cid;
67 	tr->req = NULL;
68 }
69 
70 static void *
71 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
72 			  uint64_t *phys_addr)
73 {
74 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
75 	uintptr_t addr;
76 
77 	if (pctrlr->cmb.mem_register_addr != NULL) {
78 		/* BAR is mapped for data */
79 		return NULL;
80 	}
81 
82 	addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
83 	addr = (addr + (alignment - 1)) & ~(alignment - 1);
84 
85 	/* CMB may only consume part of the BAR, calculate accordingly */
86 	if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
87 		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
88 		return NULL;
89 	}
90 	*phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;
91 
92 	pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;
93 
94 	return (void *)addr;
95 }
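
/*
 * Worked example (illustrative only) of the round-up used above: with a
 * current offset such that addr == 0x10000204 and alignment == 0x1000 (4 KiB),
 *
 *	addr = (0x10000204 + 0xFFF) & ~0xFFF = 0x10001000
 *
 * i.e. the allocation is bumped to the next 4 KiB boundary inside the CMB.
 * This round-up only works when alignment is a power of two.
 */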
96 
97 int
98 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
99 			  const struct spdk_nvme_io_qpair_opts *opts)
100 {
101 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
102 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
103 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
104 	struct nvme_tracker	*tr;
105 	uint16_t		i;
106 	uint16_t		num_trackers;
107 	size_t			page_align = sysconf(_SC_PAGESIZE);
108 	size_t			queue_align, queue_len;
109 	uint32_t                flags = SPDK_MALLOC_DMA;
110 	int32_t			numa_id;
111 	uint64_t		sq_paddr = 0;
112 	uint64_t		cq_paddr = 0;
113 
114 	if (opts) {
115 		pqpair->sq_vaddr = opts->sq.vaddr;
116 		pqpair->cq_vaddr = opts->cq.vaddr;
117 		pqpair->flags.disable_pcie_sgl_merge = opts->disable_pcie_sgl_merge;
118 		sq_paddr = opts->sq.paddr;
119 		cq_paddr = opts->cq.paddr;
120 	}
121 
122 	pqpair->retry_count = ctrlr->opts.transport_retry_count;
123 
124 	/*
125 	 * Limit the maximum number of completions to return per call to prevent wraparound,
126 	 * and calculate how many trackers can be submitted at once without overflowing the
127 	 * completion queue.
128 	 */
129 	pqpair->max_completions_cap = pqpair->num_entries / 4;
130 	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
131 	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
132 	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
133 
134 	SPDK_INFOLOG(nvme, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
135 		     pqpair->max_completions_cap, num_trackers);
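	/*
	 * Worked example (illustrative only): for a 256-entry I/O queue,
	 * num_entries / 4 = 64; assuming 64 falls inside the
	 * [NVME_MIN_COMPLETIONS, NVME_MAX_COMPLETIONS] clamp, max_completions_cap = 64
	 * and num_trackers = 256 - 64 = 192 commands may be outstanding at once.
	 */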
136 
137 	assert(num_trackers != 0);
138 
139 	pqpair->sq_in_cmb = false;
140 
141 	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
142 		flags |= SPDK_MALLOC_SHARE;
143 	}
144 
145 	/* cmd and cpl rings must be aligned on page size boundaries. */
146 	if (ctrlr->opts.use_cmb_sqs) {
147 		pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
148 							page_align, &pqpair->cmd_bus_addr);
149 		if (pqpair->cmd != NULL) {
150 			pqpair->sq_in_cmb = true;
151 		}
152 	}
153 
154 	if (pqpair->sq_in_cmb == false) {
155 		if (pqpair->sq_vaddr) {
156 			pqpair->cmd = pqpair->sq_vaddr;
157 		} else {
158 			/* To ensure physical address contiguity we make each ring occupy
159 			 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
160 			 */
161 			queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
162 			queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
163 			pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_NUMA_ID_ANY, flags);
164 			if (pqpair->cmd == NULL) {
165 				SPDK_ERRLOG("alloc qpair_cmd failed\n");
166 				return -ENOMEM;
167 			}
168 		}
169 		if (sq_paddr) {
170 			assert(pqpair->sq_vaddr != NULL);
171 			pqpair->cmd_bus_addr = sq_paddr;
172 		} else {
173 			pqpair->cmd_bus_addr = nvme_pcie_vtophys(ctrlr, pqpair->cmd, NULL);
174 			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
175 				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
176 				return -EFAULT;
177 			}
178 		}
179 	}
180 
181 	if (pqpair->cq_vaddr) {
182 		pqpair->cpl = pqpair->cq_vaddr;
183 	} else {
184 		queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
185 		queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
186 		numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
187 		pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, numa_id, flags);
188 		if (pqpair->cpl == NULL) {
189 			SPDK_ERRLOG("alloc qpair_cpl failed\n");
190 			return -ENOMEM;
191 		}
192 	}
193 	if (cq_paddr) {
194 		assert(pqpair->cq_vaddr != NULL);
195 		pqpair->cpl_bus_addr = cq_paddr;
196 	} else {
197 		pqpair->cpl_bus_addr =  nvme_pcie_vtophys(ctrlr, pqpair->cpl, NULL);
198 		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
199 			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
200 			return -EFAULT;
201 		}
202 	}
203 
204 	pqpair->sq_tdbl = pctrlr->doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
205 	pqpair->cq_hdbl = pctrlr->doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
206 
207 	/*
208 	 * Reserve space for all of the trackers in a single allocation.
209 	 *   struct nvme_tracker must be padded so that its size is already a power of 2.
210 	 *   This ensures the PRP list embedded in the nvme_tracker object will not span a
211 	 *   4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
212 	 */
213 	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
214 				  SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
215 	if (pqpair->tr == NULL) {
216 		SPDK_ERRLOG("nvme_tr failed\n");
217 		return -ENOMEM;
218 	}
219 
220 	TAILQ_INIT(&pqpair->free_tr);
221 	TAILQ_INIT(&pqpair->outstanding_tr);
222 	pqpair->qpair.queue_depth = 0;
223 
224 	for (i = 0; i < num_trackers; i++) {
225 		tr = &pqpair->tr[i];
226 		nvme_qpair_construct_tracker(tr, i, nvme_pcie_vtophys(ctrlr, tr, NULL));
227 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
228 	}
229 
230 	nvme_pcie_qpair_reset(qpair);
231 
232 	return 0;
233 }
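
/*
 * Illustrative sketch (not part of this driver): the doorbell addresses
 * computed in nvme_pcie_qpair_construct() follow the layout defined by the
 * NVMe specification, where each queue pair owns two doorbells separated by
 * the controller's doorbell stride:
 *
 *	sq_tdbl = doorbell_base + (2 * qid + 0) * stride;	// submission queue tail
 *	cq_hdbl = doorbell_base + (2 * qid + 1) * stride;	// completion queue head
 *
 * e.g. with a stride of 1 dword and qid == 3, the SQ tail doorbell is at
 * dword offset 6 and the CQ head doorbell at dword offset 7 from doorbell_base.
 */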
234 
235 int
236 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
237 {
238 	struct nvme_pcie_qpair *pqpair;
239 	int rc;
240 
241 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
242 	if (pqpair == NULL) {
243 		return -ENOMEM;
244 	}
245 
246 	pqpair->num_entries = num_entries;
247 	pqpair->flags.delay_cmd_submit = 0;
248 	pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
249 
250 	ctrlr->adminq = &pqpair->qpair;
251 
252 	rc = nvme_qpair_init(ctrlr->adminq,
253 			     0, /* qpair ID */
254 			     ctrlr,
255 			     SPDK_NVME_QPRIO_URGENT,
256 			     num_entries,
257 			     false);
258 	if (rc != 0) {
259 		return rc;
260 	}
261 
262 	pqpair->stat = spdk_zmalloc(sizeof(*pqpair->stat), 64, NULL, SPDK_ENV_NUMA_ID_ANY,
263 				    SPDK_MALLOC_SHARE);
264 	if (!pqpair->stat) {
265 		SPDK_ERRLOG("Failed to allocate admin qpair statistics\n");
266 		return -ENOMEM;
267 	}
268 
269 	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
270 }
271 
272 /**
273  * Note: the ctrlr_lock must be held when calling this function.
274  */
275 void
276 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
277 		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
278 {
279 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
280 	struct nvme_request		*active_req = req;
281 	struct spdk_nvme_ctrlr_process	*active_proc;
282 
283 	/*
284 	 * The admin request is from another process.  Move it to the per-process
285 	 *  list so that process can handle it later.
286 	 */
287 	assert(nvme_qpair_is_admin_queue(qpair));
288 	assert(active_req->pid != getpid());
289 
290 	active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid);
291 	if (active_proc) {
292 		/* Save the original completion information */
293 		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
294 		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
295 	} else {
296 		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
297 			    active_req->pid);
298 		nvme_cleanup_user_req(active_req);
299 		nvme_free_request(active_req);
300 	}
301 }
302 
303 /**
304  * Note: the ctrlr_lock must be held when calling this function.
305  */
306 void
307 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
308 {
309 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
310 	struct nvme_request		*req, *tmp_req;
311 	pid_t				pid = getpid();
312 	struct spdk_nvme_ctrlr_process	*proc;
313 
314 	/*
315 	 * Check whether there is any pending admin request from
316 	 * other active processes.
317 	 */
318 	assert(nvme_qpair_is_admin_queue(qpair));
319 
320 	proc = nvme_ctrlr_get_current_process(ctrlr);
321 	if (!proc) {
322 		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
323 		assert(proc);
324 		return;
325 	}
326 
327 	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
328 		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
329 
330 		assert(req->pid == pid);
331 
332 		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
333 	}
334 }
335 
336 int
337 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
338 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
339 				 void *cb_arg)
340 {
341 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
342 	struct nvme_request *req;
343 	struct spdk_nvme_cmd *cmd;
344 
345 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
346 	if (req == NULL) {
347 		return -ENOMEM;
348 	}
349 
350 	cmd = &req->cmd;
351 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
352 
353 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
354 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
355 
356 	cmd->cdw11_bits.create_io_cq.pc = 1;
357 	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
358 
359 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
360 }
361 
362 int
363 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
364 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
365 {
366 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
367 	struct nvme_request *req;
368 	struct spdk_nvme_cmd *cmd;
369 
370 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
371 	if (req == NULL) {
372 		return -ENOMEM;
373 	}
374 
375 	cmd = &req->cmd;
376 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
377 
378 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
379 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
380 	cmd->cdw11_bits.create_io_sq.pc = 1;
381 	cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
382 	cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
383 	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
384 
385 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
386 }
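
/*
 * Note (illustrative only): qsize in CDW10 is 0-based per the NVMe spec, which
 * is why both create commands above use pqpair->num_entries - 1.  A minimal
 * sketch for a 128-entry completion queue:
 *
 *	cmd->cdw10_bits.create_io_q.qid = 1;
 *	cmd->cdw10_bits.create_io_q.qsize = 128 - 1;	// 0-based queue size
 *	cmd->cdw11_bits.create_io_cq.pc = 1;		// physically contiguous
 */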
387 
388 int
389 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
390 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
391 {
392 	struct nvme_request *req;
393 	struct spdk_nvme_cmd *cmd;
394 
395 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
396 	if (req == NULL) {
397 		return -ENOMEM;
398 	}
399 
400 	cmd = &req->cmd;
401 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
402 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
403 
404 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
405 }
406 
407 int
408 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
409 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
410 {
411 	struct nvme_request *req;
412 	struct spdk_nvme_cmd *cmd;
413 
414 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
415 	if (req == NULL) {
416 		return -ENOMEM;
417 	}
418 
419 	cmd = &req->cmd;
420 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
421 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
422 
423 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
424 }
425 
426 static void
427 nvme_completion_sq_error_delete_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
428 {
429 	struct spdk_nvme_qpair *qpair = arg;
430 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
431 
432 	if (spdk_nvme_cpl_is_error(cpl)) {
433 		SPDK_ERRLOG("delete_io_cq failed!\n");
434 	}
435 
436 	pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
437 }
438 
439 static void
440 nvme_completion_create_sq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
441 {
442 	struct spdk_nvme_qpair *qpair = arg;
443 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
444 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
445 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
446 	int rc;
447 
448 	if (pqpair->flags.defer_destruction) {
449 		/* This qpair was deleted by the application while the
450 		 * connection was still in progress.  We had to wait
451 		 * to free the qpair resources until this outstanding
452 		 * command was completed.  Now that we have the completion,
453 		 * free the qpair.
454 		 */
455 		nvme_pcie_qpair_destroy(qpair);
456 		return;
457 	}
458 
459 	if (spdk_nvme_cpl_is_error(cpl)) {
460 		SPDK_ERRLOG("nvme_create_io_sq failed, deleting cq!\n");
461 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
462 						      qpair);
463 		if (rc != 0) {
464 			SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
465 			pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
466 		}
467 		return;
468 	}
469 	pqpair->pcie_state = NVME_PCIE_QPAIR_READY;
470 	if (ctrlr->shadow_doorbell) {
471 		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
472 						  pctrlr->doorbell_stride_u32;
473 		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
474 						  pctrlr->doorbell_stride_u32;
475 		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
476 						      pctrlr->doorbell_stride_u32;
477 		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
478 						      pctrlr->doorbell_stride_u32;
479 		pqpair->flags.has_shadow_doorbell = 1;
480 	} else {
481 		pqpair->flags.has_shadow_doorbell = 0;
482 	}
483 	nvme_pcie_qpair_reset(qpair);
484 
485 }
486 
487 static void
488 nvme_completion_create_cq_cb(void *arg, const struct spdk_nvme_cpl *cpl)
489 {
490 	struct spdk_nvme_qpair *qpair = arg;
491 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
492 	int rc;
493 
494 	if (pqpair->flags.defer_destruction) {
495 		/* This qpair was deleted by the application while the
496 		 * connection was still in progress.  We had to wait
497 		 * to free the qpair resources until this outstanding
498 		 * command was completed.  Now that we have the completion,
499 		 * free the qpair.
500 		 */
501 		nvme_pcie_qpair_destroy(qpair);
502 		return;
503 	}
504 
505 	if (spdk_nvme_cpl_is_error(cpl)) {
506 		pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
507 		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
508 		return;
509 	}
510 
511 	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_create_sq_cb, qpair);
512 
513 	if (rc != 0) {
514 		SPDK_ERRLOG("Failed to send request to create_io_sq, deleting cq!\n");
515 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_sq_error_delete_cq_cb,
516 						      qpair);
517 		if (rc != 0) {
518 			SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
519 			pqpair->pcie_state = NVME_PCIE_QPAIR_FAILED;
520 		}
521 		return;
522 	}
523 	pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_SQ;
524 }
525 
526 static int
527 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
528 				 uint16_t qid)
529 {
530 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
531 	int	rc;
532 
533 	/* Statistics may already be allocated in the case of controller reset */
534 	if (qpair->poll_group) {
535 		struct nvme_pcie_poll_group *group = SPDK_CONTAINEROF(qpair->poll_group,
536 						     struct nvme_pcie_poll_group, group);
537 
538 		pqpair->stat = &group->stats;
539 		pqpair->shared_stats = true;
540 	} else {
541 		if (pqpair->stat == NULL) {
542 			pqpair->stat = calloc(1, sizeof(*pqpair->stat));
543 			if (!pqpair->stat) {
544 				SPDK_ERRLOG("Failed to allocate qpair statistics\n");
545 				nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
546 				return -ENOMEM;
547 			}
548 		}
549 	}
550 
551 	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_create_cq_cb, qpair);
552 
553 	if (rc != 0) {
554 		SPDK_ERRLOG("Failed to send request to create_io_cq\n");
555 		nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
556 		return rc;
557 	}
558 	pqpair->pcie_state = NVME_PCIE_QPAIR_WAIT_FOR_CQ;
559 	return 0;
560 }
561 
562 int
563 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
564 {
565 	int rc = 0;
566 
567 	if (!nvme_qpair_is_admin_queue(qpair)) {
568 		rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
569 	} else {
570 		nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
571 	}
572 
573 	return rc;
574 }
575 
576 void
577 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
578 {
579 	if (!nvme_qpair_is_admin_queue(qpair) || !ctrlr->is_disconnecting) {
580 		nvme_transport_ctrlr_disconnect_qpair_done(qpair);
581 	} else {
582 		/* If this function is called for the admin qpair via spdk_nvme_ctrlr_reset()
583 		 * or spdk_nvme_ctrlr_disconnect(), initiate a Controller Level Reset.
584 		 * Then we can abort trackers safely because the Controller Level Reset deletes
585 		 * all I/O SQ/CQs.
586 		 */
587 		nvme_ctrlr_disable(ctrlr);
588 	}
589 }
590 
591 /* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must
592  * not use wide instructions because QEMU will not emulate such instructions to MMIO space.
593  * So this function ensures we only copy 8 bytes at a time.
594  */
595 static inline void
596 nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
597 {
598 	uint64_t *dst64 = (uint64_t *)dst;
599 	const uint64_t *src64 = (const uint64_t *)src;
600 	uint32_t i;
601 
602 	for (i = 0; i < sizeof(*dst) / 8; i++) {
603 		dst64[i] = src64[i];
604 	}
605 }
606 
607 static inline void
608 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
609 {
610 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
611 #if defined(__SSE2__)
612 	__m128i *d128 = (__m128i *)dst;
613 	const __m128i *s128 = (const __m128i *)src;
614 
615 	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
616 	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
617 	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
618 	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
619 #else
620 	*dst = *src;
621 #endif
622 }
623 
624 void
625 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
626 {
627 	struct nvme_request	*req;
628 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
629 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
630 
631 	req = tr->req;
632 	assert(req != NULL);
633 
634 	spdk_trace_record(TRACE_NVME_PCIE_SUBMIT, qpair->id, 0, (uintptr_t)req, req->cb_arg,
635 			  (uint32_t)req->cmd.cid, (uint32_t)req->cmd.opc,
636 			  req->cmd.cdw10, req->cmd.cdw11, req->cmd.cdw12,
637 			  pqpair->qpair.queue_depth);
638 
639 	if (req->cmd.fuse) {
640 		/*
641 		 * Keep track of the fuse operation sequence so that we ring the doorbell only
642 		 * after the second fuse is submitted.
643 		 */
644 		qpair->last_fuse = req->cmd.fuse;
645 	}
646 
647 	/* Don't use wide instructions to copy the NVMe command; this is a limitation of the
648 	 * QEMU virtual NVMe controller, where the maximum access width is 8 bytes at a time.
649 	 */
650 	if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
651 		nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
652 	} else {
653 		/* Copy the command from the tracker to the submission queue. */
654 		nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
655 	}
656 
657 	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
658 		pqpair->sq_tail = 0;
659 	}
660 
661 	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
662 		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
663 	}
664 
665 	if (!pqpair->flags.delay_cmd_submit) {
666 		nvme_pcie_qpair_ring_sq_doorbell(qpair);
667 	}
668 }
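
/*
 * Note (illustrative only): when delay_cmd_submit is enabled, the SQ tail
 * doorbell is not rung here; it is batched and rung from the completion path
 * (see nvme_pcie_qpair_process_completions()) once the tail has moved,
 * conceptually:
 *
 *	if (pqpair->last_sq_tail != pqpair->sq_tail) {
 *		nvme_pcie_qpair_ring_sq_doorbell(qpair);	// one MMIO write for many commands
 *		pqpair->last_sq_tail = pqpair->sq_tail;
 *	}
 */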
669 
670 void
671 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
672 				 struct spdk_nvme_cpl *cpl, bool print_on_error)
673 {
674 	struct nvme_pcie_qpair		*pqpair = nvme_pcie_qpair(qpair);
675 	struct nvme_request		*req;
676 	bool				retry, error;
677 	bool				print_error;
678 
679 	req = tr->req;
680 
681 	spdk_trace_record(TRACE_NVME_PCIE_COMPLETE, qpair->id, 0, (uintptr_t)req, req->cb_arg,
682 			  (uint32_t)req->cmd.cid, (uint32_t)cpl->status_raw, pqpair->qpair.queue_depth);
683 
684 	assert(req != NULL);
685 
686 	error = spdk_nvme_cpl_is_error(cpl);
687 	retry = error && nvme_completion_is_retry(cpl) &&
688 		req->retries < pqpair->retry_count;
689 	print_error = error && print_on_error && !qpair->ctrlr->opts.disable_error_logging;
690 
691 	if (print_error) {
692 		spdk_nvme_qpair_print_command(qpair, &req->cmd);
693 	}
694 
695 	if (print_error || SPDK_DEBUGLOG_FLAG_ENABLED("nvme")) {
696 		spdk_nvme_qpair_print_completion(qpair, cpl);
697 	}
698 
699 	assert(cpl->cid == req->cmd.cid);
700 
701 	if (retry) {
702 		req->retries++;
703 		nvme_pcie_qpair_submit_tracker(qpair, tr);
704 	} else {
705 		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
706 		pqpair->qpair.queue_depth--;
707 
708 		/* Only check admin requests from different processes. */
709 		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
710 			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
711 		} else {
712 			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
713 		}
714 
715 		tr->req = NULL;
716 
717 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
718 	}
719 }
720 
721 void
722 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
723 					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
724 					bool print_on_error)
725 {
726 	struct spdk_nvme_cpl	cpl;
727 
728 	memset(&cpl, 0, sizeof(cpl));
729 	cpl.sqid = qpair->id;
730 	cpl.cid = tr->cid;
731 	cpl.status.sct = sct;
732 	cpl.status.sc = sc;
733 	cpl.status.dnr = dnr;
734 	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
735 }
736 
737 void
738 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
739 {
740 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
741 	struct nvme_tracker *tr, *temp, *last;
742 
743 	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);
744 
745 	/* Abort previously submitted (outstanding) trs */
746 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
747 		if (!qpair->ctrlr->opts.disable_error_logging) {
748 			SPDK_ERRLOG("aborting outstanding command\n");
749 		}
750 		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
751 							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
752 
753 		if (tr == last) {
754 			break;
755 		}
756 	}
757 }
758 
759 void
760 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
761 {
762 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
763 	struct nvme_tracker	*tr;
764 
765 	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
766 	while (tr != NULL) {
767 		assert(tr->req != NULL);
768 		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
769 			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
770 								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
771 								false);
772 			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
773 		} else {
774 			tr = TAILQ_NEXT(tr, tq_list);
775 		}
776 	}
777 }
778 
779 void
780 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
781 {
782 	nvme_pcie_admin_qpair_abort_aers(qpair);
783 }
784 
785 void
786 nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
787 {
788 	nvme_pcie_qpair_abort_trackers(qpair, dnr);
789 }
790 
791 static void
792 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
793 {
794 	uint64_t t02;
795 	struct nvme_tracker *tr, *tmp;
796 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
797 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
798 	struct spdk_nvme_ctrlr_process *active_proc;
799 
800 	/* Don't check timeouts during controller initialization. */
801 	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
802 		return;
803 	}
804 
805 	if (nvme_qpair_is_admin_queue(qpair)) {
806 		active_proc = nvme_ctrlr_get_current_process(ctrlr);
807 	} else {
808 		active_proc = qpair->active_proc;
809 	}
810 
811 	/* Only check timeouts if the current process has a timeout callback. */
812 	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
813 		return;
814 	}
815 
816 	t02 = spdk_get_ticks();
817 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
818 		assert(tr->req != NULL);
819 
820 		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
821 			/*
822 			 * The requests are in order, so as soon as one has not timed out,
823 			 * stop iterating.
824 			 */
825 			break;
826 		}
827 	}
828 }
829 
830 int32_t
831 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
832 {
833 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
834 	struct nvme_tracker	*tr;
835 	struct spdk_nvme_cpl	*cpl, *next_cpl;
836 	uint32_t		 num_completions = 0;
837 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
838 	uint16_t		 next_cq_head;
839 	uint8_t			 next_phase;
840 	bool			 next_is_valid = false;
841 	int			 rc;
842 
843 	if (spdk_unlikely(pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED)) {
844 		return -ENXIO;
845 	}
846 
847 	if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) {
848 		if (pqpair->pcie_state == NVME_PCIE_QPAIR_READY) {
849 			/* It is possible that another thread set the pcie_state to
850 			 * QPAIR_READY, if it polled the adminq and processed the SQ
851 			 * completion for this qpair.  So check for that condition
852 			 * here and then update the qpair's state to CONNECTED, since
853 			 * we can only set the qpair state from the qpair's thread.
854 			 * (Note: this fixed issue #2157.)
855 			 */
856 			nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
857 		} else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
858 			nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
859 			return -ENXIO;
860 		} else {
861 			rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
862 			if (rc < 0) {
863 				return rc;
864 			} else if (pqpair->pcie_state == NVME_PCIE_QPAIR_FAILED) {
865 				nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
866 				return -ENXIO;
867 			}
868 		}
869 		return 0;
870 	}
871 
872 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
873 		nvme_ctrlr_lock(ctrlr);
874 	}
875 
876 	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
877 		/*
878 		 * max_completions == 0 means unlimited, but complete at most one
879 		 * batch of max_completions_cap I/Os at a time so that the completion
880 		 * queue doorbells don't wrap around.
881 		 */
882 		max_completions = pqpair->max_completions_cap;
883 	}
884 
885 	pqpair->stat->polls++;
886 
887 	while (1) {
888 		cpl = &pqpair->cpl[pqpair->cq_head];
889 
890 		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
891 			break;
892 		}
893 
894 		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
895 			next_cq_head = pqpair->cq_head + 1;
896 			next_phase = pqpair->flags.phase;
897 		} else {
898 			next_cq_head = 0;
899 			next_phase = !pqpair->flags.phase;
900 		}
901 		next_cpl = &pqpair->cpl[next_cq_head];
902 		next_is_valid = (next_cpl->status.p == next_phase);
903 		if (next_is_valid) {
904 			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
905 		}
906 
907 #if defined(__PPC64__) || defined(__riscv) || defined(__loongarch__)
908 		/*
909 		 * This memory barrier prevents reordering of:
910 		 * - load after store from/to tr
911 		 * - load after load cpl phase and cpl cid
912 		 */
913 		spdk_mb();
914 #elif defined(__aarch64__)
915 		__asm volatile("dmb oshld" ::: "memory");
916 #endif
917 
918 		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
919 			pqpair->cq_head = 0;
920 			pqpair->flags.phase = !pqpair->flags.phase;
921 		}
922 
923 		tr = &pqpair->tr[cpl->cid];
924 		pqpair->sq_head = cpl->sqhd;
925 
926 		if (tr->req) {
927 			/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
928 			 * as part of putting the req back on the qpair's free list.
929 			 */
930 			__builtin_prefetch(&tr->req->stailq);
931 			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
932 		} else {
933 			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
934 			spdk_nvme_qpair_print_completion(qpair, cpl);
935 			assert(0);
936 		}
937 
938 		if (++num_completions == max_completions) {
939 			break;
940 		}
941 	}
942 
943 	if (num_completions > 0) {
944 		pqpair->stat->completions += num_completions;
945 		nvme_pcie_qpair_ring_cq_doorbell(qpair);
946 	} else {
947 		pqpair->stat->idle_polls++;
948 	}
949 
950 	if (pqpair->flags.delay_cmd_submit) {
951 		if (pqpair->last_sq_tail != pqpair->sq_tail) {
952 			nvme_pcie_qpair_ring_sq_doorbell(qpair);
953 			pqpair->last_sq_tail = pqpair->sq_tail;
954 		}
955 	}
956 
957 	if (spdk_unlikely(ctrlr->timeout_enabled)) {
958 		/*
959 		 * User registered for timeout callback
960 		 */
961 		nvme_pcie_qpair_check_timeout(qpair);
962 	}
963 
964 	/* Before returning, complete any pending admin request or
965 	 * process the admin qpair disconnection.
966 	 */
967 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
968 		nvme_pcie_qpair_complete_pending_admin_request(qpair);
969 
970 		if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING) {
971 			rc = nvme_ctrlr_disable_poll(qpair->ctrlr);
972 			if (rc != -EAGAIN) {
973 				nvme_transport_ctrlr_disconnect_qpair_done(qpair);
974 			}
975 		}
976 
977 		nvme_ctrlr_unlock(ctrlr);
978 	}
979 
980 	if (spdk_unlikely(pqpair->flags.has_pending_vtophys_failures)) {
981 		struct nvme_tracker *tr, *tmp;
982 
983 		TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
984 			if (tr->bad_vtophys) {
985 				tr->bad_vtophys = 0;
986 				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
987 			}
988 		}
989 		pqpair->flags.has_pending_vtophys_failures = 0;
990 	}
991 
992 	return num_completions;
993 }
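
/*
 * Usage sketch (illustrative only, assuming a typical polled-mode application
 * loop): an application repeatedly drives this path through the public API,
 * passing 0 to let the transport cap each batch at max_completions_cap:
 *
 *	while (app_is_running) {
 *		int32_t rc = spdk_nvme_qpair_process_completions(io_qpair, 0);
 *		if (rc < 0) {
 *			// qpair failed or was disconnected; handle/reconnect
 *		}
 *	}
 */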
994 
995 int
996 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
997 {
998 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
999 
1000 	if (nvme_qpair_is_admin_queue(qpair)) {
1001 		nvme_pcie_admin_qpair_destroy(qpair);
1002 	}
1003 	/*
1004 	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
1005 	 * buffers when creating the I/O queue.
1006 	 * If the user specified them, we cannot free that memory.
1007 	 * Nor do we free it if it's in the CMB.
1008 	 */
1009 	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
1010 		spdk_free(pqpair->cmd);
1011 	}
1012 	if (!pqpair->cq_vaddr && pqpair->cpl) {
1013 		spdk_free(pqpair->cpl);
1014 	}
1015 	if (pqpair->tr) {
1016 		spdk_free(pqpair->tr);
1017 	}
1018 
1019 	nvme_qpair_deinit(qpair);
1020 
1021 	if (!pqpair->shared_stats && (!qpair->active_proc ||
1022 				      qpair->active_proc == nvme_ctrlr_get_current_process(qpair->ctrlr))) {
1023 		if (qpair->id) {
1024 			free(pqpair->stat);
1025 		} else {
1026 			/* The admin qpair's statistics are allocated from huge pages because
1027 			 * the admin qpair is shared across processes. */
1028 			spdk_free(pqpair->stat);
1029 		}
1030 
1031 	}
1032 
1033 	spdk_free(pqpair);
1034 
1035 	return 0;
1036 }
1037 
1038 struct spdk_nvme_qpair *
1039 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1040 				const struct spdk_nvme_io_qpair_opts *opts)
1041 {
1042 	struct nvme_pcie_qpair *pqpair;
1043 	struct spdk_nvme_qpair *qpair;
1044 	int rc;
1045 
1046 	assert(ctrlr != NULL);
1047 
1048 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1049 			      SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_SHARE);
1050 	if (pqpair == NULL) {
1051 		return NULL;
1052 	}
1053 
1054 	pqpair->num_entries = opts->io_queue_size;
1055 	pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;
1056 
1057 	qpair = &pqpair->qpair;
1058 
1059 	rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests, opts->async_mode);
1060 	if (rc != 0) {
1061 		nvme_pcie_qpair_destroy(qpair);
1062 		return NULL;
1063 	}
1064 
1065 	rc = nvme_pcie_qpair_construct(qpair, opts);
1066 
1067 	if (rc != 0) {
1068 		nvme_pcie_qpair_destroy(qpair);
1069 		return NULL;
1070 	}
1071 
1072 	return qpair;
1073 }
1074 
1075 int
1076 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1077 {
1078 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1079 	struct nvme_completion_poll_status *status;
1080 	int rc;
1081 
1082 	assert(ctrlr != NULL);
1083 
1084 	if (ctrlr->is_removed) {
1085 		goto free;
1086 	}
1087 
1088 	if (ctrlr->prepare_for_reset) {
1089 		if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING) {
1090 			pqpair->flags.defer_destruction = true;
1091 		}
1092 		goto clear_shadow_doorbells;
1093 	}
1094 
1095 	/* If attempting to delete a qpair that's still being connected, we have to wait until it's
1096 	 * finished, so that we don't free it while it's waiting for the create cq/sq callbacks.
1097 	 */
1098 	while (pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_CQ ||
1099 	       pqpair->pcie_state == NVME_PCIE_QPAIR_WAIT_FOR_SQ) {
1100 		rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1101 		if (rc < 0) {
1102 			break;
1103 		}
1104 	}
1105 
1106 	status = calloc(1, sizeof(*status));
1107 	if (!status) {
1108 		SPDK_ERRLOG("Failed to allocate status tracker\n");
1109 		goto free;
1110 	}
1111 
1112 	/* Delete the I/O submission queue */
1113 	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
1114 	if (rc != 0) {
1115 		SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1116 		free(status);
1117 		goto free;
1118 	}
1119 	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1120 		if (!status->timed_out) {
1121 			free(status);
1122 		}
1123 		goto free;
1124 	}
1125 
1126 	/* Now that the submission queue is deleted, the device is supposed to have
1127 	 * completed any outstanding I/O. Try to complete them. If they don't complete,
1128 	 * they'll be marked as aborted and completed below. */
1129 	if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) {
1130 		nvme_pcie_qpair_process_completions(qpair, 0);
1131 	}
1132 
1133 	memset(status, 0, sizeof(*status));
1134 	/* Delete the completion queue */
1135 	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
1136 	if (rc != 0) {
1137 		SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1138 		free(status);
1139 		goto free;
1140 	}
1141 	if (nvme_wait_for_completion(ctrlr->adminq, status)) {
1142 		if (!status->timed_out) {
1143 			free(status);
1144 		}
1145 		goto free;
1146 	}
1147 	free(status);
1148 
1149 clear_shadow_doorbells:
1150 	if (pqpair->flags.has_shadow_doorbell && ctrlr->shadow_doorbell) {
1151 		*pqpair->shadow_doorbell.sq_tdbl = 0;
1152 		*pqpair->shadow_doorbell.cq_hdbl = 0;
1153 		*pqpair->shadow_doorbell.sq_eventidx = 0;
1154 		*pqpair->shadow_doorbell.cq_eventidx = 0;
1155 	}
1156 free:
1157 	if (qpair->no_deletion_notification_needed == 0) {
1158 		/* Abort the rest of the I/O */
1159 		nvme_pcie_qpair_abort_trackers(qpair, 1);
1160 	}
1161 
1162 	if (!pqpair->flags.defer_destruction) {
1163 		nvme_pcie_qpair_destroy(qpair);
1164 	}
1165 	return 0;
1166 }
1167 
1168 static void
1169 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1170 {
1171 	if (!qpair->in_completion_context) {
1172 		struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1173 
1174 		tr->bad_vtophys = 1;
1175 		pqpair->flags.has_pending_vtophys_failures = 1;
1176 		return;
1177 	}
1178 
1179 	/*
1180 	 * Bad vtophys translation, so abort this request and return
1181 	 *  immediately.
1182 	 */
1183 	SPDK_ERRLOG("vtophys or other payload buffer related error\n");
1184 	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1185 						SPDK_NVME_SC_INVALID_FIELD,
1186 						1 /* do not retry */, true);
1187 }
1188 
1189 /*
1190  * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1191  *
1192  * *prp_index will be updated to account for the number of PRP entries used.
1193  */
1194 static inline int
1195 nvme_pcie_prp_list_append(struct spdk_nvme_ctrlr *ctrlr, struct nvme_tracker *tr,
1196 			  uint32_t *prp_index, void *virt_addr, size_t len,
1197 			  uint32_t page_size)
1198 {
1199 	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1200 	uintptr_t page_mask = page_size - 1;
1201 	uint64_t phys_addr;
1202 	uint32_t i;
1203 
1204 	SPDK_DEBUGLOG(nvme, "prp_index:%u virt_addr:%p len:%u\n",
1205 		      *prp_index, virt_addr, (uint32_t)len);
1206 
1207 	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1208 		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1209 		return -EFAULT;
1210 	}
1211 
1212 	i = *prp_index;
1213 	while (len) {
1214 		uint32_t seg_len;
1215 
1216 		/*
1217 		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1218 		 * so prp_index == count is valid.
1219 		 */
1220 		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1221 			SPDK_ERRLOG("out of PRP entries\n");
1222 			return -EFAULT;
1223 		}
1224 
1225 		phys_addr = nvme_pcie_vtophys(ctrlr, virt_addr, NULL);
1226 		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1227 			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1228 			return -EFAULT;
1229 		}
1230 
1231 		if (i == 0) {
1232 			SPDK_DEBUGLOG(nvme, "prp1 = %p\n", (void *)phys_addr);
1233 			cmd->dptr.prp.prp1 = phys_addr;
1234 			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1235 		} else {
1236 			if ((phys_addr & page_mask) != 0) {
1237 				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1238 				return -EFAULT;
1239 			}
1240 
1241 			SPDK_DEBUGLOG(nvme, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1242 			tr->u.prp[i - 1] = phys_addr;
1243 			seg_len = page_size;
1244 		}
1245 
1246 		seg_len = spdk_min(seg_len, len);
1247 		virt_addr = (uint8_t *)virt_addr + seg_len;
1248 		len -= seg_len;
1249 		i++;
1250 	}
1251 
1252 	cmd->psdt = SPDK_NVME_PSDT_PRP;
1253 	if (i <= 1) {
1254 		cmd->dptr.prp.prp2 = 0;
1255 	} else if (i == 2) {
1256 		cmd->dptr.prp.prp2 = tr->u.prp[0];
1257 		SPDK_DEBUGLOG(nvme, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1258 	} else {
1259 		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1260 		SPDK_DEBUGLOG(nvme, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1261 	}
1262 
1263 	*prp_index = i;
1264 	return 0;
1265 }
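
/*
 * Worked example (illustrative only): a dword-aligned 10000-byte buffer with a
 * 4 KiB page size and a starting page offset of 512 bytes spans
 * 512 + 10000 = 10512 bytes of pages, i.e. 3 pages.  The loop above produces:
 *
 *	prp1         = phys(buf)			(first segment, 3584 bytes)
 *	tr->u.prp[0] = phys(buf + 3584)			(full 4096-byte page)
 *	tr->u.prp[1] = phys(buf + 3584 + 4096)		(remaining 2320 bytes)
 *
 * and since i == 3 > 2, prp2 points at the PRP list (tr->prp_sgl_bus_addr).
 */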
1266 
1267 static int
1268 nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
1269 				      struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
1270 {
1271 	assert(0);
1272 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1273 	return -EINVAL;
1274 }
1275 
1276 /**
1277  * Build PRP list describing physically contiguous payload buffer.
1278  */
1279 static int
1280 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1281 				     struct nvme_tracker *tr, bool dword_aligned)
1282 {
1283 	uint32_t prp_index = 0;
1284 	int rc;
1285 
1286 	rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index,
1287 				       (uint8_t *)req->payload.contig_or_cb_arg + req->payload_offset,
1288 				       req->payload_size, qpair->ctrlr->page_size);
1289 	if (rc) {
1290 		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1291 	} else {
1292 		SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1293 	}
1294 
1295 	return rc;
1296 }
1297 
1298 /**
1299  * Build an SGL describing a physically contiguous payload buffer.
1300  *
1301  * This is more efficient than using PRP because large buffers can be
1302  * described this way.
1303  */
1304 static int
1305 nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1306 		struct nvme_tracker *tr, bool dword_aligned)
1307 {
1308 	uint8_t *virt_addr;
1309 	uint64_t phys_addr, mapping_length;
1310 	uint32_t length;
1311 	struct spdk_nvme_sgl_descriptor *sgl;
1312 	uint32_t nseg = 0;
1313 
1314 	assert(req->payload_size != 0);
1315 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
1316 
1317 	sgl = tr->u.sgl;
1318 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1319 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1320 
1321 	length = req->payload_size;
1322 	/* ubsan complains about applying zero offset to null pointer if contig_or_cb_arg is NULL,
1323 	 * so just double cast it to make it go away */
1324 	virt_addr = (uint8_t *)((uintptr_t)req->payload.contig_or_cb_arg + req->payload_offset);
1325 
1326 	while (length > 0) {
1327 		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1328 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1329 			return -EFAULT;
1330 		}
1331 
1332 		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1333 			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1334 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1335 			return -EFAULT;
1336 		}
1337 
1338 		mapping_length = length;
1339 		phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1340 		if (phys_addr == SPDK_VTOPHYS_ERROR) {
1341 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1342 			return -EFAULT;
1343 		}
1344 
1345 		mapping_length = spdk_min(length, mapping_length);
1346 
1347 		length -= mapping_length;
1348 		virt_addr += mapping_length;
1349 
1350 		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1351 		sgl->unkeyed.length = mapping_length;
1352 		sgl->address = phys_addr;
1353 		sgl->unkeyed.subtype = 0;
1354 
1355 		sgl++;
1356 		nseg++;
1357 	}
1358 
1359 	if (nseg == 1) {
1360 		/*
1361 		 * The whole transfer can be described by a single SGL descriptor.
1362 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1363 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1364 		 *  SGL element into SGL1.
1365 		 */
1366 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1367 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1368 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1369 	} else {
1370 		/* The SPDK NVMe driver supports only 1 SGL segment for now.  This is enough because
1371 		 *  NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1372 		 */
1373 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1374 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1375 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1376 	}
1377 
1378 	SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1379 	return 0;
1380 }
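
/*
 * Illustrative sketch (not part of this driver) of the two SGL1 layouts chosen
 * above.  For a buffer that maps to a single physical segment:
 *
 *	dptr.sgl1 = { type = DATA_BLOCK, address = phys(buf), length = len }
 *
 * For N > 1 segments, SGL1 instead points at the descriptor list stored in the
 * tracker:
 *
 *	dptr.sgl1 = { type = LAST_SEGMENT, address = tr->prp_sgl_bus_addr,
 *		      length = N * sizeof(struct spdk_nvme_sgl_descriptor) }
 */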
1381 
1382 /**
1383  * Build SGL list describing scattered payload buffer.
1384  */
1385 static int
1386 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1387 				     struct nvme_tracker *tr, bool dword_aligned)
1388 {
1389 	int rc;
1390 	void *virt_addr;
1391 	uint64_t phys_addr, mapping_length;
1392 	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1393 	struct spdk_nvme_sgl_descriptor *sgl;
1394 	uint32_t nseg = 0;
1395 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1396 
1397 	/*
1398 	 * Build scattered payloads.
1399 	 */
1400 	assert(req->payload_size != 0);
1401 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1402 	assert(req->payload.reset_sgl_fn != NULL);
1403 	assert(req->payload.next_sge_fn != NULL);
1404 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1405 
1406 	sgl = tr->u.sgl;
1407 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1408 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1409 
1410 	remaining_transfer_len = req->payload_size;
1411 
1412 	while (remaining_transfer_len > 0) {
1413 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
1414 					      &virt_addr, &remaining_user_sge_len);
1415 		if (rc) {
1416 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1417 			return -EFAULT;
1418 		}
1419 
1420 		/* Bit Bucket SGL descriptor */
1421 		if ((uint64_t)virt_addr == UINT64_MAX) {
1422 			/* TODO: enable WRITE and COMPARE when necessary */
1423 			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
1424 				SPDK_ERRLOG("Bit bucket SGL descriptors are only supported for READ commands\n");
1425 				goto exit;
1426 			}
1427 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1428 				SPDK_ERRLOG("Too many SGL entries\n");
1429 				goto exit;
1430 			}
1431 
1432 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
1433 			/* If the SGL describes a destination data buffer, data of that length
1434 			 * shall be discarded by the controller, and the length is included
1435 			 * in the Number of Logical Blocks (NLB) parameter.  Otherwise, the length
1436 			 * is not included in the NLB parameter.
1437 			 */
1438 			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1439 			remaining_transfer_len -= remaining_user_sge_len;
1440 
1441 			sgl->unkeyed.length = remaining_user_sge_len;
1442 			sgl->address = 0;
1443 			sgl->unkeyed.subtype = 0;
1444 
1445 			sgl++;
1446 			nseg++;
1447 
1448 			continue;
1449 		}
1450 
1451 		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1452 		remaining_transfer_len -= remaining_user_sge_len;
1453 		while (remaining_user_sge_len > 0) {
1454 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1455 				SPDK_ERRLOG("Too many SGL entries\n");
1456 				goto exit;
1457 			}
1458 
1459 			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1460 				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1461 				goto exit;
1462 			}
1463 
1464 			mapping_length = remaining_user_sge_len;
1465 			phys_addr = nvme_pcie_vtophys(qpair->ctrlr, virt_addr, &mapping_length);
1466 			if (phys_addr == SPDK_VTOPHYS_ERROR) {
1467 				goto exit;
1468 			}
1469 
1470 			length = spdk_min(remaining_user_sge_len, mapping_length);
1471 			remaining_user_sge_len -= length;
1472 			virt_addr = (uint8_t *)virt_addr + length;
1473 
1474 			if (!pqpair->flags.disable_pcie_sgl_merge && nseg > 0 &&
1475 			    phys_addr == (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
1476 				/* extend previous entry */
1477 				(*(sgl - 1)).unkeyed.length += length;
1478 				continue;
1479 			}
1480 
1481 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1482 			sgl->unkeyed.length = length;
1483 			sgl->address = phys_addr;
1484 			sgl->unkeyed.subtype = 0;
1485 
1486 			sgl++;
1487 			nseg++;
1488 		}
1489 	}
1490 
1491 	if (nseg == 1) {
1492 		/*
1493 		 * The whole transfer can be described by a single SGL descriptor.
1494 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1495 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1496 		 *  SGL element into SGL1.
1497 		 */
1498 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1499 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1500 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1501 	} else {
1502 		/* The SPDK NVMe driver supports only 1 SGL segment for now.  This is enough because
1503 		 *  NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
1504 		 */
1505 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1506 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1507 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1508 	}
1509 
1510 	SPDK_DEBUGLOG(nvme, "Number of SGL descriptors: %" PRIu32 "\n", nseg);
1511 	return 0;
1512 
1513 exit:
1514 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1515 	return -EFAULT;
1516 }
1517 
1518 /**
1519  * Build PRP list describing scattered payload buffer.
1520  */
1521 static int
1522 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1523 				       struct nvme_tracker *tr, bool dword_aligned)
1524 {
1525 	int rc;
1526 	void *virt_addr;
1527 	uint32_t remaining_transfer_len, length;
1528 	uint32_t prp_index = 0;
1529 	uint32_t page_size = qpair->ctrlr->page_size;
1530 
1531 	/*
1532 	 * Build scattered payloads.
1533 	 */
1534 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1535 	assert(req->payload.reset_sgl_fn != NULL);
1536 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1537 
1538 	remaining_transfer_len = req->payload_size;
1539 	while (remaining_transfer_len > 0) {
1540 		assert(req->payload.next_sge_fn != NULL);
1541 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
1542 		if (rc) {
1543 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1544 			return -EFAULT;
1545 		}
1546 
1547 		length = spdk_min(remaining_transfer_len, length);
1548 
1549 		/*
1550 		 * Any incompatible sges should have been handled up in the splitting routine,
1551 		 *  but assert here as an additional check.
1552 		 *
1553 		 * All SGEs except last must end on a page boundary.
1554 		 */
1555 		assert((length == remaining_transfer_len) ||
1556 		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));
1557 
1558 		rc = nvme_pcie_prp_list_append(qpair->ctrlr, tr, &prp_index, virt_addr, length, page_size);
1559 		if (rc) {
1560 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1561 			return rc;
1562 		}
1563 
1564 		remaining_transfer_len -= length;
1565 	}
1566 
1567 	SPDK_DEBUGLOG(nvme, "Number of PRP entries: %" PRIu32 "\n", prp_index);
1568 	return 0;
1569 }
1570 
1571 typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
1572 			   bool);
1573 
1574 static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
1575 	[NVME_PAYLOAD_TYPE_INVALID] = {
1576 		nvme_pcie_qpair_build_request_invalid,			/* PRP */
1577 		nvme_pcie_qpair_build_request_invalid			/* SGL */
1578 	},
1579 	[NVME_PAYLOAD_TYPE_CONTIG] = {
1580 		nvme_pcie_qpair_build_contig_request,			/* PRP */
1581 		nvme_pcie_qpair_build_contig_hw_sgl_request		/* SGL */
1582 	},
1583 	[NVME_PAYLOAD_TYPE_SGL] = {
1584 		nvme_pcie_qpair_build_prps_sgl_request,			/* PRP */
1585 		nvme_pcie_qpair_build_hw_sgl_request			/* SGL */
1586 	}
1587 };
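
/*
 * Illustrative sketch (not part of this driver): the table above is indexed by
 * payload type and then by whether hardware SGLs are used, as done in
 * nvme_pcie_qpair_submit_request():
 *
 *	rc = g_nvme_pcie_build_req_table[nvme_payload_type(&req->payload)][sgl_supported](qpair, req,
 *										tr, dword_aligned);
 *
 * so, for example, a contiguous payload on an SGL-capable controller resolves
 * to nvme_pcie_qpair_build_contig_hw_sgl_request().
 */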
1588 
1589 static int
1590 nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1591 			       bool sgl_supported, bool mptr_sgl_supported, bool dword_aligned)
1592 {
1593 	void *md_payload;
1594 	struct nvme_request *req = tr->req;
1595 	uint64_t mapping_length;
1596 
1597 	if (req->payload.md) {
1598 		md_payload = (uint8_t *)req->payload.md + req->md_offset;
1599 		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
1600 			SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
1601 			goto exit;
1602 		}
1603 
1604 		mapping_length = req->md_size;
1605 		if (sgl_supported && mptr_sgl_supported && dword_aligned) {
1606 			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
1607 			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
1608 
1609 			tr->meta_sgl.address = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1610 			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1611 				goto exit;
1612 			}
1613 			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1614 			tr->meta_sgl.unkeyed.length = req->md_size;
1615 			tr->meta_sgl.unkeyed.subtype = 0;
1616 			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
1617 		} else {
1618 			req->cmd.mptr = nvme_pcie_vtophys(qpair->ctrlr, md_payload, &mapping_length);
1619 			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR || mapping_length != req->md_size) {
1620 				goto exit;
1621 			}
1622 		}
1623 	}
1624 
1625 	return 0;
1626 
1627 exit:
1628 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1629 	return -EINVAL;
1630 }
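
/*
 * Illustrative sketch (not part of this driver): when metadata is carried via
 * an SGL, the single metadata descriptor lives in the tracker immediately
 * before the PRP/SGL scratch area, so MPTR can reference it by address:
 *
 *	req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
 *
 * Otherwise MPTR holds the physical address of the metadata buffer itself,
 * which must then map to a single physical segment.
 */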
1631 
1632 int
1633 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1634 {
1635 	struct nvme_tracker	*tr;
1636 	int			rc = 0;
1637 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
1638 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1639 	enum nvme_payload_type	payload_type;
1640 	bool			sgl_supported;
1641 	bool			mptr_sgl_supported;
1642 	bool			dword_aligned = true;
1643 
1644 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1645 		nvme_ctrlr_lock(ctrlr);
1646 	}
1647 
1648 	tr = TAILQ_FIRST(&pqpair->free_tr);
1649 
1650 	if (tr == NULL) {
1651 		pqpair->stat->queued_requests++;
1652 		/* Inform the upper layer to try again later. */
1653 		rc = -EAGAIN;
1654 		goto exit;
1655 	}
1656 
1657 	pqpair->stat->submitted_requests++;
1658 	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1659 	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1660 	pqpair->qpair.queue_depth++;
1661 	tr->req = req;
1662 	tr->cb_fn = req->cb_fn;
1663 	tr->cb_arg = req->cb_arg;
1664 	req->cmd.cid = tr->cid;
1665 	/* Use PRP by default. This bit will be overridden below if needed. */
1666 	req->cmd.psdt = SPDK_NVME_PSDT_PRP;
1667 
1668 	if (req->payload_size != 0) {
1669 		payload_type = nvme_payload_type(&req->payload);
1670 		/* According to the specification, PRPs shall be used for all
1671 		 *  Admin commands for NVMe over PCIe implementations.
1672 		 */
1673 		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
1674 				!nvme_qpair_is_admin_queue(qpair);
1675 		mptr_sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_MPTR_SGL_SUPPORTED) != 0 &&
1676 				     !nvme_qpair_is_admin_queue(qpair);
1677 
1678 		if (sgl_supported) {
1679 			/* Don't use SGL for DSM command */
1680 			if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_NO_SGL_FOR_DSM) &&
1681 					  (req->cmd.opc == SPDK_NVME_OPC_DATASET_MANAGEMENT))) {
1682 				sgl_supported = false;
1683 			}
1684 		}
1685 
1686 		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
1687 			dword_aligned = false;
1688 		}
1689 
1690 		/* If we fail to build the request or the metadata, do not return the -EFAULT back up
1691 		 * the stack.  This ensures that we always fail these types of requests via a
1692 		 * completion callback, and never in the context of the submission.
1693 		 */
1694 		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
1695 		if (rc < 0) {
1696 			assert(rc == -EFAULT);
1697 			rc = 0;
1698 			goto exit;
1699 		}
1700 
1701 		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, mptr_sgl_supported, dword_aligned);
1702 		if (rc < 0) {
1703 			assert(rc == -EFAULT);
1704 			rc = 0;
1705 			goto exit;
1706 		}
1707 	}
1708 
1709 	nvme_pcie_qpair_submit_tracker(qpair, tr);
1710 
1711 exit:
1712 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1713 		nvme_ctrlr_unlock(ctrlr);
1714 	}
1715 
1716 	return rc;
1717 }
1718 
1719 struct spdk_nvme_transport_poll_group *
1720 nvme_pcie_poll_group_create(void)
1721 {
1722 	struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));
1723 
1724 	if (group == NULL) {
1725 		SPDK_ERRLOG("Unable to allocate poll group.\n");
1726 		return NULL;
1727 	}
1728 
1729 	return &group->group;
1730 }
1731 
1732 int
1733 nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
1734 {
1735 	return 0;
1736 }
1737 
1738 int
1739 nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
1740 {
1741 	return 0;
1742 }
1743 
1744 int
1745 nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
1746 			 struct spdk_nvme_qpair *qpair)
1747 {
1748 	return 0;
1749 }
1750 
1751 int
1752 nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
1753 			    struct spdk_nvme_qpair *qpair)
1754 {
1755 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1756 
1757 	pqpair->stat = &g_dummy_stat;
1758 	return 0;
1759 }
1760 
1761 int64_t
1762 nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
1763 		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
1764 {
1765 	struct spdk_nvme_qpair *qpair, *tmp_qpair;
1766 	int32_t local_completions = 0;
1767 	int64_t total_completions = 0;
1768 
1769 	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
1770 		disconnected_qpair_cb(qpair, tgroup->group->ctx);
1771 	}
1772 
1773 	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
1774 		local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
1775 		if (spdk_unlikely(local_completions < 0)) {
1776 			disconnected_qpair_cb(qpair, tgroup->group->ctx);
1777 			total_completions = -ENXIO;
1778 		} else if (spdk_likely(total_completions >= 0)) {
1779 			total_completions += local_completions;
1780 		}
1781 	}
1782 
1783 	return total_completions;
1784 }
1785 
1786 int
1787 nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
1788 {
1789 	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
1790 		return -EBUSY;
1791 	}
1792 
1793 	free(tgroup);
1794 
1795 	return 0;
1796 }
1797 
1798 int
1799 nvme_pcie_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup,
1800 			       struct spdk_nvme_transport_poll_group_stat **_stats)
1801 {
1802 	struct nvme_pcie_poll_group *group;
1803 	struct spdk_nvme_transport_poll_group_stat *stats;
1804 
1805 	if (tgroup == NULL || _stats == NULL) {
1806 		SPDK_ERRLOG("Invalid stats or group pointer\n");
1807 		return -EINVAL;
1808 	}
1809 
1810 	stats = calloc(1, sizeof(*stats));
1811 	if (!stats) {
1812 		SPDK_ERRLOG("Can't allocate memory for stats\n");
1813 		return -ENOMEM;
1814 	}
1815 	stats->trtype = SPDK_NVME_TRANSPORT_PCIE;
1816 	group = SPDK_CONTAINEROF(tgroup, struct nvme_pcie_poll_group, group);
1817 	memcpy(&stats->pcie, &group->stats, sizeof(group->stats));
1818 
1819 	*_stats = stats;
1820 
1821 	return 0;
1822 }
1823 
1824 void
1825 nvme_pcie_poll_group_free_stats(struct spdk_nvme_transport_poll_group *tgroup,
1826 				struct spdk_nvme_transport_poll_group_stat *stats)
1827 {
1828 	free(stats);
1829 }
1830 
1831 static void
1832 nvme_pcie_trace(void)
1833 {
1834 	struct spdk_trace_tpoint_opts opts[] = {
1835 		{
1836 			"NVME_PCIE_SUBMIT", TRACE_NVME_PCIE_SUBMIT,
1837 			OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 1,
1838 			{	{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1839 				{ "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1840 				{ "opc", SPDK_TRACE_ARG_TYPE_INT, 4 },
1841 				{ "dw10", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1842 				{ "dw11", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1843 				{ "dw12", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1844 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1845 			}
1846 		},
1847 		{
1848 			"NVME_PCIE_COMPLETE", TRACE_NVME_PCIE_COMPLETE,
1849 			OWNER_TYPE_NVME_PCIE_QP, OBJECT_NVME_PCIE_REQ, 0,
1850 			{	{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
1851 				{ "cid", SPDK_TRACE_ARG_TYPE_INT, 4 },
1852 				{ "cpl", SPDK_TRACE_ARG_TYPE_PTR, 4 },
1853 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
1854 			}
1855 		},
1856 	};
1857 
1858 	spdk_trace_register_object(OBJECT_NVME_PCIE_REQ, 'p');
1859 	spdk_trace_register_owner_type(OWNER_TYPE_NVME_PCIE_QP, 'q');
1860 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
1861 }
1862 SPDK_TRACE_REGISTER_FN(nvme_pcie_trace, "nvme_pcie", TRACE_GROUP_NVME_PCIE)
1863