xref: /spdk/lib/nvme/nvme_pcie.c (revision 1a9ed697f0c1696ba6b5819e27e68a3fbbf3b223)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2017, IBM Corporation. All rights reserved.
6  *   Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * NVMe over PCIe transport
37  */
38 
39 #include "spdk/stdinc.h"
40 #include "spdk/env.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "nvme_internal.h"
44 #include "nvme_uevent.h"
45 
46 /*
47  * Number of completion queue entries to process before ringing the
48  *  completion queue doorbell.
49  */
50 #define NVME_MIN_COMPLETIONS	(1)
51 #define NVME_MAX_COMPLETIONS	(128)
52 
53 /*
54  * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
55  *  segment.
56  */
57 #define NVME_MAX_SGL_DESCRIPTORS	(250)
58 
59 #define NVME_MAX_PRP_LIST_ENTRIES	(503)
60 
61 struct nvme_pcie_enum_ctx {
62 	struct spdk_nvme_probe_ctx *probe_ctx;
63 	struct spdk_pci_addr pci_addr;
64 	bool has_pci_addr;
65 };
66 
67 /* PCIe transport extensions for spdk_nvme_ctrlr */
68 struct nvme_pcie_ctrlr {
69 	struct spdk_nvme_ctrlr ctrlr;
70 
71 	/** NVMe MMIO register space */
72 	volatile struct spdk_nvme_registers *regs;
73 
74 	/** NVMe MMIO register size */
75 	uint64_t regs_size;
76 
77 	struct {
78 		/* BAR mapping address which contains controller memory buffer */
79 		void *bar_va;
80 
81 		/* BAR physical address which contains controller memory buffer */
82 		uint64_t bar_pa;
83 
84 		/* Controller memory buffer size in Bytes */
85 		uint64_t size;
86 
87 		/* Current offset of controller memory buffer, relative to start of BAR virt addr */
88 		uint64_t current_offset;
89 
90 		void *mem_register_addr;
91 		size_t mem_register_size;
92 	} cmb;
93 
94 	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
95 	uint32_t doorbell_stride_u32;
96 
97 	/* Opaque handle to associated PCI device. */
98 	struct spdk_pci_device *devhandle;
99 
100 	/* Flag to indicate the MMIO register has been remapped */
101 	bool is_remapped;
102 };
103 
104 struct nvme_tracker {
105 	TAILQ_ENTRY(nvme_tracker)       tq_list;
106 
107 	struct nvme_request		*req;
108 	uint16_t			cid;
109 
110 	uint16_t			rsvd0;
111 	uint32_t			rsvd1;
112 
113 	spdk_nvme_cmd_cb		cb_fn;
114 	void				*cb_arg;
115 
116 	uint64_t			prp_sgl_bus_addr;
117 
118 	/* Don't move, metadata SGL is always contiguous with Data Block SGL */
119 	struct spdk_nvme_sgl_descriptor		meta_sgl;
120 	union {
121 		uint64_t			prp[NVME_MAX_PRP_LIST_ENTRIES];
122 		struct spdk_nvme_sgl_descriptor	sgl[NVME_MAX_SGL_DESCRIPTORS];
123 	} u;
124 };
125 /*
126  * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
127  * and so that there is no padding required to meet alignment requirements.
128  */
129 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
130 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
131 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned");
132 
133 struct nvme_pcie_poll_group {
134 	struct spdk_nvme_transport_poll_group group;
135 };
136 
137 /* PCIe transport extensions for spdk_nvme_qpair */
138 struct nvme_pcie_qpair {
139 	/* Submission queue tail doorbell */
140 	volatile uint32_t *sq_tdbl;
141 
142 	/* Completion queue head doorbell */
143 	volatile uint32_t *cq_hdbl;
144 
145 	/* Submission queue */
146 	struct spdk_nvme_cmd *cmd;
147 
148 	/* Completion queue */
149 	struct spdk_nvme_cpl *cpl;
150 
151 	TAILQ_HEAD(, nvme_tracker) free_tr;
152 	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
153 
154 	/* Array of trackers indexed by command ID. */
155 	struct nvme_tracker *tr;
156 
157 	uint16_t num_entries;
158 
159 	uint8_t retry_count;
160 
161 	uint16_t max_completions_cap;
162 
163 	uint16_t last_sq_tail;
164 	uint16_t sq_tail;
165 	uint16_t cq_head;
166 	uint16_t sq_head;
167 
168 	struct {
169 		uint8_t phase			: 1;
170 		uint8_t delay_cmd_submit	: 1;
171 		uint8_t has_shadow_doorbell	: 1;
172 	} flags;
173 
174 	/*
175 	 * Base qpair structure.
176 	 * This is located after the hot data in this structure so that the important parts of
177 	 * nvme_pcie_qpair are in the same cache line.
178 	 */
179 	struct spdk_nvme_qpair qpair;
180 
181 	struct {
182 		/* Submission queue shadow tail doorbell */
183 		volatile uint32_t *sq_tdbl;
184 
185 		/* Completion queue shadow head doorbell */
186 		volatile uint32_t *cq_hdbl;
187 
188 		/* Submission queue event index */
189 		volatile uint32_t *sq_eventidx;
190 
191 		/* Completion queue event index */
192 		volatile uint32_t *cq_eventidx;
193 	} shadow_doorbell;
194 
195 	/*
196 	 * Fields below this point should not be touched on the normal I/O path.
197 	 */
198 
199 	bool sq_in_cmb;
200 
201 	uint64_t cmd_bus_addr;
202 	uint64_t cpl_bus_addr;
203 
204 	struct spdk_nvme_cmd *sq_vaddr;
205 	struct spdk_nvme_cpl *cq_vaddr;
206 };
207 
208 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
209 				  struct spdk_pci_addr *pci_addr);
210 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
211 				     const struct spdk_nvme_io_qpair_opts *opts);
212 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
213 
214 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
215 static uint16_t g_signal_lock;
216 static bool g_sigset = false;
217 static int g_hotplug_fd = -1;
218 
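/*
 * SIGBUS handler used for hot-removed devices: once a PCIe device is gone, MMIO
 * loads and stores to its BAR fault with SIGBUS.  This handler remaps the register
 * BAR of the controller currently being accessed (tracked in g_thread_mmio_ctrlr)
 * to anonymous memory filled with 0xFF, so subsequent register reads return
 * all-ones (the same pattern used elsewhere to detect removal) instead of
 * crashing the process.
 */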
219 static void
220 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
221 {
222 	void *map_address;
223 	uint16_t flag = 0;
224 
225 	if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE,
226 					 __ATOMIC_RELAXED)) {
227 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n");
228 		return;
229 	}
230 
231 	assert(g_thread_mmio_ctrlr != NULL);
232 
233 	if (!g_thread_mmio_ctrlr->is_remapped) {
234 		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
235 				   PROT_READ | PROT_WRITE,
236 				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
237 		if (map_address == MAP_FAILED) {
238 			SPDK_ERRLOG("mmap failed\n");
239 			__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
240 			return;
241 		}
242 		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
243 		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
244 		g_thread_mmio_ctrlr->is_remapped = true;
245 	}
246 	__atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
247 }
248 
249 static void
250 nvme_pcie_ctrlr_setup_signal(void)
251 {
252 	struct sigaction sa;
253 
254 	sa.sa_sigaction = nvme_sigbus_fault_sighandler;
255 	sigemptyset(&sa.sa_mask);
256 	sa.sa_flags = SA_SIGINFO;
257 	sigaction(SIGBUS, &sa, NULL);
258 }
259 
260 static inline struct nvme_pcie_ctrlr *
261 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
262 {
263 	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
264 	return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
265 }
266 
267 static int
268 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
269 {
270 	struct spdk_nvme_ctrlr *ctrlr, *tmp;
271 	struct spdk_uevent event;
272 	struct spdk_pci_addr pci_addr;
273 	union spdk_nvme_csts_register csts;
274 	struct spdk_nvme_ctrlr_process *proc;
275 
276 	while (spdk_get_uevent(g_hotplug_fd, &event) > 0) {
277 		if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
278 		    event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
279 			if (event.action == SPDK_NVME_UEVENT_ADD) {
280 				SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
281 					      event.traddr);
282 				if (spdk_process_is_primary()) {
283 					if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
284 						nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
285 					}
286 				}
287 			} else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
288 				struct spdk_nvme_transport_id trid;
289 
290 				memset(&trid, 0, sizeof(trid));
291 				spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
292 				snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
293 
294 				ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
295 				if (ctrlr == NULL) {
296 					return 0;
297 				}
298 				SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
299 					      event.traddr);
300 
301 				nvme_ctrlr_fail(ctrlr, true);
302 
303 				/* get the user app to clean up and stop I/O */
304 				if (ctrlr->remove_cb) {
305 					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
306 					ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr);
307 					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
308 				}
309 			}
310 		}
311 	}
312 
313 	/* This is a workaround for vfio-attached device hot-remove detection. */
314 	TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
315 		bool do_remove = false;
316 
317 		if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
318 			struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
319 
320 			if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
321 				do_remove = true;
322 			}
323 		}
324 
325 		/* NVMe controller BAR must be mapped in the current process before any access. */
326 		proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
327 		if (proc) {
328 			csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
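			/* An all-ones CSTS value means the MMIO read failed (or the BAR was
			 * remapped by the SIGBUS handler above), i.e. the device was hot-removed.
			 */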
329 			if (csts.raw == 0xffffffffU) {
330 				do_remove = true;
331 			}
332 		}
333 
334 		if (do_remove) {
335 			nvme_ctrlr_fail(ctrlr, true);
336 			if (ctrlr->remove_cb) {
337 				nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
338 				ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr);
339 				nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
340 			}
341 		}
342 	}
343 	return 0;
344 }
345 
346 static inline struct nvme_pcie_qpair *
347 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
348 {
349 	assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
350 	return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
351 }
352 
353 static volatile void *
354 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
355 {
356 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
357 
358 	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
359 }
360 
361 static int
362 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
363 {
364 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
365 
366 	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
367 	g_thread_mmio_ctrlr = pctrlr;
368 	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
369 	g_thread_mmio_ctrlr = NULL;
370 	return 0;
371 }
372 
373 static int
374 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
375 {
376 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
377 
378 	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
379 	g_thread_mmio_ctrlr = pctrlr;
380 	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
381 	g_thread_mmio_ctrlr = NULL;
382 	return 0;
383 }
384 
385 static int
386 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
387 {
388 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
389 
390 	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
391 	assert(value != NULL);
392 	g_thread_mmio_ctrlr = pctrlr;
393 	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
394 	g_thread_mmio_ctrlr = NULL;
395 	if (~(*value) == 0) {
396 		return -1;
397 	}
398 
399 	return 0;
400 }
401 
402 static int
403 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
404 {
405 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
406 
407 	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
408 	assert(value != NULL);
409 	g_thread_mmio_ctrlr = pctrlr;
410 	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
411 	g_thread_mmio_ctrlr = NULL;
412 	if (~(*value) == 0) {
413 		return -1;
414 	}
415 
416 	return 0;
417 }
418 
419 static int
420 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
421 {
422 	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
423 					 value);
424 }
425 
426 static int
427 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
428 {
429 	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
430 					 value);
431 }
432 
433 static int
434 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
435 {
436 	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
437 					 aqa->raw);
438 }
439 
440 static int
441 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
442 {
443 	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
444 					 &cmbloc->raw);
445 }
446 
447 static int
448 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
449 {
450 	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
451 					 &cmbsz->raw);
452 }
453 
454 static uint32_t
455 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
456 {
457 	/*
458 	 * For commands requiring more than 2 PRP entries, one PRP will be
459 	 *  embedded in the command (prp1), and the rest of the PRP entries
460 	 *  will be in a list pointed to by the command (prp2).  This means
461 	 *  that real max number of PRP entries we support is 503+1, which
462 	 *  results in a max xfer size of 503*ctrlr->page_size (~2 MiB with 4 KiB pages).
463 	 */
464 	return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
465 }
466 
467 static uint16_t
468 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
469 {
470 	return NVME_MAX_SGL_DESCRIPTORS;
471 }
472 
473 static void
474 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
475 {
476 	int rc;
477 	void *addr;
478 	uint32_t bir;
479 	union spdk_nvme_cmbsz_register cmbsz;
480 	union spdk_nvme_cmbloc_register cmbloc;
481 	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
482 
483 	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
484 	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
485 		SPDK_ERRLOG("get registers failed\n");
486 		goto exit;
487 	}
488 
489 	if (!cmbsz.bits.sz) {
490 		goto exit;
491 	}
492 
493 	bir = cmbloc.bits.bir;
494 	/* Values 0 and 2 through 5 are valid for the BAR indicator (BIR) */
495 	if (bir > 5 || bir == 1) {
496 		goto exit;
497 	}
498 
499 	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
500 	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
501 	/* controller memory buffer size in Bytes */
502 	size = unit_size * cmbsz.bits.sz;
503 	/* controller memory buffer offset from BAR in Bytes */
504 	offset = unit_size * cmbloc.bits.ofst;
505 
506 	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
507 				     &bar_phys_addr, &bar_size);
508 	if ((rc != 0) || addr == NULL) {
509 		goto exit;
510 	}
511 
512 	if (offset > bar_size) {
513 		goto exit;
514 	}
515 
516 	if (size > bar_size - offset) {
517 		goto exit;
518 	}
519 
520 	pctrlr->cmb.bar_va = addr;
521 	pctrlr->cmb.bar_pa = bar_phys_addr;
522 	pctrlr->cmb.size = size;
523 	pctrlr->cmb.current_offset = offset;
524 
525 	if (!cmbsz.bits.sqs) {
526 		pctrlr->ctrlr.opts.use_cmb_sqs = false;
527 	}
528 
529 	return;
530 exit:
531 	pctrlr->ctrlr.opts.use_cmb_sqs = false;
532 	return;
533 }
534 
535 static int
536 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
537 {
538 	int rc = 0;
539 	union spdk_nvme_cmbloc_register cmbloc;
540 	void *addr = pctrlr->cmb.bar_va;
541 
542 	if (addr) {
543 		if (pctrlr->cmb.mem_register_addr) {
544 			spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);
545 		}
546 
547 		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
548 			SPDK_ERRLOG("get_cmbloc() failed\n");
549 			return -EIO;
550 		}
551 		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
552 	}
553 	return rc;
554 }
555 
556 static int
557 nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
558 {
559 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
560 
561 	if (pctrlr->cmb.bar_va == NULL) {
562 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
563 		return -ENOTSUP;
564 	}
565 
566 	if (ctrlr->opts.use_cmb_sqs) {
567 		SPDK_ERRLOG("CMB is already in use for submission queues.\n");
568 		return -ENOTSUP;
569 	}
570 
571 	return 0;
572 }
573 
574 static void *
575 nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
576 {
577 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
578 	union spdk_nvme_cmbsz_register cmbsz;
579 	union spdk_nvme_cmbloc_register cmbloc;
580 	uint64_t mem_register_start, mem_register_end;
581 	int rc;
582 
583 	if (pctrlr->cmb.mem_register_addr != NULL) {
584 		*size = pctrlr->cmb.mem_register_size;
585 		return pctrlr->cmb.mem_register_addr;
586 	}
587 
588 	*size = 0;
589 
590 	if (pctrlr->cmb.bar_va == NULL) {
591 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
592 		return NULL;
593 	}
594 
595 	if (ctrlr->opts.use_cmb_sqs) {
596 		SPDK_ERRLOG("CMB is already in use for submission queues.\n");
597 		return NULL;
598 	}
599 
600 	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
601 	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
602 		SPDK_ERRLOG("get registers failed\n");
603 		return NULL;
604 	}
605 
606 	/* If the CMB supports only queues (neither WDS nor RDS is set), there is no data region to map. */
607 	if (!(cmbsz.bits.wds || cmbsz.bits.rds)) {
608 		return NULL;
609 	}
610 
611 	/* If CMB is less than 4MiB in size then abort CMB mapping */
612 	if (pctrlr->cmb.size < (1ULL << 22)) {
613 		return NULL;
614 	}
615 
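	/* Registration with the SPDK memory map is done in 2 MiB units, so only register
	 * the whole 2 MiB pages that fall entirely inside the CMB: round the start of the
	 * remaining CMB space up and its end down to 2 MiB boundaries.
	 */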
616 	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
617 				       VALUE_2MB - 1);
618 	mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
619 				     pctrlr->cmb.size);
622 
623 	rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start);
624 	if (rc) {
625 		SPDK_ERRLOG("spdk_mem_register() failed\n");
626 		return NULL;
627 	}
628 
629 	pctrlr->cmb.mem_register_addr = (void *)mem_register_start;
630 	pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start;
631 
632 	*size = pctrlr->cmb.mem_register_size;
633 	return pctrlr->cmb.mem_register_addr;
634 }
635 
636 static int
637 nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr)
638 {
639 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
640 	int rc;
641 
642 	if (pctrlr->cmb.mem_register_addr == NULL) {
643 		return 0;
644 	}
645 
646 	rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);
647 
648 	if (rc == 0) {
649 		pctrlr->cmb.mem_register_addr = NULL;
650 		pctrlr->cmb.mem_register_size = 0;
651 	}
652 
653 	return rc;
654 }
655 
656 static int
657 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
658 {
659 	int rc;
660 	void *addr;
661 	uint64_t phys_addr, size;
662 
663 	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
664 				     &phys_addr, &size);
665 	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
666 	if ((pctrlr->regs == NULL) || (rc != 0)) {
667 		SPDK_ERRLOG("spdk_pci_device_map_bar failed with rc %d or bar %p\n",
668 			    rc, pctrlr->regs);
669 		return -1;
670 	}
671 
672 	pctrlr->regs_size = size;
673 	nvme_pcie_ctrlr_map_cmb(pctrlr);
674 
675 	return 0;
676 }
677 
678 static int
679 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
680 {
681 	int rc = 0;
682 	void *addr = (void *)pctrlr->regs;
683 
684 	if (pctrlr->ctrlr.is_removed) {
685 		return rc;
686 	}
687 
688 	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
689 	if (rc != 0) {
690 		SPDK_ERRLOG("nvme_pcie_ctrlr_unmap_cmb failed with error code %d\n", rc);
691 		return -1;
692 	}
693 
694 	if (addr) {
695 		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
696 		 * munmap internally.
697 		 */
698 		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
699 	}
700 	return rc;
701 }
702 
703 static int
704 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
705 {
706 	struct nvme_pcie_qpair *pqpair;
707 	int rc;
708 
709 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
710 	if (pqpair == NULL) {
711 		return -ENOMEM;
712 	}
713 
714 	pqpair->num_entries = num_entries;
715 	pqpair->flags.delay_cmd_submit = 0;
716 
717 	ctrlr->adminq = &pqpair->qpair;
718 
719 	rc = nvme_qpair_init(ctrlr->adminq,
720 			     0, /* qpair ID */
721 			     ctrlr,
722 			     SPDK_NVME_QPRIO_URGENT,
723 			     num_entries);
724 	if (rc != 0) {
725 		return rc;
726 	}
727 
728 	return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
729 }
730 
731 /* This function must only be called while holding g_spdk_nvme_driver->lock */
732 static int
733 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
734 {
735 	struct spdk_nvme_transport_id trid = {};
736 	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
737 	struct spdk_nvme_ctrlr *ctrlr;
738 	struct spdk_pci_addr pci_addr;
739 
740 	pci_addr = spdk_pci_device_get_addr(pci_dev);
741 
742 	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
743 	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
744 
745 	ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
746 	if (!spdk_process_is_primary()) {
747 		if (!ctrlr) {
748 			SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
749 			return -1;
750 		}
751 
752 		return nvme_ctrlr_add_process(ctrlr, pci_dev);
753 	}
754 
755 	/* Check whether the user passed a specific PCI address. */
756 	if (enum_ctx->has_pci_addr &&
757 	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
758 		return 1;
759 	}
760 
761 	return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
762 }
763 
764 static int
765 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
766 		     bool direct_connect)
767 {
768 	struct nvme_pcie_enum_ctx enum_ctx = {};
769 
770 	enum_ctx.probe_ctx = probe_ctx;
771 
772 	if (strlen(probe_ctx->trid.traddr) != 0) {
773 		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
774 			return -1;
775 		}
776 		enum_ctx.has_pci_addr = true;
777 	}
778 
779 	/* Only the primary process can monitor hotplug. */
780 	if (spdk_process_is_primary()) {
781 		if (g_hotplug_fd < 0) {
782 			g_hotplug_fd = spdk_uevent_connect();
783 			if (g_hotplug_fd < 0) {
784 				SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
785 			}
786 		} else {
787 			_nvme_pcie_hotplug_monitor(probe_ctx);
788 		}
789 	}
790 
791 	if (enum_ctx.has_pci_addr == false) {
792 		return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
793 					  pcie_nvme_enum_cb, &enum_ctx);
794 	} else {
795 		return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
796 					      pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
797 	}
798 }
799 
800 static int
801 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
802 {
803 	struct nvme_pcie_enum_ctx enum_ctx;
804 
805 	enum_ctx.probe_ctx = probe_ctx;
806 	enum_ctx.has_pci_addr = true;
807 	enum_ctx.pci_addr = *pci_addr;
808 
809 	return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
810 }
811 
812 static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
813 		const struct spdk_nvme_ctrlr_opts *opts,
814 		void *devhandle)
815 {
816 	struct spdk_pci_device *pci_dev = devhandle;
817 	struct nvme_pcie_ctrlr *pctrlr;
818 	union spdk_nvme_cap_register cap;
819 	union spdk_nvme_vs_register vs;
820 	uint32_t cmd_reg;
821 	int rc;
822 	struct spdk_pci_id pci_id;
823 
824 	rc = spdk_pci_device_claim(pci_dev);
825 	if (rc < 0) {
826 		SPDK_ERRLOG("could not claim device %s (%s)\n",
827 			    trid->traddr, spdk_strerror(-rc));
828 		return NULL;
829 	}
830 
831 	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
832 			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
833 	if (pctrlr == NULL) {
834 		spdk_pci_device_unclaim(pci_dev);
835 		SPDK_ERRLOG("could not allocate ctrlr\n");
836 		return NULL;
837 	}
838 
839 	pctrlr->is_remapped = false;
840 	pctrlr->ctrlr.is_removed = false;
841 	pctrlr->devhandle = devhandle;
842 	pctrlr->ctrlr.opts = *opts;
843 	pctrlr->ctrlr.trid = *trid;
844 
845 	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
846 	if (rc != 0) {
847 		spdk_pci_device_unclaim(pci_dev);
848 		spdk_free(pctrlr);
849 		return NULL;
850 	}
851 
852 	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
853 	if (rc != 0) {
854 		spdk_pci_device_unclaim(pci_dev);
855 		spdk_free(pctrlr);
856 		return NULL;
857 	}
858 
859 	/* Enable PCI busmaster and disable INTx */
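	/* In the PCI command register, bit 2 (0x4) is Bus Master Enable and bit 10
	 * (0x400) is Interrupt Disable, hence the 0x404 below.
	 */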
860 	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
861 	cmd_reg |= 0x404;
862 	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
863 
864 	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
865 		SPDK_ERRLOG("get_cap() failed\n");
866 		spdk_pci_device_unclaim(pci_dev);
867 		spdk_free(pctrlr);
868 		return NULL;
869 	}
870 
871 	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
872 		SPDK_ERRLOG("get_vs() failed\n");
873 		spdk_pci_device_unclaim(pci_dev);
874 		spdk_free(pctrlr);
875 		return NULL;
876 	}
877 
878 	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
879 
880 	/* The doorbell stride is 2 ^ (2 + dstrd) bytes; doorbell_stride_u32 is in
881 	 * uint32_t (4-byte) units, so drop the + 2 */
882 	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
883 
884 	pci_id = spdk_pci_device_get_id(pci_dev);
885 	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
886 
887 	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size);
888 	if (rc != 0) {
889 		nvme_ctrlr_destruct(&pctrlr->ctrlr);
890 		return NULL;
891 	}
892 
893 	/* Construct the primary process properties */
894 	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
895 	if (rc != 0) {
896 		nvme_ctrlr_destruct(&pctrlr->ctrlr);
897 		return NULL;
898 	}
899 
900 	if (g_sigset != true) {
901 		nvme_pcie_ctrlr_setup_signal();
902 		g_sigset = true;
903 	}
904 
905 	return &pctrlr->ctrlr;
906 }
907 
908 static int
909 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
910 {
911 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
912 	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
913 	union spdk_nvme_aqa_register aqa;
914 
915 	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
916 		SPDK_ERRLOG("set_asq() failed\n");
917 		return -EIO;
918 	}
919 
920 	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
921 		SPDK_ERRLOG("set_acq() failed\n");
922 		return -EIO;
923 	}
924 
925 	aqa.raw = 0;
926 	/* acqs and asqs are 0-based. */
927 	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
928 	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
929 
930 	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
931 		SPDK_ERRLOG("set_aqa() failed\n");
932 		return -EIO;
933 	}
934 
935 	return 0;
936 }
937 
938 static int
939 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
940 {
941 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
942 	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
943 
944 	if (ctrlr->adminq) {
945 		nvme_pcie_qpair_destroy(ctrlr->adminq);
946 	}
947 
948 	nvme_ctrlr_destruct_finish(ctrlr);
949 
950 	nvme_ctrlr_free_processes(ctrlr);
951 
952 	nvme_pcie_ctrlr_free_bars(pctrlr);
953 
954 	if (devhandle) {
955 		spdk_pci_device_unclaim(devhandle);
956 		spdk_pci_device_detach(devhandle);
957 	}
958 
959 	spdk_free(pctrlr);
960 
961 	return 0;
962 }
963 
964 static void
965 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
966 {
967 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
968 	tr->cid = cid;
969 	tr->req = NULL;
970 }
971 
972 static int
973 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
974 {
975 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
976 	uint32_t i;
977 
978 	/* all head/tail vals are set to 0 */
979 	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;
980 
981 	/*
982 	 * The first time through the completion queue, HW will set the phase
983 	 *  bit on completions to 1.  So set this to 1 here, indicating that
984 	 *  we're looking for a 1 to know which entries have completed.
985 	 *  We'll toggle the expected phase bit each time the completion
986 	 *  queue rolls over.
987 	 */
988 	pqpair->flags.phase = 1;
989 	for (i = 0; i < pqpair->num_entries; i++) {
990 		pqpair->cpl[i].status.p = 0;
991 	}
992 
993 	return 0;
994 }
995 
996 static void *
997 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
998 			  uint64_t *phys_addr)
999 {
1000 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
1001 	uintptr_t addr;
1002 
1003 	if (pctrlr->cmb.mem_register_addr != NULL) {
1004 		/* BAR is mapped for data */
1005 		return NULL;
1006 	}
1007 
1008 	addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
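	/* Round the current CMB offset up to the requested alignment, which is assumed
	 * to be a power of two.
	 */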
1009 	addr = (addr + (alignment - 1)) & ~(alignment - 1);
1010 
1011 	/* CMB may only consume part of the BAR, calculate accordingly */
1012 	if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
1013 		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
1014 		return NULL;
1015 	}
1016 	*phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;
1017 
1018 	pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;
1019 
1020 	return (void *)addr;
1021 }
1022 
1023 static int
1024 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
1025 			  const struct spdk_nvme_io_qpair_opts *opts)
1026 {
1027 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
1028 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
1029 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1030 	struct nvme_tracker	*tr;
1031 	uint16_t		i;
1032 	volatile uint32_t	*doorbell_base;
1033 	uint16_t		num_trackers;
1034 	size_t			page_align = sysconf(_SC_PAGESIZE);
1035 	size_t			queue_align, queue_len;
1036 	uint32_t                flags = SPDK_MALLOC_DMA;
1037 	uint64_t		sq_paddr = 0;
1038 	uint64_t		cq_paddr = 0;
1039 
1040 	if (opts) {
1041 		pqpair->sq_vaddr = opts->sq.vaddr;
1042 		pqpair->cq_vaddr = opts->cq.vaddr;
1043 		sq_paddr = opts->sq.paddr;
1044 		cq_paddr = opts->cq.paddr;
1045 	}
1046 
1047 	pqpair->retry_count = ctrlr->opts.transport_retry_count;
1048 
1049 	/*
1050 	 * Limit the maximum number of completions to return per call to prevent wraparound,
1051 	 * and calculate how many trackers can be submitted at once without overflowing the
1052 	 * completion queue.
1053 	 */
1054 	pqpair->max_completions_cap = pqpair->num_entries / 4;
1055 	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
1056 	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
1057 	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
1058 
1059 	SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
1060 		     pqpair->max_completions_cap, num_trackers);
1061 
1062 	assert(num_trackers != 0);
1063 
1064 	pqpair->sq_in_cmb = false;
1065 
1066 	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
1067 		flags |= SPDK_MALLOC_SHARE;
1068 	}
1069 
1070 	/* cmd and cpl rings must be aligned on page size boundaries. */
1071 	if (ctrlr->opts.use_cmb_sqs) {
1072 		pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1073 							page_align, &pqpair->cmd_bus_addr);
1074 		if (pqpair->cmd != NULL) {
1075 			pqpair->sq_in_cmb = true;
1076 		}
1077 	}
1078 
1079 	if (pqpair->sq_in_cmb == false) {
1080 		if (pqpair->sq_vaddr) {
1081 			pqpair->cmd = pqpair->sq_vaddr;
1082 		} else {
1083 			/* To ensure physical address contiguity we make each ring occupy
1084 			 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
1085 			 */
1086 			queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
1087 			queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
1088 			pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
1089 			if (pqpair->cmd == NULL) {
1090 				SPDK_ERRLOG("alloc qpair_cmd failed\n");
1091 				return -ENOMEM;
1092 			}
1093 		}
1094 		if (sq_paddr) {
1095 			assert(pqpair->sq_vaddr != NULL);
1096 			pqpair->cmd_bus_addr = sq_paddr;
1097 		} else {
1098 			pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
1099 			if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
1100 				SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
1101 				return -EFAULT;
1102 			}
1103 		}
1104 	}
1105 
1106 	if (pqpair->cq_vaddr) {
1107 		pqpair->cpl = pqpair->cq_vaddr;
1108 	} else {
1109 		queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
1110 		queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
1111 		pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
1112 		if (pqpair->cpl == NULL) {
1113 			SPDK_ERRLOG("alloc qpair_cpl failed\n");
1114 			return -ENOMEM;
1115 		}
1116 	}
1117 	if (cq_paddr) {
1118 		assert(pqpair->cq_vaddr != NULL);
1119 		pqpair->cpl_bus_addr = cq_paddr;
1120 	} else {
1121 		pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
1122 		if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
1123 			SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
1124 			return -EFAULT;
1125 		}
1126 	}
1127 
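	/* Per the NVMe spec, the SQ tail and CQ head doorbells of queue pair N sit at
	 * offsets (2 * N) and (2 * N + 1) strides from the doorbell base, where one
	 * stride is 4 << CAP.DSTRD bytes (doorbell_stride_u32 expresses it in uint32_t
	 * units).  For example, with DSTRD = 0, qpair 1 uses register offsets 0x1008
	 * (SQ tail) and 0x100C (CQ head).
	 */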
1128 	doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
1129 	pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
1130 	pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
1131 
1132 	/*
1133 	 * Reserve space for all of the trackers in a single allocation.
1134 	 *   struct nvme_tracker must be padded so that its size is already a power of 2.
1135 	 *   This ensures the PRP list embedded in the nvme_tracker object will not span a
1136 	 *   4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
1137 	 */
1138 	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
1139 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1140 	if (pqpair->tr == NULL) {
1141 		SPDK_ERRLOG("nvme_tr failed\n");
1142 		return -ENOMEM;
1143 	}
1144 
1145 	TAILQ_INIT(&pqpair->free_tr);
1146 	TAILQ_INIT(&pqpair->outstanding_tr);
1147 
1148 	for (i = 0; i < num_trackers; i++) {
1149 		tr = &pqpair->tr[i];
1150 		nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
1151 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1152 	}
1153 
1154 	nvme_pcie_qpair_reset(qpair);
1155 
1156 	return 0;
1157 }
1158 
1159 static inline void
1160 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
1161 {
1162 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
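	/* On x86, use non-temporal (streaming) 16-byte stores so that the 64-byte
	 * command is written without pulling the destination SQ entry into the cache.
	 */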
1163 #if defined(__SSE2__)
1164 	__m128i *d128 = (__m128i *)dst;
1165 	const __m128i *s128 = (const __m128i *)src;
1166 
1167 	_mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
1168 	_mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
1169 	_mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
1170 	_mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
1171 #else
1172 	*dst = *src;
1173 #endif
1174 }
1175 
1176 /**
1177  * Note: the ctrlr_lock must be held when calling this function.
1178  */
1179 static void
1180 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
1181 		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
1182 {
1183 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
1184 	struct nvme_request		*active_req = req;
1185 	struct spdk_nvme_ctrlr_process	*active_proc;
1186 
1187 	/*
1188 	 * The admin request is from another process. Move to the per
1189 	 *  process list for that process to handle it later.
1190 	 */
1191 	assert(nvme_qpair_is_admin_queue(qpair));
1192 	assert(active_req->pid != getpid());
1193 
1194 	active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
1195 	if (active_proc) {
1196 		/* Save the original completion information */
1197 		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
1198 		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
1199 	} else {
1200 		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
1201 			    active_req->pid);
1202 
1203 		nvme_free_request(active_req);
1204 	}
1205 }
1206 
1207 /**
1208  * Note: the ctrlr_lock must be held when calling this function.
1209  */
1210 static void
1211 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
1212 {
1213 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
1214 	struct nvme_request		*req, *tmp_req;
1215 	pid_t				pid = getpid();
1216 	struct spdk_nvme_ctrlr_process	*proc;
1217 
1218 	/*
1219 	 * Check whether there is any pending admin request from
1220 	 * other active processes.
1221 	 */
1222 	assert(nvme_qpair_is_admin_queue(qpair));
1223 
1224 	proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
1225 	if (!proc) {
1226 		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
1227 		assert(proc);
1228 		return;
1229 	}
1230 
1231 	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
1232 		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
1233 
1234 		assert(req->pid == pid);
1235 
1236 		nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
1237 		nvme_free_request(req);
1238 	}
1239 }
1240 
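/*
 * Shadow doorbell helper: returns true when the value just written (new_idx) has
 * passed the controller's EventIdx (event_idx) since the previous write (old),
 * using wrap-safe 16-bit arithmetic.  This mirrors the vring_need_event() check
 * from virtio and decides whether an MMIO doorbell write is still required.
 * For example, old = 10, event_idx = 12, new_idx = 15 crosses the EventIdx, so
 * the MMIO doorbell must still be written.
 */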
1241 static inline int
1242 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
1243 {
1244 	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
1245 }
1246 
1247 static bool
1248 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
1249 				     volatile uint32_t *shadow_db,
1250 				     volatile uint32_t *eventidx)
1251 {
1252 	uint16_t old;
1253 
1254 	if (!shadow_db) {
1255 		return true;
1256 	}
1257 
1258 	old = *shadow_db;
1259 	*shadow_db = value;
1260 
1261 	/*
1262 	 * Ensure that the doorbell is updated before reading the EventIdx from
1263 	 * memory
1264 	 */
1265 	spdk_mb();
1266 
1267 	if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
1268 		return false;
1269 	}
1270 
1271 	return true;
1272 }
1273 
1274 static inline void
1275 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
1276 {
1277 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1278 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1279 	bool need_mmio = true;
1280 
1281 	if (qpair->first_fused_submitted) {
1282 		/* This is the first cmd of two fused commands - don't ring the doorbell */
1283 		qpair->first_fused_submitted = 0;
1284 		return;
1285 	}
1286 
1287 	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1288 		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1289 				pqpair->sq_tail,
1290 				pqpair->shadow_doorbell.sq_tdbl,
1291 				pqpair->shadow_doorbell.sq_eventidx);
1292 	}
1293 
1294 	if (spdk_likely(need_mmio)) {
1295 		spdk_wmb();
1296 		g_thread_mmio_ctrlr = pctrlr;
1297 		spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
1298 		g_thread_mmio_ctrlr = NULL;
1299 	}
1300 }
1301 
1302 static inline void
1303 nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
1304 {
1305 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1306 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1307 	bool need_mmio = true;
1308 
1309 	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1310 		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1311 				pqpair->cq_head,
1312 				pqpair->shadow_doorbell.cq_hdbl,
1313 				pqpair->shadow_doorbell.cq_eventidx);
1314 	}
1315 
1316 	if (spdk_likely(need_mmio)) {
1317 		g_thread_mmio_ctrlr = pctrlr;
1318 		spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
1319 		g_thread_mmio_ctrlr = NULL;
1320 	}
1321 }
1322 
1323 static void
1324 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1325 {
1326 	struct nvme_request	*req;
1327 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1328 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
1329 
1330 	req = tr->req;
1331 	assert(req != NULL);
1332 
1333 	if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) {
1334 		/* This is the first cmd of two fused commands - don't ring the doorbell */
1335 		qpair->first_fused_submitted = 1;
1336 	}
1337 
1338 	/* Don't use wide instructions to copy the NVMe command into a CMB-resident SQ;
1339 	 * the QEMU virtual NVMe controller limits the access width to 8 bytes at a time.
1340 	 */
1341 	if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
1342 		pqpair->cmd[pqpair->sq_tail] = req->cmd;
1343 	} else {
1344 		/* Copy the command from the tracker to the submission queue. */
1345 		nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
1346 	}
1347 
1348 	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
1349 		pqpair->sq_tail = 0;
1350 	}
1351 
1352 	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
1353 		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
1354 	}
1355 
1356 	if (!pqpair->flags.delay_cmd_submit) {
1357 		nvme_pcie_qpair_ring_sq_doorbell(qpair);
1358 	}
1359 }
1360 
1361 static void
1362 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1363 				 struct spdk_nvme_cpl *cpl, bool print_on_error)
1364 {
1365 	struct nvme_pcie_qpair		*pqpair = nvme_pcie_qpair(qpair);
1366 	struct nvme_request		*req;
1367 	bool				retry, error;
1368 	bool				req_from_current_proc = true;
1369 
1370 	req = tr->req;
1371 
1372 	assert(req != NULL);
1373 
1374 	error = spdk_nvme_cpl_is_error(cpl);
1375 	retry = error && nvme_completion_is_retry(cpl) &&
1376 		req->retries < pqpair->retry_count;
1377 
1378 	if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
1379 		spdk_nvme_qpair_print_command(qpair, &req->cmd);
1380 		spdk_nvme_qpair_print_completion(qpair, cpl);
1381 	}
1382 
1383 	assert(cpl->cid == req->cmd.cid);
1384 
1385 	if (retry) {
1386 		req->retries++;
1387 		nvme_pcie_qpair_submit_tracker(qpair, tr);
1388 	} else {
1389 		/* Only check admin requests from different processes. */
1390 		if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
1391 			req_from_current_proc = false;
1392 			nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
1393 		} else {
1394 			nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
1395 		}
1396 
1397 		if (req_from_current_proc == true) {
1398 			nvme_qpair_free_request(qpair, req);
1399 		}
1400 
1401 		tr->req = NULL;
1402 
1403 		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
1404 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1405 	}
1406 }
1407 
1408 static void
1409 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
1410 					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
1411 					bool print_on_error)
1412 {
1413 	struct spdk_nvme_cpl	cpl;
1414 
1415 	memset(&cpl, 0, sizeof(cpl));
1416 	cpl.sqid = qpair->id;
1417 	cpl.cid = tr->cid;
1418 	cpl.status.sct = sct;
1419 	cpl.status.sc = sc;
1420 	cpl.status.dnr = dnr;
1421 	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
1422 }
1423 
1424 static void
1425 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1426 {
1427 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1428 	struct nvme_tracker *tr, *temp, *last;
1429 
1430 	last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);
1431 
1432 	/* Abort previously submitted (outstanding) trs */
1433 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
1434 		if (!qpair->ctrlr->opts.disable_error_logging) {
1435 			SPDK_ERRLOG("aborting outstanding command\n");
1436 		}
1437 		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1438 							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
1439 
1440 		if (tr == last) {
1441 			break;
1442 		}
1443 	}
1444 }
1445 
1446 static void
1447 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1448 {
1449 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1450 	struct nvme_tracker	*tr;
1451 
1452 	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1453 	while (tr != NULL) {
1454 		assert(tr->req != NULL);
1455 		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1456 			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
1457 								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
1458 								false);
1459 			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1460 		} else {
1461 			tr = TAILQ_NEXT(tr, tq_list);
1462 		}
1463 	}
1464 }
1465 
1466 static void
1467 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
1468 {
1469 	nvme_pcie_admin_qpair_abort_aers(qpair);
1470 }
1471 
1472 static int
1473 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1474 {
1475 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1476 
1477 	if (nvme_qpair_is_admin_queue(qpair)) {
1478 		nvme_pcie_admin_qpair_destroy(qpair);
1479 	}
1480 	/*
1481 	 * We check sq_vaddr and cq_vaddr to see if the user specified the memory
1482 	 * buffers when creating the I/O queue.
1483 	 * If the user specified them, we cannot free that memory.
1484 	 * Nor do we free it if it's in the CMB.
1485 	 */
1486 	if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
1487 		spdk_free(pqpair->cmd);
1488 	}
1489 	if (!pqpair->cq_vaddr && pqpair->cpl) {
1490 		spdk_free(pqpair->cpl);
1491 	}
1492 	if (pqpair->tr) {
1493 		spdk_free(pqpair->tr);
1494 	}
1495 
1496 	nvme_qpair_deinit(qpair);
1497 
1498 	spdk_free(pqpair);
1499 
1500 	return 0;
1501 }
1502 
1503 static void
1504 nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1505 {
1506 	nvme_pcie_qpair_abort_trackers(qpair, dnr);
1507 }
1508 
1509 static int
1510 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
1511 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
1512 				 void *cb_arg)
1513 {
1514 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1515 	struct nvme_request *req;
1516 	struct spdk_nvme_cmd *cmd;
1517 
1518 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1519 	if (req == NULL) {
1520 		return -ENOMEM;
1521 	}
1522 
1523 	cmd = &req->cmd;
1524 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
1525 
1526 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
1527 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
1528 
1529 	cmd->cdw11_bits.create_io_cq.pc = 1;
1530 	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
1531 
1532 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1533 }
1534 
1535 static int
1536 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
1537 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1538 {
1539 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1540 	struct nvme_request *req;
1541 	struct spdk_nvme_cmd *cmd;
1542 
1543 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1544 	if (req == NULL) {
1545 		return -ENOMEM;
1546 	}
1547 
1548 	cmd = &req->cmd;
1549 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
1550 
1551 	cmd->cdw10_bits.create_io_q.qid = io_que->id;
1552 	cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
1553 	cmd->cdw11_bits.create_io_sq.pc = 1;
1554 	cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
1555 	cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
1556 	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
1557 
1558 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1559 }
1560 
1561 static int
1562 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1563 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1564 {
1565 	struct nvme_request *req;
1566 	struct spdk_nvme_cmd *cmd;
1567 
1568 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1569 	if (req == NULL) {
1570 		return -ENOMEM;
1571 	}
1572 
1573 	cmd = &req->cmd;
1574 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
1575 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
1576 
1577 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1578 }
1579 
1580 static int
1581 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1582 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1583 {
1584 	struct nvme_request *req;
1585 	struct spdk_nvme_cmd *cmd;
1586 
1587 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1588 	if (req == NULL) {
1589 		return -ENOMEM;
1590 	}
1591 
1592 	cmd = &req->cmd;
1593 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
1594 	cmd->cdw10_bits.delete_io_q.qid = qpair->id;
1595 
1596 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1597 }
1598 
1599 static int
1600 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1601 				 uint16_t qid)
1602 {
1603 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
1604 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1605 	struct nvme_completion_poll_status	*status;
1606 	int					rc;
1607 
1608 	status = calloc(1, sizeof(*status));
1609 	if (!status) {
1610 		SPDK_ERRLOG("Failed to allocate status tracker\n");
1611 		return -ENOMEM;
1612 	}
1613 
1614 	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
1615 	if (rc != 0) {
1616 		free(status);
1617 		return rc;
1618 	}
1619 
1620 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) {
1621 		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
1622 		if (!status->timed_out) {
1623 			free(status);
1624 		}
1625 		return -1;
1626 	}
1627 
1628 	memset(status, 0, sizeof(*status));
1629 	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
1630 	if (rc != 0) {
1631 		free(status);
1632 		return rc;
1633 	}
1634 
1635 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) {
1636 		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
1637 		if (status->timed_out) {
1638 			/* The request is still queued; its status will be freed by the completion
1639 			   callback, so allocate a new status tracker. */
1640 			status = calloc(1, sizeof(*status));
1641 			if (!status) {
1642 				SPDK_ERRLOG("Failed to allocate status tracker\n");
1643 				return -ENOMEM;
1644 			}
1645 		}
1646 
1647 		memset(status, 0, sizeof(*status));
1648 		/* Attempt to delete the completion queue */
1649 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
1650 		if (rc != 0) {
1651 			/* The original or newly allocated status structure can be freed since
1652 			 * the corresponding request has either completed or failed to submit */
1653 			free(status);
1654 			return -1;
1655 		}
1656 		spdk_nvme_wait_for_completion(ctrlr->adminq, status);
1657 		if (!status->timed_out) {
1658 			/* status can be freed regardless of spdk_nvme_wait_for_completion return value */
1659 			free(status);
1660 		}
1661 		return -1;
1662 	}
1663 
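	/* If the controller accepted a Doorbell Buffer Config command during initialization
	 * (typically an emulated controller, e.g. QEMU), ctrlr->shadow_doorbell and
	 * ctrlr->eventidx point to host-memory arrays laid out like the real doorbell
	 * registers.  Wire up this qpair's entries so MMIO doorbell writes can be elided
	 * when the controller is not waiting for them.
	 */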
1664 	if (ctrlr->shadow_doorbell) {
1665 		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
1666 						  pctrlr->doorbell_stride_u32;
1667 		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
1668 						  pctrlr->doorbell_stride_u32;
1669 		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
1670 						      pctrlr->doorbell_stride_u32;
1671 		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
1672 						      pctrlr->doorbell_stride_u32;
1673 		pqpair->flags.has_shadow_doorbell = 1;
1674 	} else {
1675 		pqpair->flags.has_shadow_doorbell = 0;
1676 	}
1677 	nvme_pcie_qpair_reset(qpair);
1678 	free(status);
1679 
1680 	return 0;
1681 }
1682 
1683 static struct spdk_nvme_qpair *
1684 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1685 				const struct spdk_nvme_io_qpair_opts *opts)
1686 {
1687 	struct nvme_pcie_qpair *pqpair;
1688 	struct spdk_nvme_qpair *qpair;
1689 	int rc;
1690 
1691 	assert(ctrlr != NULL);
1692 
1693 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1694 			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1695 	if (pqpair == NULL) {
1696 		return NULL;
1697 	}
1698 
1699 	pqpair->num_entries = opts->io_queue_size;
1700 	pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;
1701 
1702 	qpair = &pqpair->qpair;
1703 
1704 	rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
1705 	if (rc != 0) {
1706 		nvme_pcie_qpair_destroy(qpair);
1707 		return NULL;
1708 	}
1709 
1710 	rc = nvme_pcie_qpair_construct(qpair, opts);
1711 
1712 	if (rc != 0) {
1713 		nvme_pcie_qpair_destroy(qpair);
1714 		return NULL;
1715 	}
1716 
1717 	return qpair;
1718 }
1719 
1720 static int
1721 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1722 {
1723 	if (nvme_qpair_is_admin_queue(qpair)) {
1724 		return 0;
1725 	} else {
1726 		return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
1727 	}
1728 }
1729 
1730 static void
1731 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1732 {
1733 }
1734 
1735 static int
1736 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1737 {
1738 	struct nvme_completion_poll_status *status;
1739 	int rc;
1740 
1741 	assert(ctrlr != NULL);
1742 
1743 	if (ctrlr->is_removed) {
1744 		goto free;
1745 	}
1746 
1747 	status = calloc(1, sizeof(*status));
1748 	if (!status) {
1749 		SPDK_ERRLOG("Failed to allocate status tracker\n");
1750 		return -ENOMEM;
1751 	}
1752 
1753 	/* Delete the I/O submission queue */
1754 	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
1755 	if (rc != 0) {
1756 		SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1757 		free(status);
1758 		return rc;
1759 	}
1760 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) {
1761 		if (!status->timed_out) {
1762 			free(status);
1763 		}
1764 		return -1;
1765 	}
1766 
1767 	memset(status, 0, sizeof(*status));
1768 	/* Delete the completion queue */
1769 	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
1770 	if (rc != 0) {
1771 		SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1772 		free(status);
1773 		return rc;
1774 	}
1775 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, status)) {
1776 		if (!status->timed_out) {
1777 			free(status);
1778 		}
1779 		return -1;
1780 	}
1781 	free(status);
1782 
1783 free:
1784 	if (qpair->no_deletion_notification_needed == 0) {
1785 		/* Abort the rest of the I/O */
1786 		nvme_pcie_qpair_abort_trackers(qpair, 1);
1787 	}
1788 
1789 	nvme_pcie_qpair_destroy(qpair);
1790 	return 0;
1791 }
1792 
1793 static void
1794 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1795 {
1796 	/*
1797 	 * Bad vtophys translation, so abort this request and return
1798 	 *  immediately.
1799 	 */
1800 	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1801 						SPDK_NVME_SC_INVALID_FIELD,
1802 						1 /* do not retry */, true);
1803 }
1804 
1805 /*
1806  * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1807  *
1808  * *prp_index will be updated to account for the number of PRP entries used.
1809  */
1810 static inline int
1811 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
1812 			  uint32_t page_size)
1813 {
1814 	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1815 	uintptr_t page_mask = page_size - 1;
1816 	uint64_t phys_addr;
1817 	uint32_t i;
1818 
1819 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
1820 		      *prp_index, virt_addr, (uint32_t)len);
1821 
1822 	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1823 		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1824 		return -EFAULT;
1825 	}
1826 
1827 	i = *prp_index;
1828 	while (len) {
1829 		uint32_t seg_len;
1830 
1831 		/*
1832 		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1833 		 * so prp_index == count is valid.
1834 		 */
1835 		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1836 			SPDK_ERRLOG("out of PRP entries\n");
1837 			return -EFAULT;
1838 		}
1839 
1840 		phys_addr = spdk_vtophys(virt_addr, NULL);
1841 		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1842 			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1843 			return -EFAULT;
1844 		}
1845 
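		/* PRP1 may start at any dword-aligned offset within a page; every
		 *  subsequent PRP entry must be page aligned.
		 */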
1846 		if (i == 0) {
1847 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
1848 			cmd->dptr.prp.prp1 = phys_addr;
1849 			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1850 		} else {
1851 			if ((phys_addr & page_mask) != 0) {
1852 				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1853 				return -EFAULT;
1854 			}
1855 
1856 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1857 			tr->u.prp[i - 1] = phys_addr;
1858 			seg_len = page_size;
1859 		}
1860 
1861 		seg_len = spdk_min(seg_len, len);
1862 		virt_addr += seg_len;
1863 		len -= seg_len;
1864 		i++;
1865 	}
1866 
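	/* PRP2 is unused when the transfer fits in a single PRP entry, holds the
	 *  second page address when exactly two entries are needed, and otherwise
	 *  points to the PRP list.
	 */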
1867 	cmd->psdt = SPDK_NVME_PSDT_PRP;
1868 	if (i <= 1) {
1869 		cmd->dptr.prp.prp2 = 0;
1870 	} else if (i == 2) {
1871 		cmd->dptr.prp.prp2 = tr->u.prp[0];
1872 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1873 	} else {
1874 		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1875 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1876 	}
1877 
1878 	*prp_index = i;
1879 	return 0;
1880 }
1881 
1882 static int
1883 nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
1884 				      struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
1885 {
1886 	assert(0);
1887 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1888 	return -EINVAL;
1889 }
1890 
1891 /**
1892  * Build PRP list describing physically contiguous payload buffer.
1893  */
1894 static int
1895 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1896 				     struct nvme_tracker *tr, bool dword_aligned)
1897 {
1898 	uint32_t prp_index = 0;
1899 	int rc;
1900 
1901 	rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
1902 				       req->payload_size, qpair->ctrlr->page_size);
1903 	if (rc) {
1904 		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1905 	}
1906 
1907 	return rc;
1908 }
1909 
1910 /**
1911  * Build an SGL describing a physically contiguous payload buffer.
1912  *
1913  * This can be more efficient than PRP because a large physically contiguous
1914  * buffer can be described by a single SGL descriptor.
1915  */
1916 static int
1917 nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1918 		struct nvme_tracker *tr, bool dword_aligned)
1919 {
1920 	void *virt_addr;
1921 	uint64_t phys_addr, mapping_length;
1922 	uint32_t length;
1923 	struct spdk_nvme_sgl_descriptor *sgl;
1924 	uint32_t nseg = 0;
1925 
1926 	assert(req->payload_size != 0);
1927 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
1928 
1929 	sgl = tr->u.sgl;
1930 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1931 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1932 
1933 	length = req->payload_size;
1934 	virt_addr = req->payload.contig_or_cb_arg + req->payload_offset;
1935 	mapping_length = length;
1936 
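	/* Walk the buffer and emit one SGL data block descriptor per physically
	 *  contiguous region reported by spdk_vtophys().
	 */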
1937 	while (length > 0) {
1938 		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1939 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1940 			return -EFAULT;
1941 		}
1942 
1943 		if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
1944 			SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1945 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1946 			return -EFAULT;
1947 		}
1948 
1949 		phys_addr = spdk_vtophys(virt_addr, &mapping_length);
1950 		if (phys_addr == SPDK_VTOPHYS_ERROR) {
1951 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1952 			return -EFAULT;
1953 		}
1954 
1955 		mapping_length = spdk_min(length, mapping_length);
1956 
1957 		length -= mapping_length;
1958 		virt_addr += mapping_length;
1959 
1960 		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1961 		sgl->unkeyed.length = mapping_length;
1962 		sgl->address = phys_addr;
1963 		sgl->unkeyed.subtype = 0;
1964 
1965 		sgl++;
1966 		nseg++;
1967 	}
1968 
1969 	if (nseg == 1) {
1970 		/*
1971 		 * The whole transfer can be described by a single SGL descriptor.
1972 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1973 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1974 		 *  SGL element into SGL1.
1975 		 */
1976 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1977 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1978 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1979 	} else {
1980 		/* The SPDK NVMe driver currently builds only a single SGL segment; this is
1981 		 *  sufficient because NVME_MAX_SGL_DESCRIPTORS * 16 bytes is less than one page.
1982 		 */
1983 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1984 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1985 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1986 	}
1987 
1988 	return 0;
1989 }
1990 
1991 /**
1992  * Build SGL list describing scattered payload buffer.
1993  */
1994 static int
1995 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1996 				     struct nvme_tracker *tr, bool dword_aligned)
1997 {
1998 	int rc;
1999 	void *virt_addr;
2000 	uint64_t phys_addr;
2001 	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
2002 	struct spdk_nvme_sgl_descriptor *sgl;
2003 	uint32_t nseg = 0;
2004 
2005 	/*
2006 	 * Build scattered payloads.
2007 	 */
2008 	assert(req->payload_size != 0);
2009 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
2010 	assert(req->payload.reset_sgl_fn != NULL);
2011 	assert(req->payload.next_sge_fn != NULL);
2012 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
2013 
2014 	sgl = tr->u.sgl;
2015 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
2016 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
2017 
2018 	remaining_transfer_len = req->payload_size;
2019 
2020 	while (remaining_transfer_len > 0) {
2021 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
2022 					      &virt_addr, &remaining_user_sge_len);
2023 		if (rc) {
2024 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2025 			return -EFAULT;
2026 		}
2027 
2028 		/* Bit Bucket SGL descriptor */
2029 		if ((uint64_t)virt_addr == UINT64_MAX) {
2030 			/* TODO: enable WRITE and COMPARE when necessary */
2031 			if (req->cmd.opc != SPDK_NVME_OPC_READ) {
2032 				SPDK_ERRLOG("Bit bucket SGL descriptors are only supported for READ commands\n");
2033 				goto exit;
2034 			}
2035 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
2036 				SPDK_ERRLOG("Too many SGL entries\n");
2037 				goto exit;
2038 			}
2039 
2040 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
2041 			/* If the bit bucket is part of a destination data buffer, the controller
2042 			 * discards that data, and its length is still counted in the Number of
2043 			 * Logical Blocks (NLB) parameter. Otherwise the length is not included
2044 			 * in the NLB parameter.
2045 			 */
2046 			remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
2047 			remaining_transfer_len -= remaining_user_sge_len;
2048 
2049 			sgl->unkeyed.length = remaining_user_sge_len;
2050 			sgl->address = 0;
2051 			sgl->unkeyed.subtype = 0;
2052 
2053 			sgl++;
2054 			nseg++;
2055 
2056 			continue;
2057 		}
2058 
2059 		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
2060 		remaining_transfer_len -= remaining_user_sge_len;
2061 		while (remaining_user_sge_len > 0) {
2062 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
2063 				SPDK_ERRLOG("Too many SGL entries\n");
2064 				goto exit;
2065 			}
2066 
2067 			if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
2068 				SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
2069 				goto exit;
2070 			}
2071 
2072 			phys_addr = spdk_vtophys(virt_addr, NULL);
2073 			if (phys_addr == SPDK_VTOPHYS_ERROR) {
2074 				goto exit;
2075 			}
2076 
2077 			length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
2078 			remaining_user_sge_len -= length;
2079 			virt_addr += length;
2080 
2081 			if (nseg > 0 && phys_addr ==
2082 			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
2083 				/* extend previous entry */
2084 				(*(sgl - 1)).unkeyed.length += length;
2085 				continue;
2086 			}
2087 
2088 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
2089 			sgl->unkeyed.length = length;
2090 			sgl->address = phys_addr;
2091 			sgl->unkeyed.subtype = 0;
2092 
2093 			sgl++;
2094 			nseg++;
2095 		}
2096 	}
2097 
2098 	if (nseg == 1) {
2099 		/*
2100 		 * The whole transfer can be described by a single SGL descriptor.
2101 		 *  Use the special case described by the spec where SGL1's type is Data Block.
2102 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
2103 		 *  SGL element into SGL1.
2104 		 */
2105 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
2106 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
2107 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
2108 	} else {
2109 		/* The SPDK NVMe driver currently builds only a single SGL segment; this is
2110 		 *  sufficient because NVME_MAX_SGL_DESCRIPTORS * 16 bytes is less than one page.
2111 		 */
2112 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
2113 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
2114 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
2115 	}
2116 
2117 	return 0;
2118 
2119 exit:
2120 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2121 	return -EFAULT;
2122 }
2123 
2124 /**
2125  * Build PRP list describing scattered payload buffer.
2126  */
2127 static int
2128 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
2129 				       struct nvme_tracker *tr, bool dword_aligned)
2130 {
2131 	int rc;
2132 	void *virt_addr;
2133 	uint32_t remaining_transfer_len, length;
2134 	uint32_t prp_index = 0;
2135 	uint32_t page_size = qpair->ctrlr->page_size;
2136 
2137 	/*
2138 	 * Build scattered payloads.
2139 	 */
2140 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
2141 	assert(req->payload.reset_sgl_fn != NULL);
2142 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
2143 
2144 	remaining_transfer_len = req->payload_size;
2145 	while (remaining_transfer_len > 0) {
2146 		assert(req->payload.next_sge_fn != NULL);
2147 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
2148 		if (rc) {
2149 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2150 			return -EFAULT;
2151 		}
2152 
2153 		length = spdk_min(remaining_transfer_len, length);
2154 
2155 		/*
2156 		 * Any incompatible SGEs should have been handled higher up in the splitting routine,
2157 		 *  but assert here as an additional check.
2158 		 *
2159 		 * All SGEs except last must end on a page boundary.
2160 		 */
2161 		assert((length == remaining_transfer_len) ||
2162 		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));
2163 
2164 		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
2165 		if (rc) {
2166 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2167 			return rc;
2168 		}
2169 
2170 		remaining_transfer_len -= length;
2171 	}
2172 
2173 	return 0;
2174 }
2175 
2176 typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
2177 			   bool);
2178 
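/* Request builder lookup table, indexed by [payload type][0 = PRP, 1 = hardware SGL]. */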
2179 static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
2180 	[NVME_PAYLOAD_TYPE_INVALID] = {
2181 		nvme_pcie_qpair_build_request_invalid,			/* PRP */
2182 		nvme_pcie_qpair_build_request_invalid			/* SGL */
2183 	},
2184 	[NVME_PAYLOAD_TYPE_CONTIG] = {
2185 		nvme_pcie_qpair_build_contig_request,			/* PRP */
2186 		nvme_pcie_qpair_build_contig_hw_sgl_request		/* SGL */
2187 	},
2188 	[NVME_PAYLOAD_TYPE_SGL] = {
2189 		nvme_pcie_qpair_build_prps_sgl_request,			/* PRP */
2190 		nvme_pcie_qpair_build_hw_sgl_request			/* SGL */
2191 	}
2192 };
2193 
2194 static int
2195 nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
2196 			       bool sgl_supported, bool dword_aligned)
2197 {
2198 	void *md_payload;
2199 	struct nvme_request *req = tr->req;
2200 
2201 	if (req->payload.md) {
2202 		md_payload = req->payload.md + req->md_offset;
2203 		if (dword_aligned && ((uintptr_t)md_payload & 3)) {
2204 			SPDK_ERRLOG("md_payload %p not dword aligned\n", md_payload);
2205 			goto exit;
2206 		}
2207 
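		/* When the qpair can use SGLs, describe the metadata with the tracker's
		 *  meta_sgl descriptor; MPTR is set to the bus address just before the
		 *  PRP/SGL list, where that descriptor resides.
		 */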
2208 		if (sgl_supported && dword_aligned) {
2209 			assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
2210 			req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
2211 			tr->meta_sgl.address = spdk_vtophys(md_payload, NULL);
2212 			if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) {
2213 				goto exit;
2214 			}
2215 			tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
2216 			tr->meta_sgl.unkeyed.length = req->md_size;
2217 			tr->meta_sgl.unkeyed.subtype = 0;
2218 			req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
2219 		} else {
2220 			req->cmd.mptr = spdk_vtophys(md_payload, NULL);
2221 			if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
2222 				goto exit;
2223 			}
2224 		}
2225 	}
2226 
2227 	return 0;
2228 
2229 exit:
2230 	nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2231 	return -EINVAL;
2232 }
2233 
2234 static int
2235 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
2236 {
2237 	struct nvme_tracker	*tr;
2238 	int			rc = 0;
2239 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
2240 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
2241 	enum nvme_payload_type	payload_type;
2242 	bool			sgl_supported;
2243 	bool			dword_aligned = true;
2244 
2245 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2246 		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
2247 	}
2248 
2249 	tr = TAILQ_FIRST(&pqpair->free_tr);
2250 
2251 	if (tr == NULL) {
2252 		/* Inform the upper layer to try again later. */
2253 		rc = -EAGAIN;
2254 		goto exit;
2255 	}
2256 
2257 	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
2258 	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
2259 	tr->req = req;
2260 	tr->cb_fn = req->cb_fn;
2261 	tr->cb_arg = req->cb_arg;
2262 	req->cmd.cid = tr->cid;
2263 
2264 	if (req->payload_size != 0) {
2265 		payload_type = nvme_payload_type(&req->payload);
2266 		/* According to the specification, PRPs shall be used for all
2267 		 *  Admin commands for NVMe over PCIe implementations.
2268 		 */
2269 		sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
2270 				!nvme_qpair_is_admin_queue(qpair);
2271 
2272 		if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
2273 			dword_aligned = false;
2274 		}
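		/* Select the request builder based on the payload type and whether this
		 *  qpair may use hardware SGLs.
		 */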
2275 		rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
2276 		if (rc < 0) {
2277 			goto exit;
2278 		}
2279 
2280 		rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned);
2281 		if (rc < 0) {
2282 			goto exit;
2283 		}
2284 	}
2285 
2286 	nvme_pcie_qpair_submit_tracker(qpair, tr);
2287 
2288 exit:
2289 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2290 		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
2291 	}
2292 
2293 	return rc;
2294 }
2295 
2296 static void
2297 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
2298 {
2299 	uint64_t t02;
2300 	struct nvme_tracker *tr, *tmp;
2301 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
2302 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
2303 	struct spdk_nvme_ctrlr_process *active_proc;
2304 
2305 	/* Don't check timeouts during controller initialization. */
2306 	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
2307 		return;
2308 	}
2309 
2310 	if (nvme_qpair_is_admin_queue(qpair)) {
2311 		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
2312 	} else {
2313 		active_proc = qpair->active_proc;
2314 	}
2315 
2316 	/* Only check timeouts if the current process has a timeout callback. */
2317 	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
2318 		return;
2319 	}
2320 
2321 	t02 = spdk_get_ticks();
2322 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
2323 		assert(tr->req != NULL);
2324 
2325 		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
2326 			/*
2327 			 * The requests are in order, so as soon as one has not timed out,
2328 			 * stop iterating.
2329 			 */
2330 			break;
2331 		}
2332 	}
2333 }
2334 
2335 static int32_t
2336 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
2337 {
2338 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
2339 	struct nvme_tracker	*tr;
2340 	struct spdk_nvme_cpl	*cpl, *next_cpl;
2341 	uint32_t		 num_completions = 0;
2342 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
2343 	uint16_t		 next_cq_head;
2344 	uint8_t			 next_phase;
2345 	bool			 next_is_valid = false;
2346 
2347 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2348 		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
2349 	}
2350 
2351 	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
2352 		/*
2353 		 * max_completions == 0 means unlimited, but complete at most
2354 		 * max_completions_cap entries per call so that the completion
2355 		 * queue doorbell does not wrap around.
2356 		 */
2357 		max_completions = pqpair->max_completions_cap;
2358 	}
2359 
2360 	while (1) {
2361 		cpl = &pqpair->cpl[pqpair->cq_head];
2362 
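		/* An entry is valid only if its phase bit matches the expected phase for
		 *  this pass through the queue. Skip the check if the previous iteration
		 *  already peeked at this entry.
		 */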
2363 		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
2364 			break;
2365 		}
2366 
2367 		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
2368 			next_cq_head = pqpair->cq_head + 1;
2369 			next_phase = pqpair->flags.phase;
2370 		} else {
2371 			next_cq_head = 0;
2372 			next_phase = !pqpair->flags.phase;
2373 		}
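		/* Peek at the next entry and prefetch its tracker if the controller has
		 *  already posted it.
		 */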
2374 		next_cpl = &pqpair->cpl[next_cq_head];
2375 		next_is_valid = (next_cpl->status.p == next_phase);
2376 		if (next_is_valid) {
2377 			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
2378 		}
2379 
2380 #ifdef __PPC64__
2381 		/*
2382 		 * This memory barrier prevents reordering of:
2383 		 * - load after store from/to tr
2384 		 * - load after load cpl phase and cpl cid
2385 		 */
2386 		spdk_mb();
2387 #elif defined(__aarch64__)
2388 		__asm volatile("dmb oshld" ::: "memory");
2389 #endif
2390 
2391 		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
2392 			pqpair->cq_head = 0;
2393 			pqpair->flags.phase = !pqpair->flags.phase;
2394 		}
2395 
2396 		tr = &pqpair->tr[cpl->cid];
2397 		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
2398 		 * as part of putting the req back on the qpair's free list.
2399 		 */
2400 		__builtin_prefetch(&tr->req->stailq);
2401 		pqpair->sq_head = cpl->sqhd;
2402 
2403 		if (tr->req) {
2404 			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
2405 		} else {
2406 			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
2407 			spdk_nvme_qpair_print_completion(qpair, cpl);
2408 			assert(0);
2409 		}
2410 
2411 		if (++num_completions == max_completions) {
2412 			break;
2413 		}
2414 	}
2415 
2416 	if (num_completions > 0) {
2417 		nvme_pcie_qpair_ring_cq_doorbell(qpair);
2418 	}
2419 
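	/* With delayed command submission enabled, ring the submission queue doorbell
	 *  here if new commands were queued since the last ring.
	 */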
2420 	if (pqpair->flags.delay_cmd_submit) {
2421 		if (pqpair->last_sq_tail != pqpair->sq_tail) {
2422 			nvme_pcie_qpair_ring_sq_doorbell(qpair);
2423 			pqpair->last_sq_tail = pqpair->sq_tail;
2424 		}
2425 	}
2426 
2427 	if (spdk_unlikely(ctrlr->timeout_enabled)) {
2428 		/*
2429 		 * The user registered a timeout callback, so check for timed-out requests.
2430 		 */
2431 		nvme_pcie_qpair_check_timeout(qpair);
2432 	}
2433 
2434 	/* Before returning, complete any pending admin request. */
2435 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2436 		nvme_pcie_qpair_complete_pending_admin_request(qpair);
2437 
2438 		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
2439 	}
2440 
2441 	return num_completions;
2442 }
2443 
2444 static struct spdk_nvme_transport_poll_group *
2445 nvme_pcie_poll_group_create(void)
2446 {
2447 	struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));
2448 
2449 	if (group == NULL) {
2450 		SPDK_ERRLOG("Unable to allocate poll group.\n");
2451 		return NULL;
2452 	}
2453 
2454 	return &group->group;
2455 }
2456 
2457 static int
2458 nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
2459 {
2460 	return 0;
2461 }
2462 
2463 static int
2464 nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
2465 {
2466 	return 0;
2467 }
2468 
2469 static int
2470 nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
2471 			 struct spdk_nvme_qpair *qpair)
2472 {
2473 	return 0;
2474 }
2475 
2476 static int
2477 nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
2478 			    struct spdk_nvme_qpair *qpair)
2479 {
2480 	return 0;
2481 }
2482 
2483 static int64_t
2484 nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
2485 		uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
2486 {
2487 	struct spdk_nvme_qpair *qpair, *tmp_qpair;
2488 	int32_t local_completions = 0;
2489 	int64_t total_completions = 0;
2490 
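	/* Report already-disconnected qpairs to the caller, then poll each connected
	 *  qpair; a negative return value means the qpair has failed and is reported
	 *  through the same callback.
	 */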
2491 	STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
2492 		disconnected_qpair_cb(qpair, tgroup->group->ctx);
2493 	}
2494 
2495 	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
2496 		local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
2497 		if (local_completions < 0) {
2498 			disconnected_qpair_cb(qpair, tgroup->group->ctx);
2499 			local_completions = 0;
2500 		}
2501 		total_completions += local_completions;
2502 	}
2503 
2504 	return total_completions;
2505 }
2506 
2507 static int
2508 nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
2509 {
2510 	if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
2511 		return -EBUSY;
2512 	}
2513 
2514 	free(tgroup);
2515 
2516 	return 0;
2517 }
2518 
2519 const struct spdk_nvme_transport_ops pcie_ops = {
2520 	.name = "PCIE",
2521 	.type = SPDK_NVME_TRANSPORT_PCIE,
2522 	.ctrlr_construct = nvme_pcie_ctrlr_construct,
2523 	.ctrlr_scan = nvme_pcie_ctrlr_scan,
2524 	.ctrlr_destruct = nvme_pcie_ctrlr_destruct,
2525 	.ctrlr_enable = nvme_pcie_ctrlr_enable,
2526 
2527 	.ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4,
2528 	.ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8,
2529 	.ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4,
2530 	.ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8,
2531 
2532 	.ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size,
2533 	.ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges,
2534 
2535 	.ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb,
2536 	.ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb,
2537 	.ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb,
2538 
2539 	.ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair,
2540 	.ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair,
2541 	.ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair,
2542 	.ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair,
2543 
2544 	.qpair_abort_reqs = nvme_pcie_qpair_abort_reqs,
2545 	.qpair_reset = nvme_pcie_qpair_reset,
2546 	.qpair_submit_request = nvme_pcie_qpair_submit_request,
2547 	.qpair_process_completions = nvme_pcie_qpair_process_completions,
2548 	.admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers,
2549 
2550 	.poll_group_create = nvme_pcie_poll_group_create,
2551 	.poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair,
2552 	.poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair,
2553 	.poll_group_add = nvme_pcie_poll_group_add,
2554 	.poll_group_remove = nvme_pcie_poll_group_remove,
2555 	.poll_group_process_completions = nvme_pcie_poll_group_process_completions,
2556 	.poll_group_destroy = nvme_pcie_poll_group_destroy,
2557 };
2558 
2559 SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops);
2560