xref: /spdk/lib/nvme/nvme_pcie.c (revision fa2d95b3fe66e7f5c543eaef89fa00d4eaa0e6e7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   Copyright (c) 2017, IBM Corporation.
6  *   All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * NVMe over PCIe transport
37  */
38 
39 #include "spdk/stdinc.h"
40 #include "spdk/env.h"
41 #include "spdk/likely.h"
42 #include "nvme_internal.h"
43 #include "nvme_uevent.h"
44 
45 /*
46  * Number of completion queue entries to process before ringing the
47  *  completion queue doorbell.
48  */
49 #define NVME_MIN_COMPLETIONS	(1)
50 #define NVME_MAX_COMPLETIONS	(128)
51 
52 #define NVME_ADMIN_ENTRIES	(128)
53 
54 /*
55  * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
56  *  segment.
57  */
58 #define NVME_MAX_SGL_DESCRIPTORS	(253)
59 
60 #define NVME_MAX_PRP_LIST_ENTRIES	(506)
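/*
 * Note: 253 SGL descriptors (16 bytes each) and 506 PRP entries (8 bytes each) both
 * occupy 4048 bytes, so either member of the union in struct nvme_tracker below keeps
 * the tracker at exactly 4 KiB (see the static asserts following the structure).
 */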
61 
62 struct nvme_pcie_enum_ctx {
63 	struct spdk_nvme_probe_ctx *probe_ctx;
64 	struct spdk_pci_addr pci_addr;
65 	bool has_pci_addr;
66 };
67 
68 /* PCIe transport extensions for spdk_nvme_ctrlr */
69 struct nvme_pcie_ctrlr {
70 	struct spdk_nvme_ctrlr ctrlr;
71 
72 	/** NVMe MMIO register space */
73 	volatile struct spdk_nvme_registers *regs;
74 
75 	/** NVMe MMIO register size */
76 	uint64_t regs_size;
77 
78 	/* BAR mapping address which contains controller memory buffer */
79 	void *cmb_bar_virt_addr;
80 
81 	/* BAR physical address which contains controller memory buffer */
82 	uint64_t cmb_bar_phys_addr;
83 
84 	/* Controller memory buffer size in Bytes */
85 	uint64_t cmb_size;
86 
87 	/* Current offset of controller memory buffer, relative to start of BAR virt addr */
88 	uint64_t cmb_current_offset;
89 
90 	/* Last valid offset into the CMB; this differs depending on whether CMB memory registration occurred */
91 	uint64_t cmb_max_offset;
92 
93 	void *cmb_mem_register_addr;
94 	size_t cmb_mem_register_size;
95 
96 	bool cmb_io_data_supported;
97 
98 	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
99 	uint32_t doorbell_stride_u32;
100 
101 	/* Opaque handle to associated PCI device. */
102 	struct spdk_pci_device *devhandle;
103 
104 	/* File descriptor returned from spdk_pci_device_claim().  Closed when ctrlr is detached. */
105 	int claim_fd;
106 
107 	/* Flag to indicate the MMIO register has been remapped */
108 	bool is_remapped;
109 };
110 
111 struct nvme_tracker {
112 	TAILQ_ENTRY(nvme_tracker)       tq_list;
113 
114 	struct nvme_request		*req;
115 	uint16_t			cid;
116 
117 	uint16_t			rsvd1: 15;
118 	uint16_t			active: 1;
119 
120 	uint32_t			rsvd2;
121 
122 	uint64_t			rsvd3;
123 
124 	uint64_t			prp_sgl_bus_addr;
125 
126 	union {
127 		uint64_t			prp[NVME_MAX_PRP_LIST_ENTRIES];
128 		struct spdk_nvme_sgl_descriptor	sgl[NVME_MAX_SGL_DESCRIPTORS];
129 	} u;
130 };
131 /*
132  * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
133  * and so that there is no padding required to meet alignment requirements.
134  */
135 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
136 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
137 
138 /* PCIe transport extensions for spdk_nvme_qpair */
139 struct nvme_pcie_qpair {
140 	/* Submission queue tail doorbell */
141 	volatile uint32_t *sq_tdbl;
142 
143 	/* Completion queue head doorbell */
144 	volatile uint32_t *cq_hdbl;
145 
146 	/* Submission queue */
147 	struct spdk_nvme_cmd *cmd;
148 
149 	/* Completion queue */
150 	struct spdk_nvme_cpl *cpl;
151 
152 	TAILQ_HEAD(, nvme_tracker) free_tr;
153 	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
154 
155 	/* Array of trackers indexed by command ID. */
156 	struct nvme_tracker *tr;
157 
158 	uint16_t num_entries;
159 
160 	uint16_t max_completions_cap;
161 
162 	uint16_t last_sq_tail;
163 	uint16_t sq_tail;
164 	uint16_t cq_head;
165 	uint16_t sq_head;
166 
167 	struct {
168 		uint8_t phase			: 1;
169 		uint8_t is_enabled		: 1;
170 		uint8_t delay_pcie_doorbell	: 1;
171 		uint8_t has_shadow_doorbell	: 1;
172 	} flags;
173 
174 	/*
175 	 * Base qpair structure.
176 	 * This is located after the hot data in this structure so that the important parts of
177 	 * nvme_pcie_qpair are in the same cache line.
178 	 */
179 	struct spdk_nvme_qpair qpair;
180 
181 	struct {
182 		/* Submission queue shadow tail doorbell */
183 		volatile uint32_t *sq_tdbl;
184 
185 		/* Completion queue shadow head doorbell */
186 		volatile uint32_t *cq_hdbl;
187 
188 		/* Submission queue event index */
189 		volatile uint32_t *sq_eventidx;
190 
191 		/* Completion queue event index */
192 		volatile uint32_t *cq_eventidx;
193 	} shadow_doorbell;
194 
195 	/*
196 	 * Fields below this point should not be touched on the normal I/O path.
197 	 */
198 
199 	bool sq_in_cmb;
200 
201 	uint64_t cmd_bus_addr;
202 	uint64_t cpl_bus_addr;
203 };
204 
205 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
206 				  struct spdk_pci_addr *pci_addr);
207 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
208 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
209 
210 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
211 static volatile uint16_t g_signal_lock;
212 static bool g_sigset = false;
213 static int hotplug_fd = -1;
214 
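/*
 * SIGBUS handler for surprise hot remove: once the device is gone, MMIO access to its
 * register BAR faults.  The handler below remaps the register region onto anonymous
 * memory filled with 0xFF, so subsequent register reads simply return all ones, which
 * the rest of the driver treats as "controller removed".
 */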
215 static void
216 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
217 {
218 	void *map_address;
219 
220 	if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
221 		return;
222 	}
223 
224 	assert(g_thread_mmio_ctrlr != NULL);
225 
226 	if (!g_thread_mmio_ctrlr->is_remapped) {
227 		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
228 				   PROT_READ | PROT_WRITE,
229 				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
230 		if (map_address == MAP_FAILED) {
231 			SPDK_ERRLOG("mmap failed\n");
232 			g_signal_lock = 0;
233 			return;
234 		}
235 		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
236 		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
237 		g_thread_mmio_ctrlr->is_remapped = true;
238 	}
239 	g_signal_lock = 0;
240 	return;
241 }
242 
243 static void
244 nvme_pcie_ctrlr_setup_signal(void)
245 {
246 	struct sigaction sa;
247 
248 	sa.sa_sigaction = nvme_sigbus_fault_sighandler;
249 	sigemptyset(&sa.sa_mask);
250 	sa.sa_flags = SA_SIGINFO;
251 	sigaction(SIGBUS, &sa, NULL);
252 }
253 
254 static int
255 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
256 {
257 	struct spdk_nvme_ctrlr *ctrlr, *tmp;
258 	struct spdk_uevent event;
259 	struct spdk_pci_addr pci_addr;
260 	union spdk_nvme_csts_register csts;
261 	struct spdk_nvme_ctrlr_process *proc;
262 
263 	while (spdk_get_uevent(hotplug_fd, &event) > 0) {
264 		if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
265 		    event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
266 			if (event.action == SPDK_NVME_UEVENT_ADD) {
267 				SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
268 					      event.traddr);
269 				if (spdk_process_is_primary()) {
270 					if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
271 						nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
272 					}
273 				}
274 			} else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
275 				struct spdk_nvme_transport_id trid;
276 
277 				memset(&trid, 0, sizeof(trid));
278 				trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
279 				snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
280 
281 				ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
282 				if (ctrlr == NULL) {
283 					return 0;
284 				}
285 				SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
286 					      event.traddr);
287 
288 				nvme_ctrlr_fail(ctrlr, true);
289 
290 				/* get the user app to clean up and stop I/O */
291 				if (probe_ctx->remove_cb) {
292 					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
293 					probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
294 					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
295 				}
296 			}
297 		}
298 	}
299 
300 	/* This is a workaround for hot-remove detection of vfio-attached devices. */
301 	TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
302 		/* NVMe controller BAR must be mapped to secondary process space before any access. */
303 		proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
304 		if (proc) {
305 			csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
306 			if (csts.raw == 0xffffffffU) {
307 				nvme_ctrlr_fail(ctrlr, true);
308 				if (probe_ctx->remove_cb) {
309 					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
310 					probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
311 					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
312 				}
313 			}
314 		}
315 	}
316 	return 0;
317 }
318 
319 static inline struct nvme_pcie_ctrlr *
320 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
321 {
322 	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
323 	return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
324 }
325 
326 static inline struct nvme_pcie_qpair *
327 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
328 {
329 	assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
330 	return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
331 }
332 
333 static volatile void *
334 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
335 {
336 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
337 
338 	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
339 }
340 
341 int
342 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
343 {
344 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
345 
346 	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
347 	g_thread_mmio_ctrlr = pctrlr;
348 	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
349 	g_thread_mmio_ctrlr = NULL;
350 	return 0;
351 }
352 
353 int
354 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
355 {
356 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
357 
358 	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
359 	g_thread_mmio_ctrlr = pctrlr;
360 	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
361 	g_thread_mmio_ctrlr = NULL;
362 	return 0;
363 }
364 
365 int
366 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
367 {
368 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
369 
370 	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
371 	assert(value != NULL);
372 	g_thread_mmio_ctrlr = pctrlr;
373 	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
374 	g_thread_mmio_ctrlr = NULL;
375 	if (~(*value) == 0) {
376 		return -1;
377 	}
378 
379 	return 0;
380 }
381 
382 int
383 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
384 {
385 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
386 
387 	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
388 	assert(value != NULL);
389 	g_thread_mmio_ctrlr = pctrlr;
390 	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
391 	g_thread_mmio_ctrlr = NULL;
392 	if (~(*value) == 0) {
393 		return -1;
394 	}
395 
396 	return 0;
397 }
398 
399 static int
400 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
401 {
402 	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
403 					 value);
404 }
405 
406 static int
407 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
408 {
409 	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
410 					 value);
411 }
412 
413 static int
414 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
415 {
416 	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
417 					 aqa->raw);
418 }
419 
420 static int
421 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
422 {
423 	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
424 					 &cmbloc->raw);
425 }
426 
427 static int
428 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
429 {
430 	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
431 					 &cmbsz->raw);
432 }
433 
434 uint32_t
435 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
436 {
437 	/*
438 	 * For commands requiring more than 2 PRP entries, one PRP will be
439 	 *  embedded in the command (prp1), and the rest of the PRP entries
440 	 *  will be in a list pointed to by the command (prp2).  This means
441 	 *  that the real maximum number of PRP entries we support is 506+1, which
442 	 *  results in a max xfer size of 506*ctrlr->page_size.
443 	 */
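	/* For example, with a 4 KiB page size this works out to 506 * 4096 bytes, i.e. just under 2 MiB per command. */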
444 	return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
445 }
446 
447 uint16_t
448 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
449 {
450 	return NVME_MAX_SGL_DESCRIPTORS;
451 }
452 
453 static void
454 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
455 {
456 	int rc;
457 	void *addr;
458 	uint32_t bir;
459 	union spdk_nvme_cmbsz_register cmbsz;
460 	union spdk_nvme_cmbloc_register cmbloc;
461 	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
462 	uint64_t mem_register_start, mem_register_end;
463 
464 	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
465 	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
466 		SPDK_ERRLOG("get registers failed\n");
467 		goto exit;
468 	}
469 
470 	if (!cmbsz.bits.sz) {
471 		goto exit;
472 	}
473 
474 	bir = cmbloc.bits.bir;
475 	/* Values 0, 2, 3, 4 and 5 are valid for the BAR indicator (BIR) */
476 	if (bir > 5 || bir == 1) {
477 		goto exit;
478 	}
479 
480 	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
481 	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
482 	/* controller memory buffer size in Bytes */
483 	size = unit_size * cmbsz.bits.sz;
484 	/* controller memory buffer offset from BAR in Bytes */
485 	offset = unit_size * cmbloc.bits.ofst;
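	/* Example: szu = 2 (1 MiB units), sz = 16 and ofst = 0 would describe a 16 MiB CMB at the start of the BAR. */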
486 
487 	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
488 				     &bar_phys_addr, &bar_size);
489 	if ((rc != 0) || addr == NULL) {
490 		goto exit;
491 	}
492 
493 	if (offset > bar_size) {
494 		goto exit;
495 	}
496 
497 	if (size > bar_size - offset) {
498 		goto exit;
499 	}
500 
501 	pctrlr->cmb_bar_virt_addr = addr;
502 	pctrlr->cmb_bar_phys_addr = bar_phys_addr;
503 	pctrlr->cmb_size = size;
504 	pctrlr->cmb_current_offset = offset;
505 	pctrlr->cmb_max_offset = offset + size;
506 
507 	if (!cmbsz.bits.sqs) {
508 		pctrlr->ctrlr.opts.use_cmb_sqs = false;
509 	}
510 
511 	/* If only SQS is supported, use the legacy mapping */
512 	if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
513 		return;
514 	}
515 
516 	/* If the CMB is less than 4 MiB in size, abort CMB mapping */
517 	if (pctrlr->cmb_size < (1ULL << 22)) {
518 		goto exit;
519 	}
520 
521 	mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1);
522 	mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
523 	pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
524 	pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;
525 
526 	rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
527 	if (rc) {
528 		SPDK_ERRLOG("spdk_mem_register() failed\n");
529 		goto exit;
530 	}
531 	pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
532 	pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
533 	pctrlr->cmb_io_data_supported = true;
534 
535 	return;
536 exit:
537 	pctrlr->cmb_bar_virt_addr = NULL;
538 	pctrlr->ctrlr.opts.use_cmb_sqs = false;
539 	return;
540 }
541 
542 static int
543 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
544 {
545 	int rc = 0;
546 	union spdk_nvme_cmbloc_register cmbloc;
547 	void *addr = pctrlr->cmb_bar_virt_addr;
548 
549 	if (addr) {
550 		if (pctrlr->cmb_mem_register_addr) {
551 			spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
552 		}
553 
554 		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
555 			SPDK_ERRLOG("get_cmbloc() failed\n");
556 			return -EIO;
557 		}
558 		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
559 	}
560 	return rc;
561 }
562 
563 static int
564 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
565 			  uint64_t *offset)
566 {
567 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
568 	uint64_t round_offset;
569 
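	/* Round the current CMB offset up to the requested alignment; the mask arithmetic below assumes 'aligned' is a power of two. */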
570 	round_offset = pctrlr->cmb_current_offset;
571 	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
572 
573 	/* CMB may only consume part of the BAR, calculate accordingly */
574 	if (round_offset + length > pctrlr->cmb_max_offset) {
575 		SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
576 		return -1;
577 	}
578 
579 	*offset = round_offset;
580 	pctrlr->cmb_current_offset = round_offset + length;
581 
582 	return 0;
583 }
584 
585 void *
586 nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
587 {
588 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
589 	uint64_t offset;
590 
591 	if (pctrlr->cmb_bar_virt_addr == NULL) {
592 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
593 		return NULL;
594 	}
595 
596 	if (!pctrlr->cmb_io_data_supported) {
597 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
598 		return NULL;
599 	}
600 
601 	if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
602 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
603 		return NULL;
604 	}
605 
606 	return pctrlr->cmb_bar_virt_addr + offset;
607 }
608 
609 int
610 nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
611 {
612 	/*
613 	 * Do nothing for now.
614 	 * TODO: Track free space so buffers may be reused.
615 	 */
616 	SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
617 		    __func__);
618 	return 0;
619 }
620 
621 static int
622 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
623 {
624 	int rc;
625 	void *addr;
626 	uint64_t phys_addr, size;
627 
628 	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
629 				     &phys_addr, &size);
630 	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
631 	if ((pctrlr->regs == NULL) || (rc != 0)) {
632 		SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
633 			    rc, pctrlr->regs);
634 		return -1;
635 	}
636 
637 	pctrlr->regs_size = size;
638 	nvme_pcie_ctrlr_map_cmb(pctrlr);
639 
640 	return 0;
641 }
642 
643 static int
644 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
645 {
646 	int rc = 0;
647 	void *addr = (void *)pctrlr->regs;
648 
649 	if (pctrlr->ctrlr.is_removed) {
650 		return rc;
651 	}
652 
653 	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
654 	if (rc != 0) {
655 		SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
656 		return -1;
657 	}
658 
659 	if (addr) {
660 		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
661 		 * munmap internally.
662 		 */
663 		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
664 	}
665 	return rc;
666 }
667 
668 static int
669 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
670 {
671 	struct nvme_pcie_qpair *pqpair;
672 	int rc;
673 
674 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
675 	if (pqpair == NULL) {
676 		return -ENOMEM;
677 	}
678 
679 	pqpair->num_entries = NVME_ADMIN_ENTRIES;
680 	pqpair->flags.delay_pcie_doorbell = 0;
681 
682 	ctrlr->adminq = &pqpair->qpair;
683 
684 	rc = nvme_qpair_init(ctrlr->adminq,
685 			     0, /* qpair ID */
686 			     ctrlr,
687 			     SPDK_NVME_QPRIO_URGENT,
688 			     NVME_ADMIN_ENTRIES);
689 	if (rc != 0) {
690 		return rc;
691 	}
692 
693 	return nvme_pcie_qpair_construct(ctrlr->adminq);
694 }
695 
696 /* This function must only be called while holding g_spdk_nvme_driver->lock */
697 static int
698 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
699 {
700 	struct spdk_nvme_transport_id trid = {};
701 	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
702 	struct spdk_nvme_ctrlr *ctrlr;
703 	struct spdk_pci_addr pci_addr;
704 
705 	pci_addr = spdk_pci_device_get_addr(pci_dev);
706 
707 	trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
708 	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
709 
710 	ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
711 	if (!spdk_process_is_primary()) {
712 		if (!ctrlr) {
713 			SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
714 			return -1;
715 		}
716 
717 		return nvme_ctrlr_add_process(ctrlr, pci_dev);
718 	}
719 
720 	/* Check whether the user passed a specific PCI address */
721 	if (enum_ctx->has_pci_addr &&
722 	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
723 		return 1;
724 	}
725 
726 	return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
727 }
728 
729 int
730 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
731 		     bool direct_connect)
732 {
733 	struct nvme_pcie_enum_ctx enum_ctx = {};
734 
735 	enum_ctx.probe_ctx = probe_ctx;
736 
737 	if (strlen(probe_ctx->trid.traddr) != 0) {
738 		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
739 			return -1;
740 		}
741 		enum_ctx.has_pci_addr = true;
742 	}
743 
744 	if (hotplug_fd < 0) {
745 		hotplug_fd = spdk_uevent_connect();
746 		if (hotplug_fd < 0) {
747 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
748 		}
749 	} else {
750 		_nvme_pcie_hotplug_monitor(probe_ctx);
751 	}
752 
753 	if (enum_ctx.has_pci_addr == false) {
754 		return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
755 					  pcie_nvme_enum_cb, &enum_ctx);
756 	} else {
757 		return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
758 					      pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
759 	}
760 }
761 
762 static int
763 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
764 {
765 	struct nvme_pcie_enum_ctx enum_ctx;
766 
767 	enum_ctx.probe_ctx = probe_ctx;
768 	enum_ctx.has_pci_addr = true;
769 	enum_ctx.pci_addr = *pci_addr;
770 
771 	return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
772 }
773 
774 struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
775 		const struct spdk_nvme_ctrlr_opts *opts,
776 		void *devhandle)
777 {
778 	struct spdk_pci_device *pci_dev = devhandle;
779 	struct nvme_pcie_ctrlr *pctrlr;
780 	union spdk_nvme_cap_register cap;
781 	union spdk_nvme_vs_register vs;
782 	uint32_t cmd_reg;
783 	int rc, claim_fd;
784 	struct spdk_pci_id pci_id;
785 	struct spdk_pci_addr pci_addr;
786 
787 	if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
788 		SPDK_ERRLOG("could not parse pci address\n");
789 		return NULL;
790 	}
791 
792 	claim_fd = spdk_pci_device_claim(&pci_addr);
793 	if (claim_fd < 0) {
794 		SPDK_ERRLOG("could not claim device %s\n", trid->traddr);
795 		return NULL;
796 	}
797 
798 	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
799 			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
800 	if (pctrlr == NULL) {
801 		close(claim_fd);
802 		SPDK_ERRLOG("could not allocate ctrlr\n");
803 		return NULL;
804 	}
805 
806 	pctrlr->is_remapped = false;
807 	pctrlr->ctrlr.is_removed = false;
808 	pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
809 	pctrlr->devhandle = devhandle;
810 	pctrlr->ctrlr.opts = *opts;
811 	pctrlr->claim_fd = claim_fd;
812 	memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));
813 
814 	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
815 	if (rc != 0) {
816 		close(claim_fd);
817 		spdk_free(pctrlr);
818 		return NULL;
819 	}
820 
821 	/* Enable PCI busmaster and disable INTx */
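	/* 0x404 sets bit 2 (Bus Master Enable) and bit 10 (Interrupt Disable) in the PCI command register. */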
822 	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
823 	cmd_reg |= 0x404;
824 	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
825 
826 	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
827 		SPDK_ERRLOG("get_cap() failed\n");
828 		close(claim_fd);
829 		spdk_free(pctrlr);
830 		return NULL;
831 	}
832 
833 	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
834 		SPDK_ERRLOG("get_vs() failed\n");
835 		close(claim_fd);
836 		spdk_free(pctrlr);
837 		return NULL;
838 	}
839 
840 	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
841 
842 	/* Doorbell stride is 2 ^ (2 + DSTRD) bytes, but we want the stride in uint32_t
843 	 * (4-byte) units, so drop the + 2.  E.g. DSTRD = 0 yields a stride of 1. */
844 	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
845 
846 	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
847 	if (rc != 0) {
848 		nvme_ctrlr_destruct(&pctrlr->ctrlr);
849 		return NULL;
850 	}
851 
852 	pci_id = spdk_pci_device_get_id(pci_dev);
853 	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
854 
855 	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
856 	if (rc != 0) {
857 		nvme_ctrlr_destruct(&pctrlr->ctrlr);
858 		return NULL;
859 	}
860 
861 	/* Construct the primary process properties */
862 	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
863 	if (rc != 0) {
864 		nvme_ctrlr_destruct(&pctrlr->ctrlr);
865 		return NULL;
866 	}
867 
868 	if (g_sigset != true) {
869 		nvme_pcie_ctrlr_setup_signal();
870 		g_sigset = true;
871 	}
872 
873 	return &pctrlr->ctrlr;
874 }
875 
876 int
877 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
878 {
879 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
880 	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
881 	union spdk_nvme_aqa_register aqa;
882 
883 	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
884 		SPDK_ERRLOG("set_asq() failed\n");
885 		return -EIO;
886 	}
887 
888 	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
889 		SPDK_ERRLOG("set_acq() failed\n");
890 		return -EIO;
891 	}
892 
893 	aqa.raw = 0;
894 	/* acqs and asqs are 0-based. */
895 	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
896 	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
897 
898 	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
899 		SPDK_ERRLOG("set_aqa() failed\n");
900 		return -EIO;
901 	}
902 
903 	return 0;
904 }
905 
906 int
907 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
908 {
909 	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
910 	struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
911 
912 	close(pctrlr->claim_fd);
913 
914 	if (ctrlr->adminq) {
915 		nvme_pcie_qpair_destroy(ctrlr->adminq);
916 	}
917 
918 	nvme_ctrlr_destruct_finish(ctrlr);
919 
920 	nvme_ctrlr_free_processes(ctrlr);
921 
922 	nvme_pcie_ctrlr_free_bars(pctrlr);
923 
924 	if (devhandle) {
925 		spdk_pci_device_detach(devhandle);
926 	}
927 
928 	spdk_free(pctrlr);
929 
930 	return 0;
931 }
932 
933 static void
934 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
935 {
936 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
937 	tr->cid = cid;
938 	tr->active = false;
939 }
940 
941 int
942 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
943 {
944 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
945 
946 	pqpair->last_sq_tail = pqpair->sq_tail = pqpair->cq_head = 0;
947 
948 	/*
949 	 * The first time through the completion queue, HW will set the phase
950 	 *  bit on completions to 1.  So set this to 1 here, indicating
951 	 *  we're looking for a 1 to know which entries have completed.
952 	 *  We'll toggle the expected bit each time the completion queue
953 	 *  rolls over.
954 	 */
955 	pqpair->flags.phase = 1;
956 
957 	memset(pqpair->cmd, 0,
958 	       pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
959 	memset(pqpair->cpl, 0,
960 	       pqpair->num_entries * sizeof(struct spdk_nvme_cpl));
961 
962 	return 0;
963 }
964 
965 static int
966 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
967 {
968 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
969 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
970 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
971 	struct nvme_tracker	*tr;
972 	uint16_t		i;
973 	volatile uint32_t	*doorbell_base;
974 	uint64_t		offset;
975 	uint16_t		num_trackers;
976 	size_t			page_align = VALUE_2MB;
977 	uint32_t                flags = SPDK_MALLOC_DMA;
978 
979 	/*
980 	 * Limit the maximum number of completions to return per call to prevent wraparound,
981 	 * and calculate how many trackers can be submitted at once without overflowing the
982 	 * completion queue.
983 	 */
984 	pqpair->max_completions_cap = pqpair->num_entries / 4;
985 	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
986 	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
987 	num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
988 
989 	SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
990 		     pqpair->max_completions_cap, num_trackers);
991 
992 	assert(num_trackers != 0);
993 
994 	pqpair->sq_in_cmb = false;
995 
996 	if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
997 		flags |= SPDK_MALLOC_SHARE;
998 	}
999 
1000 	/* cmd and cpl rings must be aligned on page size boundaries. */
1001 	if (ctrlr->opts.use_cmb_sqs) {
1002 		if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1003 					      sysconf(_SC_PAGESIZE), &offset) == 0) {
1004 			pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
1005 			pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
1006 			pqpair->sq_in_cmb = true;
1007 		}
1008 	}
1009 
1010 	/* To ensure physical address contiguity we make each ring occupy
1011 	 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
1012 	 */
1013 	if (pqpair->sq_in_cmb == false) {
1014 		pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1015 					   page_align, NULL,
1016 					   SPDK_ENV_SOCKET_ID_ANY, flags);
1017 		if (pqpair->cmd == NULL) {
1018 			SPDK_ERRLOG("alloc qpair_cmd failed\n");
1019 			return -ENOMEM;
1020 		}
1021 
1022 		pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
1023 		if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
1024 			SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
1025 			return -EFAULT;
1026 		}
1027 	}
1028 
1029 	pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
1030 				   page_align, NULL,
1031 				   SPDK_ENV_SOCKET_ID_ANY, flags);
1032 	if (pqpair->cpl == NULL) {
1033 		SPDK_ERRLOG("alloc qpair_cpl failed\n");
1034 		return -ENOMEM;
1035 	}
1036 
1037 	pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
1038 	if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
1039 		SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
1040 		return -EFAULT;
1041 	}
1042 
1043 	doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
1044 	pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
1045 	pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
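	/* For example, the admin queue (id 0) uses doorbell_base[0] and doorbell_base[stride] for its
	 * SQ tail and CQ head; I/O queue N uses entries 2N and 2N + 1, each scaled by doorbell_stride_u32. */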
1046 
1047 	/*
1048 	 * Reserve space for all of the trackers in a single allocation.
1049 	 *   struct nvme_tracker must be padded so that its size is already a power of 2.
1050 	 *   This ensures the PRP list embedded in the nvme_tracker object will not span a
1051 	 *   4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
1052 	 */
1053 	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
1054 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1055 	if (pqpair->tr == NULL) {
1056 		SPDK_ERRLOG("nvme_tr failed\n");
1057 		return -ENOMEM;
1058 	}
1059 
1060 	TAILQ_INIT(&pqpair->free_tr);
1061 	TAILQ_INIT(&pqpair->outstanding_tr);
1062 
1063 	for (i = 0; i < num_trackers; i++) {
1064 		tr = &pqpair->tr[i];
1065 		nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
1066 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1067 	}
1068 
1069 	nvme_pcie_qpair_reset(qpair);
1070 
1071 	return 0;
1072 }
1073 
1074 static inline void
1075 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
1076 {
1077 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
1078 #if defined(__AVX__)
1079 	__m256i *d256 = (__m256i *)dst;
1080 	const __m256i *s256 = (const __m256i *)src;
1081 
1082 	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
1083 	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
1084 #elif defined(__SSE2__)
1085 	__m128i *d128 = (__m128i *)dst;
1086 	const __m128i *s128 = (const __m128i *)src;
1087 
1088 	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
1089 	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
1090 	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
1091 	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
1092 #else
1093 	*dst = *src;
1094 #endif
1095 }
1096 
1097 /**
1098  * Note: the ctrlr_lock must be held when calling this function.
1099  */
1100 static void
1101 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
1102 		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
1103 {
1104 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
1105 	struct nvme_request		*active_req = req;
1106 	struct spdk_nvme_ctrlr_process	*active_proc;
1107 
1108 	/*
1109 	 * The admin request is from another process. Move to the per
1110 	 *  process list for that process to handle it later.
1111 	 */
1112 	assert(nvme_qpair_is_admin_queue(qpair));
1113 	assert(active_req->pid != getpid());
1114 
1115 	active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
1116 	if (active_proc) {
1117 		/* Save the original completion information */
1118 		memcpy(&active_req->cpl, cpl, sizeof(*cpl));
1119 		STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
1120 	} else {
1121 		SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
1122 			    active_req->pid);
1123 
1124 		nvme_free_request(active_req);
1125 	}
1126 }
1127 
1128 /**
1129  * Note: the ctrlr_lock must be held when calling this function.
1130  */
1131 static void
1132 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
1133 {
1134 	struct spdk_nvme_ctrlr		*ctrlr = qpair->ctrlr;
1135 	struct nvme_request		*req, *tmp_req;
1136 	pid_t				pid = getpid();
1137 	struct spdk_nvme_ctrlr_process	*proc;
1138 
1139 	/*
1140 	 * Check whether there is any pending admin request from
1141 	 * other active processes.
1142 	 */
1143 	assert(nvme_qpair_is_admin_queue(qpair));
1144 
1145 	proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
1146 	if (!proc) {
1147 		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
1148 		assert(proc);
1149 		return;
1150 	}
1151 
1152 	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
1153 		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
1154 
1155 		assert(req->pid == pid);
1156 
1157 		nvme_complete_request(req, &req->cpl);
1158 		nvme_free_request(req);
1159 	}
1160 }
1161 
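/*
 * Shadow doorbell helper: using modulo-2^16 arithmetic, decide whether the doorbell value
 * moved from 'old' past the controller's event index when advancing to 'new_idx'; if so,
 * the controller must also be notified with a real MMIO doorbell write.  For example,
 * event_idx = 5, old = 3, new_idx = 7 requires an MMIO write (5 lies in (3, 7]), while
 * event_idx = 8 with the same old/new values does not.
 */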
1162 static inline int
1163 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
1164 {
1165 	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
1166 }
1167 
1168 static bool
1169 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
1170 				     volatile uint32_t *shadow_db,
1171 				     volatile uint32_t *eventidx)
1172 {
1173 	uint16_t old;
1174 
1175 	if (!shadow_db) {
1176 		return true;
1177 	}
1178 
1179 	old = *shadow_db;
1180 	*shadow_db = value;
1181 
1182 	if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
1183 		return false;
1184 	}
1185 
1186 	return true;
1187 }
1188 
1189 static inline void
1190 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
1191 {
1192 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1193 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1194 	bool need_mmio = true;
1195 
1196 	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1197 		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1198 				pqpair->sq_tail,
1199 				pqpair->shadow_doorbell.sq_tdbl,
1200 				pqpair->shadow_doorbell.sq_eventidx);
1201 	}
1202 
1203 	if (spdk_likely(need_mmio)) {
1204 		spdk_wmb();
1205 		g_thread_mmio_ctrlr = pctrlr;
1206 		spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
1207 		g_thread_mmio_ctrlr = NULL;
1208 	}
1209 }
1210 
1211 static void
1212 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1213 {
1214 	struct nvme_request	*req;
1215 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1216 
1217 	req = tr->req;
1218 	assert(req != NULL);
1219 
1220 	pqpair->tr[tr->cid].active = true;
1221 
1222 	/* Copy the command from the tracker to the submission queue. */
1223 	nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
1224 
1225 	if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
1226 		pqpair->sq_tail = 0;
1227 	}
1228 
1229 	if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
1230 		SPDK_ERRLOG("sq_tail is passing sq_head!\n");
1231 	}
1232 
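	/* When delay_pcie_doorbell is set, skip the MMIO write here; the doorbell write is
	 * deferred so that several submissions can be coalesced into a single MMIO update.
	 */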
1233 	if (!pqpair->flags.delay_pcie_doorbell) {
1234 		nvme_pcie_qpair_ring_sq_doorbell(qpair);
1235 	}
1236 }
1237 
1238 static void
1239 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1240 				 struct spdk_nvme_cpl *cpl, bool print_on_error)
1241 {
1242 	struct nvme_pcie_qpair		*pqpair = nvme_pcie_qpair(qpair);
1243 	struct nvme_request		*req;
1244 	bool				retry, error, was_active;
1245 	bool				req_from_current_proc = true;
1246 
1247 	req = tr->req;
1248 
1249 	assert(req != NULL);
1250 
1251 	error = spdk_nvme_cpl_is_error(cpl);
1252 	retry = error && nvme_completion_is_retry(cpl) &&
1253 		req->retries < spdk_nvme_retry_count;
1254 
1255 	if (error && print_on_error) {
1256 		nvme_qpair_print_command(qpair, &req->cmd);
1257 		nvme_qpair_print_completion(qpair, cpl);
1258 	}
1259 
1260 	was_active = pqpair->tr[cpl->cid].active;
1261 	pqpair->tr[cpl->cid].active = false;
1262 
1263 	assert(cpl->cid == req->cmd.cid);
1264 
1265 	if (retry) {
1266 		req->retries++;
1267 		nvme_pcie_qpair_submit_tracker(qpair, tr);
1268 	} else {
1269 		if (was_active) {
1270 			/* Only check admin requests from different processes. */
1271 			if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
1272 				req_from_current_proc = false;
1273 				nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
1274 			} else {
1275 				nvme_complete_request(req, cpl);
1276 			}
1277 		}
1278 
1279 		if (req_from_current_proc == true) {
1280 			nvme_free_request(req);
1281 		}
1282 
1283 		tr->req = NULL;
1284 
1285 		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
1286 		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1287 
1288 		/*
1289 		 * If the controller is in the middle of resetting, don't
1290 		 *  try to submit queued requests here - let the reset logic
1291 		 *  handle that instead.
1292 		 */
1293 		if (!STAILQ_EMPTY(&qpair->queued_req) &&
1294 		    !qpair->ctrlr->is_resetting) {
1295 			req = STAILQ_FIRST(&qpair->queued_req);
1296 			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1297 			nvme_qpair_submit_request(qpair, req);
1298 		}
1299 	}
1300 }
1301 
1302 static void
1303 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
1304 					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
1305 					bool print_on_error)
1306 {
1307 	struct spdk_nvme_cpl	cpl;
1308 
1309 	memset(&cpl, 0, sizeof(cpl));
1310 	cpl.sqid = qpair->id;
1311 	cpl.cid = tr->cid;
1312 	cpl.status.sct = sct;
1313 	cpl.status.sc = sc;
1314 	cpl.status.dnr = dnr;
1315 	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
1316 }
1317 
1318 static void
1319 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1320 {
1321 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1322 	struct nvme_tracker *tr, *temp;
1323 
1324 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
1325 		SPDK_ERRLOG("aborting outstanding command\n");
1326 		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1327 							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
1328 	}
1329 }
1330 
1331 static void
1332 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1333 {
1334 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1335 	struct nvme_tracker	*tr;
1336 
1337 	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1338 	while (tr != NULL) {
1339 		assert(tr->req != NULL);
1340 		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1341 			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
1342 								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
1343 								false);
1344 			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1345 		} else {
1346 			tr = TAILQ_NEXT(tr, tq_list);
1347 		}
1348 	}
1349 }
1350 
1351 static void
1352 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
1353 {
1354 	nvme_pcie_admin_qpair_abort_aers(qpair);
1355 }
1356 
1357 static int
1358 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1359 {
1360 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1361 
1362 	if (nvme_qpair_is_admin_queue(qpair)) {
1363 		nvme_pcie_admin_qpair_destroy(qpair);
1364 	}
1365 	if (pqpair->cmd && !pqpair->sq_in_cmb) {
1366 		spdk_free(pqpair->cmd);
1367 	}
1368 	if (pqpair->cpl) {
1369 		spdk_free(pqpair->cpl);
1370 	}
1371 	if (pqpair->tr) {
1372 		spdk_free(pqpair->tr);
1373 	}
1374 
1375 	nvme_qpair_deinit(qpair);
1376 
1377 	spdk_free(pqpair);
1378 
1379 	return 0;
1380 }
1381 
1382 static void
1383 nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair)
1384 {
1385 	/*
1386 	 * Manually abort each outstanding admin command.  Do not retry
1387 	 *  admin commands found here, since they will be left over from
1388 	 *  a controller reset, and it's likely the context in which the
1389 	 *  command was issued no longer applies.
1390 	 */
1391 	nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
1392 }
1393 
1394 static void
1395 nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair)
1396 {
1397 	/* Manually abort each outstanding I/O. */
1398 	nvme_pcie_qpair_abort_trackers(qpair, 0);
1399 }
1400 
1401 int
1402 nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair)
1403 {
1404 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1405 
1406 	pqpair->flags.is_enabled = true;
1407 	if (nvme_qpair_is_io_queue(qpair)) {
1408 		nvme_pcie_io_qpair_enable(qpair);
1409 	} else {
1410 		nvme_pcie_admin_qpair_enable(qpair);
1411 	}
1412 
1413 	return 0;
1414 }
1415 
1416 static void
1417 nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair)
1418 {
1419 	nvme_pcie_admin_qpair_abort_aers(qpair);
1420 }
1421 
1422 static void
1423 nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair)
1424 {
1425 }
1426 
1427 int
1428 nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair)
1429 {
1430 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1431 
1432 	pqpair->flags.is_enabled = false;
1433 	if (nvme_qpair_is_io_queue(qpair)) {
1434 		nvme_pcie_io_qpair_disable(qpair);
1435 	} else {
1436 		nvme_pcie_admin_qpair_disable(qpair);
1437 	}
1438 
1439 	return 0;
1440 }
1441 
1442 
1443 int
1444 nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair)
1445 {
1446 	nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
1447 
1448 	return 0;
1449 }
1450 
1451 static int
1452 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
1453 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
1454 				 void *cb_arg)
1455 {
1456 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1457 	struct nvme_request *req;
1458 	struct spdk_nvme_cmd *cmd;
1459 
1460 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1461 	if (req == NULL) {
1462 		return -ENOMEM;
1463 	}
1464 
1465 	cmd = &req->cmd;
1466 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
1467 
1468 	/*
1469 	 * TODO: create a create io completion queue command data
1470 	 *  structure.
1471 	 */
1472 	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1473 	/*
1474 	 * 0x2 = interrupts enabled
1475 	 * 0x1 = physically contiguous
1476 	 */
1477 	cmd->cdw11 = 0x1;
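	/* Interrupts are left disabled here (bit 0x2 not set); SPDK discovers completions by polling the CQ. */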
1478 	cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
1479 
1480 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1481 }
1482 
1483 static int
1484 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
1485 				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1486 {
1487 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1488 	struct nvme_request *req;
1489 	struct spdk_nvme_cmd *cmd;
1490 
1491 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1492 	if (req == NULL) {
1493 		return -ENOMEM;
1494 	}
1495 
1496 	cmd = &req->cmd;
1497 	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
1498 
1499 	/*
1500 	 * TODO: create a create io submission queue command data
1501 	 *  structure.
1502 	 */
1503 	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1504 	/* 0x1 = physically contiguous */
1505 	cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
1506 	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
1507 
1508 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1509 }
1510 
1511 static int
1512 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1513 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1514 {
1515 	struct nvme_request *req;
1516 	struct spdk_nvme_cmd *cmd;
1517 
1518 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1519 	if (req == NULL) {
1520 		return -ENOMEM;
1521 	}
1522 
1523 	cmd = &req->cmd;
1524 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
1525 	cmd->cdw10 = qpair->id;
1526 
1527 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1528 }
1529 
1530 static int
1531 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1532 				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1533 {
1534 	struct nvme_request *req;
1535 	struct spdk_nvme_cmd *cmd;
1536 
1537 	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1538 	if (req == NULL) {
1539 		return -ENOMEM;
1540 	}
1541 
1542 	cmd = &req->cmd;
1543 	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
1544 	cmd->cdw10 = qpair->id;
1545 
1546 	return nvme_ctrlr_submit_admin_request(ctrlr, req);
1547 }
1548 
1549 static int
1550 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1551 				 uint16_t qid)
1552 {
1553 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(ctrlr);
1554 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1555 	struct nvme_completion_poll_status	status;
1556 	int					rc;
1557 
1558 	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1559 	if (rc != 0) {
1560 		return rc;
1561 	}
1562 
1563 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1564 		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
1565 		return -1;
1566 	}
1567 
1568 	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1569 	if (rc != 0) {
1570 		return rc;
1571 	}
1572 
1573 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1574 		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
1575 		/* Attempt to delete the completion queue */
1576 		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1577 		if (rc != 0) {
1578 			return -1;
1579 		}
1580 		spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
1581 		return -1;
1582 	}
1583 
1584 	if (ctrlr->shadow_doorbell) {
1585 		pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
1586 						  pctrlr->doorbell_stride_u32;
1587 		pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
1588 						  pctrlr->doorbell_stride_u32;
1589 		pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
1590 						      pctrlr->doorbell_stride_u32;
1591 		pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
1592 						      pctrlr->doorbell_stride_u32;
1593 		pqpair->flags.has_shadow_doorbell = 1;
1594 	} else {
1595 		pqpair->flags.has_shadow_doorbell = 0;
1596 	}
1597 	nvme_pcie_qpair_reset(qpair);
1598 
1599 	return 0;
1600 }
1601 
1602 struct spdk_nvme_qpair *
1603 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1604 				const struct spdk_nvme_io_qpair_opts *opts)
1605 {
1606 	struct nvme_pcie_qpair *pqpair;
1607 	struct spdk_nvme_qpair *qpair;
1608 	int rc;
1609 
1610 	assert(ctrlr != NULL);
1611 
1612 	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1613 			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1614 	if (pqpair == NULL) {
1615 		return NULL;
1616 	}
1617 
1618 	pqpair->num_entries = opts->io_queue_size;
1619 	pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell;
1620 
1621 	qpair = &pqpair->qpair;
1622 
1623 	rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
1624 	if (rc != 0) {
1625 		nvme_pcie_qpair_destroy(qpair);
1626 		return NULL;
1627 	}
1628 
1629 	rc = nvme_pcie_qpair_construct(qpair);
1630 	if (rc != 0) {
1631 		nvme_pcie_qpair_destroy(qpair);
1632 		return NULL;
1633 	}
1634 
1635 	rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
1636 
1637 	if (rc != 0) {
1638 		SPDK_ERRLOG("I/O queue creation failed\n");
1639 		nvme_pcie_qpair_destroy(qpair);
1640 		return NULL;
1641 	}
1642 
1643 	return qpair;
1644 }
1645 
1646 int
1647 nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1648 {
1649 	return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
1650 }
1651 
1652 int
1653 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1654 {
1655 	struct nvme_completion_poll_status status;
1656 	int rc;
1657 
1658 	assert(ctrlr != NULL);
1659 
1660 	if (ctrlr->is_removed) {
1661 		goto free;
1662 	}
1663 
1664 	/* Delete the I/O submission queue */
1665 	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1666 	if (rc != 0) {
1667 		SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1668 		return rc;
1669 	}
1670 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1671 		return -1;
1672 	}
1673 
1674 	if (qpair->no_deletion_notification_needed == 0) {
1675 		/* Complete any I/O in the completion queue */
1676 		nvme_pcie_qpair_process_completions(qpair, 0);
1677 
1678 		/* Abort the rest of the I/O */
1679 		nvme_pcie_qpair_abort_trackers(qpair, 1);
1680 	}
1681 
1682 	/* Delete the completion queue */
1683 	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1684 	if (rc != 0) {
1685 		SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1686 		return rc;
1687 	}
1688 	if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1689 		return -1;
1690 	}
1691 
1692 free:
1693 	nvme_pcie_qpair_destroy(qpair);
1694 	return 0;
1695 }
1696 
1697 static void
1698 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1699 {
1700 	/*
1701 	 * Bad vtophys translation, so abort this request and return
1702 	 *  immediately.
1703 	 */
1704 	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1705 						SPDK_NVME_SC_INVALID_FIELD,
1706 						1 /* do not retry */, true);
1707 }
1708 
1709 /*
1710  * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1711  *
1712  * *prp_index will be updated to account for the number of PRP entries used.
1713  */
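/*
 * Example: a payload spanning four pages uses prp1 for the first page and tr->u.prp[0..2]
 * for the remaining three; because more than two PRP entries are needed, prp2 is set to
 * tr->prp_sgl_bus_addr, the physical address of the embedded PRP list.
 */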
1714 static int
1715 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
1716 			  uint32_t page_size)
1717 {
1718 	struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1719 	uintptr_t page_mask = page_size - 1;
1720 	uint64_t phys_addr;
1721 	uint32_t i;
1722 
1723 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
1724 		      *prp_index, virt_addr, (uint32_t)len);
1725 
1726 	if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1727 		SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1728 		return -EINVAL;
1729 	}
1730 
1731 	i = *prp_index;
1732 	while (len) {
1733 		uint32_t seg_len;
1734 
1735 		/*
1736 		 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1737 		 * so prp_index == count is valid.
1738 		 */
1739 		if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1740 			SPDK_ERRLOG("out of PRP entries\n");
1741 			return -EINVAL;
1742 		}
1743 
1744 		phys_addr = spdk_vtophys(virt_addr, NULL);
1745 		if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1746 			SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1747 			return -EINVAL;
1748 		}
1749 
1750 		if (i == 0) {
1751 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
1752 			cmd->dptr.prp.prp1 = phys_addr;
1753 			seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1754 		} else {
1755 			if ((phys_addr & page_mask) != 0) {
1756 				SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1757 				return -EINVAL;
1758 			}
1759 
1760 			SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1761 			tr->u.prp[i - 1] = phys_addr;
1762 			seg_len = page_size;
1763 		}
1764 
1765 		seg_len = spdk_min(seg_len, len);
1766 		virt_addr += seg_len;
1767 		len -= seg_len;
1768 		i++;
1769 	}
1770 
1771 	cmd->psdt = SPDK_NVME_PSDT_PRP;
1772 	if (i <= 1) {
1773 		cmd->dptr.prp.prp2 = 0;
1774 	} else if (i == 2) {
1775 		cmd->dptr.prp.prp2 = tr->u.prp[0];
1776 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1777 	} else {
1778 		cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1779 		SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1780 	}
1781 
1782 	*prp_index = i;
1783 	return 0;
1784 }
1785 
1786 /**
1787  * Build PRP list describing physically contiguous payload buffer.
1788  */
1789 static int
1790 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1791 				     struct nvme_tracker *tr)
1792 {
1793 	uint32_t prp_index = 0;
1794 	int rc;
1795 
1796 	rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
1797 				       req->payload_size, qpair->ctrlr->page_size);
1798 	if (rc) {
1799 		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1800 		return rc;
1801 	}
1802 
1803 	return 0;
1804 }
1805 
1806 /**
1807  * Build SGL list describing scattered payload buffer.
1808  */
1809 static int
1810 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1811 				     struct nvme_tracker *tr)
1812 {
1813 	int rc;
1814 	void *virt_addr;
1815 	uint64_t phys_addr;
1816 	uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1817 	struct spdk_nvme_sgl_descriptor *sgl;
1818 	uint32_t nseg = 0;
1819 
1820 	/*
1821 	 * Build scattered payloads.
1822 	 */
1823 	assert(req->payload_size != 0);
1824 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1825 	assert(req->payload.reset_sgl_fn != NULL);
1826 	assert(req->payload.next_sge_fn != NULL);
1827 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1828 
1829 	sgl = tr->u.sgl;
1830 	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1831 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1832 
1833 	remaining_transfer_len = req->payload_size;
1834 
1835 	while (remaining_transfer_len > 0) {
1836 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
1837 					      &virt_addr, &remaining_user_sge_len);
1838 		if (rc) {
1839 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1840 			return -1;
1841 		}
1842 
1843 		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1844 		remaining_transfer_len -= remaining_user_sge_len;
1845 		while (remaining_user_sge_len > 0) {
1846 			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1847 				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1848 				return -1;
1849 			}
1850 
1851 			phys_addr = spdk_vtophys(virt_addr, NULL);
1852 			if (phys_addr == SPDK_VTOPHYS_ERROR) {
1853 				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1854 				return -1;
1855 			}
1856 
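			/*
			 * Limit the descriptor to the remainder of the current 2 MB region,
			 * since physical contiguity is only guaranteed within a single
			 * hugepage-backed 2 MB chunk.
			 */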
1857 			length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
1858 			remaining_user_sge_len -= length;
1859 			virt_addr += length;
1860 
1861 			if (nseg > 0 && phys_addr ==
1862 			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
1863 				/* extend previous entry */
1864 				(*(sgl - 1)).unkeyed.length += length;
1865 				continue;
1866 			}
1867 
1868 			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1869 			sgl->unkeyed.length = length;
1870 			sgl->address = phys_addr;
1871 			sgl->unkeyed.subtype = 0;
1872 
1873 			sgl++;
1874 			nseg++;
1875 		}
1876 	}
1877 
1878 	if (nseg == 1) {
1879 		/*
1880 		 * The whole transfer can be described by a single SGL descriptor.
1881 		 *  Use the special case described by the spec where SGL1's type is Data Block.
1882 		 *  This means the SGL in the tracker is not used at all, so copy the first (and only)
1883 		 *  SGL element into SGL1.
1884 		 */
1885 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1886 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1887 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1888 	} else {
1889 		/* For now the driver only supports a single SGL segment per NVMe command. */
1890 		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1891 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1892 		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1893 	}
1894 
1895 	return 0;
1896 }
1897 
1898 /**
1899  * Build PRP list describing scattered payload buffer.
1900  */
1901 static int
1902 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1903 				       struct nvme_tracker *tr)
1904 {
1905 	int rc;
1906 	void *virt_addr;
1907 	uint32_t remaining_transfer_len, length;
1908 	uint32_t prp_index = 0;
1909 	uint32_t page_size = qpair->ctrlr->page_size;
1910 
1911 	/*
1912 	 * Build scattered payloads.
1913 	 */
1914 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1915 	assert(req->payload.reset_sgl_fn != NULL);
1916 	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1917 
1918 	remaining_transfer_len = req->payload_size;
1919 	while (remaining_transfer_len > 0) {
1920 		assert(req->payload.next_sge_fn != NULL);
1921 		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
1922 		if (rc) {
1923 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1924 			return -1;
1925 		}
1926 
1927 		length = spdk_min(remaining_transfer_len, length);
1928 
1929 		/*
1930 		 * Any incompatible SGEs should have been handled in the splitting routine,
1931 		 *  but assert here as an additional check.
1932 		 *
1933 		 * All SGEs except the last must end on a page boundary.
1934 		 */
1935 		assert((length == remaining_transfer_len) ||
1936 		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));
1937 
1938 		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
1939 		if (rc) {
1940 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1941 			return rc;
1942 		}
1943 
1944 		remaining_transfer_len -= length;
1945 	}
1946 
1947 	return 0;
1948 }
1949 
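/*
 * Re-enable the qpair if it was disabled (e.g. by a controller reset) and the
 * controller is no longer resetting.  Returns whether the qpair is enabled.
 */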
1950 static inline bool
1951 nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
1952 {
1953 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1954 
1955 	if (!pqpair->flags.is_enabled &&
1956 	    !qpair->ctrlr->is_resetting) {
1957 		nvme_qpair_enable(qpair);
1958 	}
1959 	return pqpair->flags.is_enabled;
1960 }
1961 
1962 int
1963 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1964 {
1965 	struct nvme_tracker	*tr;
1966 	int			rc = 0;
1967 	void			*md_payload;
1968 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
1969 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
1970 
1971 	nvme_pcie_qpair_check_enabled(qpair);
1972 
1973 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1974 		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
1975 	}
1976 
1977 	tr = TAILQ_FIRST(&pqpair->free_tr);
1978 
1979 	if (tr == NULL || !pqpair->flags.is_enabled) {
1980 		/*
1981 		 * No tracker is available, or the qpair is disabled due to
1982 		 *  an in-progress controller-level reset.
1983 		 *
1984 		 * Put the request on the qpair's request queue to be
1985 		 *  processed when a tracker frees up via a command
1986 		 *  completion or when the controller reset is
1987 		 *  completed.
1988 		 */
1989 		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1990 		goto exit;
1991 	}
1992 
1993 	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1994 	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1995 	tr->req = req;
1996 	req->cmd.cid = tr->cid;
1997 
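	/*
	 * If the request has a separate metadata buffer, translate it and store
	 * its physical address in the command's metadata pointer (MPTR).
	 */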
1998 	if (req->payload_size && req->payload.md) {
1999 		md_payload = req->payload.md + req->md_offset;
2000 		tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
2001 		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
2002 			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2003 			rc = -EINVAL;
2004 			goto exit;
2005 		}
2006 	}
2007 
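	/*
	 * Build the command's data pointer: PRPs for contiguous payload buffers,
	 * and either hardware SGLs or a PRP list for scattered payloads,
	 * depending on whether the controller supports NVMe SGLs.
	 */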
2008 	if (req->payload_size == 0) {
2009 		/* Null payload - leave PRP fields zeroed */
2010 		rc = 0;
2011 	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
2012 		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
2013 	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
2014 		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
2015 			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
2016 		} else {
2017 			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
2018 		}
2019 	} else {
2020 		assert(0);
2021 		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
2022 		rc = -EINVAL;
2023 	}
2024 
2025 	if (rc < 0) {
2026 		goto exit;
2027 	}
2028 
2029 	nvme_pcie_qpair_submit_tracker(qpair, tr);
2030 
2031 exit:
2032 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2033 		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
2034 	}
2035 
2036 	return rc;
2037 }
2038 
2039 static void
2040 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
2041 {
2042 	uint64_t t02;
2043 	struct nvme_tracker *tr, *tmp;
2044 	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
2045 	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
2046 	struct spdk_nvme_ctrlr_process *active_proc;
2047 
2048 	/* Don't check timeouts during controller initialization. */
2049 	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
2050 		return;
2051 	}
2052 
2053 	if (nvme_qpair_is_admin_queue(qpair)) {
2054 		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
2055 	} else {
2056 		active_proc = qpair->active_proc;
2057 	}
2058 
2059 	/* Only check timeouts if the current process has a timeout callback. */
2060 	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
2061 		return;
2062 	}
2063 
2064 	t02 = spdk_get_ticks();
2065 	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
2066 		assert(tr->req != NULL);
2067 
2068 		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
2069 			/*
2070 			 * The requests are in order, so as soon as one has not timed out,
2071 			 * stop iterating.
2072 			 */
2073 			break;
2074 		}
2075 	}
2076 }
2077 
2078 int32_t
2079 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
2080 {
2081 	struct nvme_pcie_qpair	*pqpair = nvme_pcie_qpair(qpair);
2082 	struct nvme_pcie_ctrlr	*pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
2083 	struct nvme_tracker	*tr;
2084 	struct spdk_nvme_cpl	*cpl;
2085 	uint32_t		 num_completions = 0;
2086 	struct spdk_nvme_ctrlr	*ctrlr = qpair->ctrlr;
2087 
2088 	if (spdk_unlikely(!nvme_pcie_qpair_check_enabled(qpair))) {
2089 		/*
2090 		 * qpair is not enabled, likely because a controller reset is
2091 		 *  in progress.  Ignore the completions - any I/O that was
2092 		 *  outstanding on this qpair will get retried when the
2093 		 *  reset is complete.
2094 		 */
2095 		return 0;
2096 	}
2097 
2098 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2099 		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
2100 	}
2101 
2102 	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
2103 		/*
2104 		 * max_completions == 0 means unlimited, but still complete at most
2105 		 * max_completions_cap I/O at a time so that the completion queue
2106 		 * doorbell doesn't wrap around.
2107 		 */
2108 		max_completions = pqpair->max_completions_cap;
2109 	}
2110 
2111 	while (1) {
2112 		cpl = &pqpair->cpl[pqpair->cq_head];
2113 
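		/*
		 * Stop when the entry's phase bit no longer matches the expected
		 * phase - no new completions have been posted beyond this point.
		 */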
2114 		if (cpl->status.p != pqpair->flags.phase) {
2115 			break;
2116 		}
2117 #if defined(__PPC64__) || defined(__aarch64__)
2118 		/*
2119 		 * This memory barrier prevents reordering of:
2120 		 * - a load after a store to/from tr
2121 		 * - a load of the cpl CID after the load of the cpl phase bit
2122 		 */
2123 		spdk_mb();
2124 #endif
2125 
2126 		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
2127 			pqpair->cq_head = 0;
2128 			pqpair->flags.phase = !pqpair->flags.phase;
2129 		}
2130 
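		/* The completion's command identifier (CID) selects the tracker for the original request. */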
2131 		tr = &pqpair->tr[cpl->cid];
2132 		pqpair->sq_head = cpl->sqhd;
2133 
2134 		if (tr->active) {
2135 			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
2136 		} else {
2137 			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
2138 			nvme_qpair_print_completion(qpair, cpl);
2139 			assert(0);
2140 		}
2141 
2142 		if (++num_completions == max_completions) {
2143 			break;
2144 		}
2145 	}
2146 
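	/*
	 * Update the completion queue head doorbell.  When shadow doorbells are
	 * in use, the MMIO write can be skipped if the event index shows the
	 * controller does not need it.
	 */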
2147 	if (num_completions > 0) {
2148 		bool need_mmio = true;
2149 
2150 		if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
2151 			need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
2152 					pqpair->cq_head,
2153 					pqpair->shadow_doorbell.cq_hdbl,
2154 					pqpair->shadow_doorbell.cq_eventidx);
2155 		}
2156 
2157 		if (spdk_likely(need_mmio)) {
2158 			g_thread_mmio_ctrlr = pctrlr;
2159 			spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
2160 			g_thread_mmio_ctrlr = NULL;
2161 		}
2162 	}
2163 
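	/*
	 * If submission queue doorbell writes are being deferred, ring the
	 * doorbell now for any commands submitted since the last update.
	 */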
2164 	if (pqpair->flags.delay_pcie_doorbell) {
2165 		if (pqpair->last_sq_tail != pqpair->sq_tail) {
2166 			nvme_pcie_qpair_ring_sq_doorbell(qpair);
2167 			pqpair->last_sq_tail = pqpair->sq_tail;
2168 		}
2169 	}
2170 
2171 	if (spdk_unlikely(ctrlr->timeout_enabled)) {
2172 		/*
2173 		 * User registered for timeout callback
2174 		 */
2175 		nvme_pcie_qpair_check_timeout(qpair);
2176 	}
2177 
2178 	/* Before returning, complete any pending admin request. */
2179 	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2180 		nvme_pcie_qpair_complete_pending_admin_request(qpair);
2181 
2182 		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
2183 	}
2184 
2185 	return num_completions;
2186 }
2187