/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2019 Intel Corporation.
 *   All rights reserved.
 */

#include "vmd_internal.h"

#include "spdk/stdinc.h"
#include "spdk/string.h"
#include "spdk/likely.h"

static const char *device_type[] = {
	"PCI Express Endpoint",
	"Legacy PCI Express Endpoint",
	"Reserved 1",
	"Reserved 2",
	"Root Port of PCI Express Root Complex",
	"Upstream Port of PCI Express Switch",
	"Downstream Port of PCI Express Switch",
	"PCI Express to PCI/PCI-X Bridge",
	"PCI/PCI-X to PCI Express Bridge",
	"Root Complex Integrated Endpoint",
	"Root Complex Event Collector",
	"Reserved Capability"
};

/*
 * Container for all VMD adapters probed in the system.
 */
struct vmd_container {
	uint32_t count;
	struct vmd_adapter vmd[MAX_VMD_SUPPORTED];
};

static struct vmd_container g_vmd_container;
static uint8_t g_end_device_count;

static bool
vmd_is_valid_cfg_addr(struct vmd_pci_bus *bus, uint64_t addr)
{
	return addr >= (uint64_t)bus->vmd->cfg_vaddr &&
	       addr < bus->vmd->cfgbar_size + (uint64_t)bus->vmd->cfg_vaddr;
}

static void
vmd_align_base_addrs(struct vmd_adapter *vmd, uint32_t alignment)
{
	uint32_t pad;

	/*
	 *  The device is not in the hot plug path; align the base address remaining from membar 1.
	 */
	if (vmd->physical_addr & (alignment - 1)) {
		pad = alignment - (vmd->physical_addr & (alignment - 1));
		vmd->physical_addr += pad;
		vmd->current_addr_size -= pad;
	}
}
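
/*
 * Worked example (illustrative): with alignment = 1 MB (0x100000) and
 * vmd->physical_addr = 0x12340000, the low bits are 0x40000, so
 * pad = 0x100000 - 0x40000 = 0xc0000; the base moves to 0x12400000 and
 * current_addr_size shrinks by the same 0xc0000.
 */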

static bool
vmd_device_is_enumerated(volatile struct pci_header *header)
{
	return header->one.prefetch_base_upper == VMD_UPPER_BASE_SIGNATURE &&
	       header->one.prefetch_limit_upper == VMD_UPPER_LIMIT_SIGNATURE;
}

static bool
vmd_device_is_root_port(volatile struct pci_header *header)
{
	return header->common.vendor_id == SPDK_PCI_VID_INTEL &&
	       (header->common.device_id == PCI_ROOT_PORT_A_INTEL_SKX ||
		header->common.device_id == PCI_ROOT_PORT_B_INTEL_SKX ||
		header->common.device_id == PCI_ROOT_PORT_C_INTEL_SKX ||
		header->common.device_id == PCI_ROOT_PORT_D_INTEL_SKX ||
		header->common.device_id == PCI_ROOT_PORT_A_INTEL_ICX ||
		header->common.device_id == PCI_ROOT_PORT_B_INTEL_ICX ||
		header->common.device_id == PCI_ROOT_PORT_C_INTEL_ICX ||
		header->common.device_id == PCI_ROOT_PORT_D_INTEL_ICX);
}

static void
vmd_hotplug_coalesce_regions(struct vmd_hot_plug *hp)
{
	struct pci_mem_mgr *region, *prev;

	do {
		prev = NULL;
		TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) {
			if (prev != NULL && (prev->addr + prev->size == region->addr)) {
				break;
			}

			prev = region;
		}

		if (region != NULL) {
			prev->size += region->size;
			TAILQ_REMOVE(&hp->free_mem_queue, region, tailq);
			TAILQ_INSERT_TAIL(&hp->unused_mem_queue, region, tailq);
		}
	} while (region != NULL);
}
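
/*
 * Example (illustrative): if the free queue holds [0x8000, +0x1000) followed
 * by [0x9000, +0x2000), the regions are adjacent, so one pass above grows
 * the first descriptor to [0x8000, +0x3000) and returns the second to the
 * unused queue.  The do/while repeats until a full pass finds no adjacent
 * pair.
 */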

static void
vmd_hotplug_free_region(struct vmd_hot_plug *hp, struct pci_mem_mgr *region)
{
	struct pci_mem_mgr *current, *prev = NULL;

	assert(region->addr >= hp->bar.start && region->addr < hp->bar.start + hp->bar.size);

	TAILQ_FOREACH(current, &hp->free_mem_queue, tailq) {
		if (current->addr > region->addr) {
			break;
		}

		prev = current;
	}

	if (prev != NULL) {
		assert(prev->addr + prev->size <= region->addr);
		assert(current == NULL || (region->addr + region->size <= current->addr));
		TAILQ_INSERT_AFTER(&hp->free_mem_queue, prev, region, tailq);
	} else {
		TAILQ_INSERT_HEAD(&hp->free_mem_queue, region, tailq);
	}

	vmd_hotplug_coalesce_regions(hp);
}

static void
vmd_hotplug_free_addr(struct vmd_hot_plug *hp, uint64_t addr)
{
	struct pci_mem_mgr *region;

	TAILQ_FOREACH(region, &hp->alloc_mem_queue, tailq) {
		if (region->addr == addr) {
			break;
		}
	}

	assert(region != NULL);
	TAILQ_REMOVE(&hp->alloc_mem_queue, region, tailq);

	vmd_hotplug_free_region(hp, region);
}

static uint64_t
vmd_hotplug_allocate_base_addr(struct vmd_hot_plug *hp, uint32_t size)
{
	struct pci_mem_mgr *region = NULL, *free_region;

	TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) {
		if (region->size >= size) {
			break;
		}
	}

	if (region == NULL) {
		SPDK_INFOLOG(vmd, "Unable to find free hotplug memory region of size:"
			     "%"PRIx32"\n", size);
		return 0;
	}

	TAILQ_REMOVE(&hp->free_mem_queue, region, tailq);
	if (size < region->size) {
		free_region = TAILQ_FIRST(&hp->unused_mem_queue);
		if (free_region == NULL) {
			SPDK_INFOLOG(vmd, "Unable to find unused descriptor to store the "
				     "free region of size: %"PRIu32"\n", region->size - size);
		} else {
			TAILQ_REMOVE(&hp->unused_mem_queue, free_region, tailq);
			free_region->size = region->size - size;
			free_region->addr = region->addr + size;
			region->size = size;
			vmd_hotplug_free_region(hp, free_region);
		}
	}

	TAILQ_INSERT_TAIL(&hp->alloc_mem_queue, region, tailq);

	return region->addr;
}
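
/*
 * Example (illustrative): with a free region [0x8000, +0x4000), a first-fit
 * request for 0x1000 bytes trims the descriptor to [0x8000, +0x1000) and
 * moves it to the allocated queue, while a spare descriptor from the unused
 * queue takes the remainder [0x9000, +0x3000) and goes back on the free
 * queue.  0 is returned when no region is large enough.
 */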

/*
 *  Allocates an address from the vmd membar for the input memory size
 *  vmdAdapter - vmd adapter object
 *  dev - vmd_pci_device to allocate a base address for.
 *  size - size of the memory window requested.
 *  Size must be a power of 2. Addresses are returned on the size boundary.
 *  Returns the physical address within the VMD membar window, or 0x0 if the window cannot be allocated.
 *  Consider increasing the size of the vmd membar if 0x0 is returned.
 */
static uint64_t
vmd_allocate_base_addr(struct vmd_adapter *vmd, struct vmd_pci_device *dev, uint32_t size)
{
	uint64_t base_address = 0, padding = 0;
	struct vmd_pci_bus *hp_bus;

	if (size && ((size & (~size + 1)) != size)) {
		return base_address;
	}

	/*
	 *  If the device is downstream of a hot plug port, allocate an address from the
	 *  range dedicated to the hot plug slot. Search the list of allocated addresses to determine
	 *  whether a free range exists that satisfies the input request.  If a free range cannot be
	 *  found, get a buffer from the unused chunk. A first-fit algorithm is used.
	 */
	if (dev) {
		hp_bus = dev->parent;
		if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) {
			return vmd_hotplug_allocate_base_addr(&hp_bus->self->hp, size);
		}
	}

	/* Ensure the allocated physical membar address is size-aligned */
	if (vmd->physical_addr & (size - 1)) {
		padding = size - (vmd->physical_addr & (size - 1));
	}

	/* Allocate from the membar if enough memory is left */
	if (vmd->current_addr_size >= size + padding) {
		base_address = vmd->physical_addr + padding;
		vmd->physical_addr += size + padding;
		vmd->current_addr_size -= size + padding;
	}

	SPDK_INFOLOG(vmd, "allocated(size) %" PRIx64 " (%x)\n", base_address, size);

	return base_address;
}
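
/*
 * Note on the power-of-two check above (illustrative): in two's complement,
 * ~size + 1 == -size, so size & (~size + 1) isolates the lowest set bit of
 * size.  The result equals size only when a single bit is set, i.e. when
 * size is a power of two: 0x2000 passes (0x2000 & -0x2000 == 0x2000), while
 * 0x3000 is rejected (0x3000 & -0x3000 == 0x1000).
 */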

static bool
vmd_is_end_device(struct vmd_pci_device *dev)
{
	return (dev && dev->header) &&
	       ((dev->header->common.header_type & ~PCI_MULTI_FUNCTION) == PCI_HEADER_TYPE_NORMAL);
}

static void
vmd_update_base_limit_register(struct vmd_pci_device *dev, uint16_t base, uint16_t limit)
{
	struct vmd_pci_bus *bus;
	struct vmd_pci_device *bridge;

	if (base == 0 || limit == 0) {
		return;
	}

	if (dev->header->common.header_type == PCI_HEADER_TYPE_BRIDGE) {
		bus = dev->bus_object;
	} else {
		bus = dev->parent;
	}

	bridge = bus->self;
	SPDK_INFOLOG(vmd, "base:limit = %x:%x\n", bridge->header->one.mem_base,
		     bridge->header->one.mem_limit);

	if (dev->bus->vmd->scan_completed) {
		return;
	}

	while (bus && bus->self != NULL) {
		bridge = bus->self;

		/* This is only for 32-bit memory space; needs to be revisited to support 64-bit */
		if (bridge->header->one.mem_base > base) {
			bridge->header->one.mem_base = base;
			base = bridge->header->one.mem_base;
		}

		if (bridge->header->one.mem_limit < limit) {
			bridge->header->one.mem_limit = limit;
			limit = bridge->header->one.mem_limit;
		}

		bus = bus->parent;
	}
}
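
/*
 * Example (illustrative): if an end device is assigned a window whose
 * bridge-register encoding is base = 0x0010 and limit = 0x001f, the walk
 * above visits each bridge up to the root port, lowering mem_base to at
 * most 0x0010 and raising mem_limit to at least 0x001f, so every bridge on
 * the path forwards the device's entire window.
 */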

static uint64_t
vmd_get_base_addr(struct vmd_pci_device *dev, uint32_t index, uint32_t size)
{
	struct vmd_pci_bus *bus = dev->parent;

	if (dev->header_type == PCI_HEADER_TYPE_BRIDGE) {
		return dev->header->zero.BAR[index] & ~0xf;
	} else {
		if (bus->self->hotplug_capable) {
			return vmd_hotplug_allocate_base_addr(&bus->self->hp, size);
		} else {
			return (uint64_t)bus->self->header->one.mem_base << 16;
		}
	}
}

static bool
vmd_assign_base_addrs(struct vmd_pci_device *dev)
{
	uint16_t mem_base = 0, mem_limit = 0;
	unsigned char mem_attr = 0;
	int last;
	struct vmd_adapter *vmd = NULL;
	bool ret_val = false;
	uint32_t bar_value;
	uint32_t table_offset;

	if (dev && dev->bus) {
		vmd = dev->bus->vmd;
	}

	if (!vmd) {
		return 0;
	}

	vmd_align_base_addrs(vmd, ONE_MB);

	last = dev->header_type ? 2 : 6;
	for (int i = 0; i < last; i++) {
		bar_value = dev->header->zero.BAR[i];
		dev->header->zero.BAR[i] = ~(0U);
		dev->bar[i].size = dev->header->zero.BAR[i];
		dev->header->zero.BAR[i] = bar_value;

		if (dev->bar[i].size == ~(0U) || dev->bar[i].size == 0 ||
		    dev->header->zero.BAR[i] & 1) {
			dev->bar[i].size = 0;
			continue;
		}
		mem_attr = dev->bar[i].size & PCI_BASE_ADDR_MASK;
		dev->bar[i].size = TWOS_COMPLEMENT(dev->bar[i].size & PCI_BASE_ADDR_MASK);

		if (vmd->scan_completed) {
			dev->bar[i].start = vmd_get_base_addr(dev, i, dev->bar[i].size);
		} else {
			dev->bar[i].start = vmd_allocate_base_addr(vmd, dev, dev->bar[i].size);
		}

		dev->header->zero.BAR[i] = (uint32_t)dev->bar[i].start;

		if (!dev->bar[i].start) {
			if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) {
				i++;
			}
			continue;
		}

		dev->bar[i].vaddr = ((uint64_t)vmd->mem_vaddr + (dev->bar[i].start - vmd->membar));
		mem_limit = BRIDGE_BASEREG(dev->header->zero.BAR[i]) +
			    BRIDGE_BASEREG(dev->bar[i].size - 1);
		if (!mem_base) {
			mem_base = BRIDGE_BASEREG(dev->header->zero.BAR[i]);
		}

		ret_val = true;

		if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) {
			i++;
			if (i < last) {
				dev->header->zero.BAR[i] = (uint32_t)(dev->bar[i].start >> PCI_DWORD_SHIFT);
			}
		}
	}

	/* Enable device MEM and bus mastering */
	dev->header->zero.command |= (PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
	/*
	 * Writes to the pci config space are posted writes. To ensure a transaction reaches its
	 * destination before another write is posted, an immediate read of the written value
	 * should be performed.
	 */
	{ uint16_t cmd = dev->header->zero.command; (void)cmd; }

	if (dev->msix_cap && ret_val) {
		table_offset = ((volatile struct pci_msix_cap *)dev->msix_cap)->msix_table_offset;
		if (dev->bar[table_offset & 0x3].vaddr) {
			dev->msix_table = (volatile struct pci_msix_table_entry *)
					  (dev->bar[table_offset & 0x3].vaddr + (table_offset & 0xfff8));
		}
	}

	if (ret_val && vmd_is_end_device(dev)) {
		vmd_update_base_limit_register(dev, mem_base, mem_limit);
	}

	return ret_val;
}
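
/*
 * Worked example for the BAR sizing sequence above (illustrative): writing
 * ~0 to a BAR and reading it back might return 0xfffe0000.  Masking the
 * readback and taking the two's complement (TWOS_COMPLEMENT) yields
 * 0x00020000, i.e. a 128 KB window, which is then used to allocate
 * dev->bar[i].start.
 */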

static void
vmd_get_device_capabilities(struct vmd_pci_device *dev)
{
	volatile uint8_t *config_space;
	uint8_t capabilities_offset;
	struct pci_capabilities_header *capabilities_hdr;

	config_space = (volatile uint8_t *)dev->header;
	if ((dev->header->common.status & PCI_CAPABILITIES_LIST) == 0) {
		return;
	}

	capabilities_offset = dev->header->zero.cap_pointer;
	if (dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) {
		capabilities_offset = dev->header->one.cap_pointer;
	}

	while (capabilities_offset > 0) {
		capabilities_hdr = (struct pci_capabilities_header *)
				   &config_space[capabilities_offset];
		switch (capabilities_hdr->capability_id) {
		case CAPABILITY_ID_PCI_EXPRESS:
			dev->pcie_cap = (volatile struct pci_express_cap *)(capabilities_hdr);
			break;

		case CAPABILITY_ID_MSI:
			dev->msi_cap = (volatile struct pci_msi_cap *)capabilities_hdr;
			break;

		case CAPABILITY_ID_MSIX:
			dev->msix_cap = (volatile struct pci_msix_capability *)capabilities_hdr;
			dev->msix_table_size = dev->msix_cap->message_control.bit.table_size + 1;
			break;

		default:
			break;
		}
		capabilities_offset = capabilities_hdr->next;
	}
}

static volatile struct pci_enhanced_capability_header *
vmd_get_enhanced_capabilities(struct vmd_pci_device *dev, uint16_t capability_id)
{
	uint8_t *data;
	uint16_t cap_offset = EXTENDED_CAPABILITY_OFFSET;
	volatile struct pci_enhanced_capability_header *cap_hdr = NULL;

	data = (uint8_t *)dev->header;
	while (cap_offset >= EXTENDED_CAPABILITY_OFFSET) {
		cap_hdr = (volatile struct pci_enhanced_capability_header *) &data[cap_offset];
		if (cap_hdr->capability_id == capability_id) {
			return cap_hdr;
		}
		cap_offset = cap_hdr->next;
		if (cap_offset == 0 || cap_offset < EXTENDED_CAPABILITY_OFFSET) {
			break;
		}
	}

	return NULL;
}

static void
vmd_read_config_space(struct vmd_pci_device *dev)
{
	/*
	 * Writes to the pci config space are posted writes. To ensure a transaction reaches its
	 * destination before another write is posted, an immediate read of the written value
	 * should be performed.
	 */
	dev->header->common.command |= (BUS_MASTER_ENABLE | MEMORY_SPACE_ENABLE);
	{ uint16_t cmd = dev->header->common.command; (void)cmd; }

	vmd_get_device_capabilities(dev);
	dev->sn_cap = (struct serial_number_capability *)vmd_get_enhanced_capabilities(dev,
			DEVICE_SERIAL_NUMBER_CAP_ID);
}

static void
vmd_update_scan_info(struct vmd_pci_device *dev)
{
	struct vmd_adapter *vmd_adapter = dev->bus->vmd;

	if (vmd_adapter->root_port_updated) {
		return;
	}

	if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
		return;
	}

	if (vmd_device_is_root_port(dev->header)) {
		vmd_adapter->root_port_updated = 1;
		SPDK_INFOLOG(vmd, "root_port_updated = %d\n",
			     vmd_adapter->root_port_updated);
		SPDK_INFOLOG(vmd, "upper:limit = %x : %x\n",
			     dev->header->one.prefetch_base_upper,
			     dev->header->one.prefetch_limit_upper);
		if (vmd_device_is_enumerated(dev->header)) {
			vmd_adapter->scan_completed = 1;
			SPDK_INFOLOG(vmd, "scan_completed = %d\n",
				     vmd_adapter->scan_completed);
		}
	}
}

static void
vmd_reset_base_limit_registers(volatile struct pci_header *header)
{
	uint32_t reg __attribute__((unused));

	/*
	 * Writes to the pci config space are posted writes.
	 * To ensure transaction reaches its destination
	 * before another write is posted, an immediate read
	 * of the written value should be performed.
	 */
	header->one.mem_base = 0xfff0;
	reg = header->one.mem_base;
	header->one.mem_limit = 0x0;
	reg = header->one.mem_limit;
	header->one.prefetch_base = 0x0;
	reg = header->one.prefetch_base;
	header->one.prefetch_limit = 0x0;
	reg = header->one.prefetch_limit;
	header->one.prefetch_base_upper = 0x0;
	reg = header->one.prefetch_base_upper;
	header->one.prefetch_limit_upper = 0x0;
	reg = header->one.prefetch_limit_upper;
	header->one.io_base_upper = 0x0;
	reg = header->one.io_base_upper;
	header->one.io_limit_upper = 0x0;
	reg = header->one.io_limit_upper;
	header->one.primary = 0;
	reg = header->one.primary;
	header->one.secondary = 0;
	reg = header->one.secondary;
	header->one.subordinate = 0;
	reg = header->one.subordinate;
}

static void
vmd_init_hotplug(struct vmd_pci_device *dev, struct vmd_pci_bus *bus)
{
	struct vmd_adapter *vmd = bus->vmd;
	struct vmd_hot_plug *hp = &dev->hp;
	size_t mem_id;

	dev->hotplug_capable = true;
	hp->bar.size = 1 << 20;

	if (!vmd->scan_completed) {
		hp->bar.start = vmd_allocate_base_addr(vmd, NULL, hp->bar.size);
		bus->self->header->one.mem_base = BRIDGE_BASEREG(hp->bar.start);
		bus->self->header->one.mem_limit =
			bus->self->header->one.mem_base + BRIDGE_BASEREG(hp->bar.size - 1);
	} else {
		hp->bar.start = (uint64_t)bus->self->header->one.mem_base << 16;
	}

	hp->bar.vaddr = (uint64_t)vmd->mem_vaddr + (hp->bar.start - vmd->membar);

	TAILQ_INIT(&hp->free_mem_queue);
	TAILQ_INIT(&hp->unused_mem_queue);
	TAILQ_INIT(&hp->alloc_mem_queue);

	hp->mem[0].size = hp->bar.size;
	hp->mem[0].addr = hp->bar.start;

	TAILQ_INSERT_TAIL(&hp->free_mem_queue, &hp->mem[0], tailq);

	for (mem_id = 1; mem_id < ADDR_ELEM_COUNT; ++mem_id) {
		TAILQ_INSERT_TAIL(&hp->unused_mem_queue, &hp->mem[mem_id], tailq);
	}

	SPDK_INFOLOG(vmd, "%s: mem_base:mem_limit = %x : %x\n", __func__,
		     bus->self->header->one.mem_base, bus->self->header->one.mem_limit);
}

static bool
vmd_bus_device_present(struct vmd_pci_bus *bus, uint32_t devfn)
{
	volatile struct pci_header *header;

	header = (volatile struct pci_header *)(bus->vmd->cfg_vaddr +
						CONFIG_OFFSET_ADDR(bus->config_bus_number, devfn, 0, 0));
	if (!vmd_is_valid_cfg_addr(bus, (uint64_t)header)) {
		return false;
	}

	if (header->common.vendor_id == PCI_INVALID_VENDORID || header->common.vendor_id == 0x0) {
		return false;
	}

	return true;
}
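
/*
 * Note: CONFIG_OFFSET_ADDR (defined in vmd_internal.h) locates a function's
 * config space inside the VMD config BAR.  Assuming a conventional
 * ECAM-style layout, the offset would be computed as
 * (bus << 20) | (device << 15) | (function << 12), i.e. one 4 KB config
 * window per function; the exact encoding is whatever the macro defines.
 */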

static struct vmd_pci_device *
vmd_alloc_dev(struct vmd_pci_bus *bus, uint32_t devfn)
{
	struct vmd_pci_device *dev = NULL;
	volatile struct pci_header *header;
	uint8_t header_type;
	uint32_t rev_class;

	/* Make sure we're not creating two devices on the same dev/fn */
	TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
		if (dev->devfn == devfn) {
			return NULL;
		}
	}

	if (!vmd_bus_device_present(bus, devfn)) {
		return NULL;
	}

	header = (volatile struct pci_header *)(bus->vmd->cfg_vaddr +
						CONFIG_OFFSET_ADDR(bus->config_bus_number, devfn, 0, 0));

	SPDK_INFOLOG(vmd, "PCI device found: %04x:%04x ***\n",
		     header->common.vendor_id, header->common.device_id);

	dev = calloc(1, sizeof(*dev));
	if (!dev) {
		return NULL;
	}

	dev->header = header;
	dev->vid = dev->header->common.vendor_id;
	dev->did = dev->header->common.device_id;
	dev->bus = bus;
	dev->parent = bus;
	dev->devfn = devfn;
	header_type = dev->header->common.header_type;
	rev_class = dev->header->common.rev_class;
	dev->class = rev_class >> 8;
	dev->header_type = header_type & 0x7;

	if (header_type == PCI_HEADER_TYPE_BRIDGE) {
		vmd_update_scan_info(dev);
		if (!dev->bus->vmd->scan_completed) {
			vmd_reset_base_limit_registers(dev->header);
		}
	}

	vmd_read_config_space(dev);

	return dev;
}

static struct vmd_pci_bus *
vmd_create_new_bus(struct vmd_pci_bus *parent, struct vmd_pci_device *bridge, uint8_t bus_number)
{
	struct vmd_pci_bus *new_bus;

	new_bus = calloc(1, sizeof(*new_bus));
	if (!new_bus) {
		return NULL;
	}

	new_bus->parent = parent;
	new_bus->domain = parent->domain;
	new_bus->bus_number = bus_number;
	new_bus->secondary_bus = new_bus->subordinate_bus = bus_number;
	new_bus->self = bridge;
	new_bus->vmd = parent->vmd;
	new_bus->config_bus_number = new_bus->bus_number - new_bus->vmd->vmd_bus.bus_start;
	TAILQ_INIT(&new_bus->dev_list);

	bridge->subordinate = new_bus;

	bridge->pci.addr.bus = new_bus->bus_number;
	bridge->pci.addr.dev = bridge->devfn;
	bridge->pci.addr.func = 0;
	bridge->pci.addr.domain = parent->vmd->pci->addr.domain;

	return new_bus;
}

static uint8_t
vmd_get_next_bus_number(struct vmd_adapter *vmd)
{
	uint8_t bus = 0xff;

	if ((vmd->next_bus_number + 1) < vmd->max_pci_bus) {
		bus = vmd->next_bus_number;
		vmd->next_bus_number++;
	}

	return bus;
}

static uint8_t
vmd_get_hotplug_bus_numbers(struct vmd_pci_device *dev)
{
	uint8_t bus_number = 0xff;

	if (dev && dev->bus && dev->bus->vmd &&
	    ((dev->bus->vmd->next_bus_number + RESERVED_HOTPLUG_BUSES) < dev->bus->vmd->max_pci_bus)) {
		bus_number = RESERVED_HOTPLUG_BUSES;
		dev->bus->vmd->next_bus_number += RESERVED_HOTPLUG_BUSES;
	}

	return bus_number;
}

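/*
 * MSI-X Message Control bits used below: bit 15 is MSI-X Enable and bit 14
 * is Function Mask.  vmd_enable_msix() masks the function first, flips the
 * enable bit, then clears the mask, so no vector can fire while the table
 * is in an intermediate state.  Every write is followed by a read-back
 * because config writes are posted.
 */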
static void
vmd_enable_msix(struct vmd_pci_device *dev)
{
	volatile uint16_t control;

	control = dev->msix_cap->message_control.as_uint16_t | (1 << 14);
	dev->msix_cap->message_control.as_uint16_t = control;
	control = dev->msix_cap->message_control.as_uint16_t;
	dev->msix_cap->message_control.as_uint16_t = (control | (1 << 15));
	control = dev->msix_cap->message_control.as_uint16_t;
	control = control & ~(1 << 14);
	dev->msix_cap->message_control.as_uint16_t = control;
	control = dev->msix_cap->message_control.as_uint16_t;
}

static void
vmd_disable_msix(struct vmd_pci_device *dev)
{
	volatile uint16_t control;

	control = dev->msix_cap->message_control.as_uint16_t | (1 << 14);
	dev->msix_cap->message_control.as_uint16_t = control;
	control = dev->msix_cap->message_control.as_uint16_t & ~(1 << 15);
	dev->msix_cap->message_control.as_uint16_t = control;
	control = dev->msix_cap->message_control.as_uint16_t;
}

/*
 * Set up MSI-X table entries for the port. Vmd MSIX vector 0 is used for
 * port interrupt, so vector 0 is mapped to all MSIX entries for the port.
 */
static void
vmd_setup_msix(struct vmd_pci_device *dev, volatile struct pci_msix_table_entry *vmdEntry)
{
	int entry;

	if (!dev || !vmdEntry || !dev->msix_cap) {
		return;
	}

	vmd_disable_msix(dev);
	if (dev->msix_table == NULL || dev->msix_table_size > MAX_MSIX_TABLE_SIZE) {
		return;
	}

	for (entry = 0; entry < dev->msix_table_size; ++entry) {
		dev->msix_table[entry].vector_control = 1;
	}
	vmd_enable_msix(dev);
}

static void
vmd_bus_update_bridge_info(struct vmd_pci_device *bridge)
{
	/* Update the subordinate bus of all bridges above this bridge */
	volatile struct vmd_pci_device *dev = bridge;
	uint8_t subordinate_bus;

	if (!dev) {
		return;
	}
	subordinate_bus = bridge->header->one.subordinate;
	while (dev->parent_bridge != NULL) {
		dev = dev->parent_bridge;
		if (dev->header->one.subordinate < subordinate_bus) {
			dev->header->one.subordinate = subordinate_bus;
			subordinate_bus = dev->header->one.subordinate;
		}
	}
}

static bool
vmd_is_supported_device(struct vmd_pci_device *dev)
{
	return dev->class == PCI_CLASS_STORAGE_EXPRESS;
}

static int
vmd_dev_map_bar(struct spdk_pci_device *pci_dev, uint32_t bar,
		void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	struct vmd_pci_device *dev = SPDK_CONTAINEROF(pci_dev, struct vmd_pci_device, pci);

	*size = dev->bar[bar].size;
	*phys_addr = dev->bar[bar].start;
	*mapped_addr = (void *)dev->bar[bar].vaddr;

	return 0;
}

static int
vmd_dev_unmap_bar(struct spdk_pci_device *_dev, uint32_t bar, void *addr)
{
	return 0;
}

static int
vmd_dev_cfg_read(struct spdk_pci_device *_dev, void *value, uint32_t len,
		 uint32_t offset)
{
	struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci);
	volatile uint8_t *src = (volatile uint8_t *)dev->header;
	uint8_t *dst = value;
	size_t i;

	if (len + offset > PCI_MAX_CFG_SIZE) {
		return -1;
	}

	for (i = 0; i < len; ++i) {
		dst[i] = src[offset + i];
	}

	return 0;
}

static int
vmd_dev_cfg_write(struct spdk_pci_device *_dev, void *value,
		  uint32_t len, uint32_t offset)
{
	struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci);
	volatile uint8_t *dst = (volatile uint8_t *)dev->header;
	uint8_t *src = value;
	size_t i;

	if ((len + offset) > PCI_MAX_CFG_SIZE) {
		return -1;
	}

	for (i = 0; i < len; ++i) {
		dst[offset + i] = src[i];
	}

	return 0;
}

static void
vmd_dev_free(struct vmd_pci_device *dev)
{
	struct vmd_pci_device *bus_device = dev->bus->self;
	size_t i, num_bars = dev->header_type ? 2 : 6;

	/* Release the hotplug region if the device is under a hotplug-capable bus */
	if (bus_device && bus_device->hotplug_capable) {
		for (i = 0; i < num_bars; ++i) {
			if (dev->bar[i].start != 0) {
				vmd_hotplug_free_addr(&bus_device->hp, dev->bar[i].start);
			}
		}
	}

	free(dev);
}

static void
vmd_dev_detach(struct spdk_pci_device *dev)
{
	struct vmd_pci_device *vmd_device = (struct vmd_pci_device *)dev;
	struct vmd_pci_bus *bus = vmd_device->bus;

	spdk_pci_unhook_device(dev);
	TAILQ_REMOVE(&bus->dev_list, vmd_device, tailq);

	vmd_dev_free(vmd_device);
}

static void
vmd_dev_init(struct vmd_pci_device *dev)
{
	dev->pci.addr.domain = dev->bus->vmd->domain;
	dev->pci.addr.bus = dev->bus->bus_number;
	dev->pci.addr.dev = dev->devfn;
	dev->pci.addr.func = 0;
	dev->pci.socket_id = spdk_pci_device_get_socket_id(dev->bus->vmd->pci);
	dev->pci.id.vendor_id = dev->header->common.vendor_id;
	dev->pci.id.device_id = dev->header->common.device_id;
	dev->pci.type = "vmd";
	dev->pci.map_bar = vmd_dev_map_bar;
	dev->pci.unmap_bar = vmd_dev_unmap_bar;
	dev->pci.cfg_read = vmd_dev_cfg_read;
	dev->pci.cfg_write = vmd_dev_cfg_write;
	dev->hotplug_capable = false;
	if (dev->pcie_cap != NULL) {
		dev->cached_slot_control = dev->pcie_cap->slot_control;
	}
}

static int
vmd_init_end_device(struct vmd_pci_device *dev)
{
	struct vmd_pci_bus *bus = dev->bus;
	struct vmd_adapter *vmd;
	struct spdk_pci_driver *driver;
	char bdf[32];
	int rc;

	if (!vmd_assign_base_addrs(dev)) {
		SPDK_ERRLOG("Failed to allocate BARs for device: %p\n", dev);
		return -1;
	}

	vmd_setup_msix(dev, &bus->vmd->msix_table[0]);
	vmd_dev_init(dev);

	if (vmd_is_supported_device(dev)) {
		spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->pci.addr);
		SPDK_INFOLOG(vmd, "Initializing NVMe device at %s\n", bdf);
		dev->pci.parent = dev->bus->vmd->pci;

		driver = spdk_pci_nvme_get_driver();
		assert(driver != NULL);
		rc = spdk_pci_hook_device(driver, &dev->pci);
		if (rc != 0) {
			SPDK_ERRLOG("Failed to hook device %s: %s\n", bdf, spdk_strerror(-rc));
			return -1;
		}

		vmd = bus->vmd;
		vmd->target[vmd->nvme_count] = dev;
		vmd->nvme_count++;
	}

	/* Attach the device to the current bus */
	TAILQ_INSERT_TAIL(&bus->dev_list, dev, tailq);
	g_end_device_count++;

	return 0;
}

/*
 * Scans a single bus for all devices attached and returns a count of how
 * many devices were found. In the VMD topology, it is assumed there are no
 * multi-function devices. Hence a bus (bridge) will not have a multi-function
 * device with both type 0 and type 1 headers.
 *
 * Another option for implementing this function is to make the bus an int and
 * create a new device type, PciBridge. PciBridge would inherit from PciDevice
 * with extra fields: sub/pri/sec bus. The input becomes PciPort, bus number
 * and parent_bridge.
 *
 * The bus number is scanned and, if a device is found, either a PciBridge
 * (type 1 header) or a PciDevice (type 0 header) is created based on the
 * header_type.
 *
 * If a PciBridge, assign bus numbers and rescan the new bus. The PciBridge
 * currently being scanned becomes the passed-in parent_bridge with the new
 * bus number.
 *
 * The linked list becomes a list of PciBridges with PciDevices attached.
 *
 * Returns a count of how many devices were found (type 1 + type 0 header devices).
 */
static uint8_t
vmd_scan_single_bus(struct vmd_pci_bus *bus, struct vmd_pci_device *parent_bridge, bool hotplug)
{
	/* assuming only single function devices are on the bus */
	struct vmd_pci_device *new_dev;
	union express_slot_capabilities_register slot_cap;
	struct vmd_pci_bus *new_bus;
	uint8_t device_number, dev_cnt = 0;
	uint8_t new_bus_num;
	int rc;

	for (device_number = 0; device_number < 32; device_number++) {
		new_dev = vmd_alloc_dev(bus, device_number);
		if (new_dev == NULL) {
			continue;
		}

		if (new_dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) {
			if (hotplug) {
				free(new_dev);
				continue;
			}

			slot_cap.as_uint32_t = 0;
			if (new_dev->pcie_cap != NULL) {
				slot_cap.as_uint32_t = new_dev->pcie_cap->slot_cap.as_uint32_t;
			}

			new_bus_num = vmd_get_next_bus_number(bus->vmd);
			if (new_bus_num == 0xff) {
				vmd_dev_free(new_dev);
				return dev_cnt;
			}
			new_bus = vmd_create_new_bus(bus, new_dev, new_bus_num);
			if (!new_bus) {
				vmd_dev_free(new_dev);
				return dev_cnt;
			}
			new_bus->primary_bus = bus->secondary_bus;
			new_bus->self = new_dev;
			new_dev->bus_object = new_bus;

			if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL &&
			    new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
				new_bus->hotplug_buses = vmd_get_hotplug_bus_numbers(new_dev);
				new_bus->subordinate_bus += new_bus->hotplug_buses;

				/* Attach hot plug instance if HP is supported */
				/* Hot inserted SSDs can be assigned port bus of sub-ordinate + 1 */
				SPDK_INFOLOG(vmd, "hotplug_capable/slot_implemented = "
					     "%x:%x\n", slot_cap.bit_field.hotplug_capable,
					     new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented);
			}

			new_dev->parent_bridge = parent_bridge;
			new_dev->header->one.primary = new_bus->primary_bus;
			new_dev->header->one.secondary = new_bus->secondary_bus;
			new_dev->header->one.subordinate = new_bus->subordinate_bus;

			vmd_bus_update_bridge_info(new_dev);
			TAILQ_INSERT_TAIL(&bus->vmd->bus_list, new_bus, tailq);

			vmd_dev_init(new_dev);
			dev_cnt++;

			if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL &&
			    new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
				vmd_init_hotplug(new_dev, new_bus);
			}

			dev_cnt += vmd_scan_single_bus(new_bus, new_dev, hotplug);
			if (new_dev->pcie_cap != NULL) {
				if (new_dev->pcie_cap->express_cap_register.bit_field.device_type == SwitchUpstreamPort) {
					return dev_cnt;
				}
			}
		} else {
			rc = vmd_init_end_device(new_dev);
			if (rc != 0) {
				vmd_dev_free(new_dev);
			} else {
				dev_cnt++;
			}
		}
	}

	return dev_cnt;
}

static void
vmd_print_pci_info(struct vmd_pci_device *dev)
{
	if (!dev) {
		return;
	}

	if (dev->pcie_cap != NULL) {
		SPDK_INFOLOG(vmd, "PCI DEVICE: [%04X:%04X] type(%x) : %s\n",
			     dev->header->common.vendor_id, dev->header->common.device_id,
			     dev->pcie_cap->express_cap_register.bit_field.device_type,
			     device_type[dev->pcie_cap->express_cap_register.bit_field.device_type]);
	} else {
		SPDK_INFOLOG(vmd, "PCI DEVICE: [%04X:%04X]\n",
			     dev->header->common.vendor_id, dev->header->common.device_id);
	}

	SPDK_INFOLOG(vmd, "\tDOMAIN:BDF: %04x:%02x:%02x:%x\n", dev->pci.addr.domain,
		     dev->pci.addr.bus, dev->pci.addr.dev, dev->pci.addr.func);

	if (!(dev->header_type & PCI_HEADER_TYPE_BRIDGE) && dev->bus) {
		SPDK_INFOLOG(vmd, "\tbase addr: %x : %p\n",
			     dev->header->zero.BAR[0], (void *)dev->bar[0].vaddr);
	}

	if ((dev->header_type & PCI_HEADER_TYPE_BRIDGE)) {
		SPDK_INFOLOG(vmd, "\tPrimary = %d, Secondary = %d, Subordinate = %d\n",
			     dev->header->one.primary, dev->header->one.secondary, dev->header->one.subordinate);
		if (dev->pcie_cap && dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
			SPDK_INFOLOG(vmd, "\tSlot implemented on this device.\n");
			if (dev->pcie_cap->slot_cap.bit_field.hotplug_capable) {
				SPDK_INFOLOG(vmd, "Device has HOT-PLUG capable slot.\n");
			}
		}
	}

	if (dev->sn_cap != NULL) {
		uint8_t *snLow = (uint8_t *)&dev->sn_cap->sn_low;
		uint8_t *snHi = (uint8_t *)&dev->sn_cap->sn_hi;

		SPDK_INFOLOG(vmd, "\tSN: %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x\n",
			     snHi[3], snHi[2], snHi[1], snHi[0], snLow[3], snLow[2], snLow[1], snLow[0]);
	}
}

static void
vmd_cache_scan_info(struct vmd_pci_device *dev)
{
	uint32_t reg __attribute__((unused));

	if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
		return;
	}

	SPDK_INFOLOG(vmd, "vendor/device id:%x:%x\n", dev->header->common.vendor_id,
		     dev->header->common.device_id);

	if (vmd_device_is_root_port(dev->header)) {
		dev->header->one.prefetch_base_upper = VMD_UPPER_BASE_SIGNATURE;
		reg = dev->header->one.prefetch_base_upper;
		dev->header->one.prefetch_limit_upper = VMD_UPPER_LIMIT_SIGNATURE;
		reg = dev->header->one.prefetch_limit_upper;

		SPDK_INFOLOG(vmd, "prefetch: %x:%x\n",
			     dev->header->one.prefetch_base_upper,
			     dev->header->one.prefetch_limit_upper);
	}
}

static void
vmd_reset_root_ports(struct vmd_pci_bus *bus)
{
	volatile struct pci_header *header;
	uint32_t devfn;

	/*
	 * The root ports might have been configured by some other driver (e.g. Linux kernel) prior
	 * to loading the SPDK one, so we need to clear it.  We need to do it before starting the
	 * scanning process, as it's depth-first, so when initial root ports are scanned, the
	 * latter ones might still be using stale configuration.  This can lead to two bridges
	 * having the same secondary/subordinate bus configuration, which, of course, isn't correct.
	 * (Note: this fixed issue #2413.)
	 */
	for (devfn = 0; devfn < 32; ++devfn) {
		if (!vmd_bus_device_present(bus, devfn)) {
			continue;
		}

		header = (volatile void *)(bus->vmd->cfg_vaddr +
					   CONFIG_OFFSET_ADDR(bus->config_bus_number, devfn, 0, 0));
		if (vmd_device_is_root_port(header) && !vmd_device_is_enumerated(header)) {
			vmd_reset_base_limit_registers(header);
		}
	}
}

static uint8_t
vmd_scan_pcibus(struct vmd_pci_bus *bus)
{
	struct vmd_pci_bus *bus_entry;
	struct vmd_pci_device *dev;
	uint8_t dev_cnt;

	vmd_reset_root_ports(bus);

	g_end_device_count = 0;
	TAILQ_INSERT_TAIL(&bus->vmd->bus_list, bus, tailq);
	bus->vmd->next_bus_number = bus->bus_number + 1;
	dev_cnt = vmd_scan_single_bus(bus, NULL, false);

	SPDK_INFOLOG(vmd, "VMD scan found %u devices\n", dev_cnt);
	SPDK_INFOLOG(vmd, "VMD scan found %u END DEVICES\n", g_end_device_count);

	SPDK_INFOLOG(vmd, "PCIe devices attached to VMD %04x:%02x:%02x:%x...\n",
		     bus->vmd->pci->addr.domain, bus->vmd->pci->addr.bus,
		     bus->vmd->pci->addr.dev, bus->vmd->pci->addr.func);

	TAILQ_FOREACH(bus_entry, &bus->vmd->bus_list, tailq) {
		if (bus_entry->self != NULL) {
			vmd_print_pci_info(bus_entry->self);
			vmd_cache_scan_info(bus_entry->self);
		}

		TAILQ_FOREACH(dev, &bus_entry->dev_list, tailq) {
			vmd_print_pci_info(dev);
		}
	}

	return dev_cnt;
}

static int
vmd_domain_map_bar(struct vmd_adapter *vmd, uint32_t bar,
		   void **vaddr, uint64_t *paddr, uint64_t *size)
{
	uint64_t unused;
	int rc;

	rc = spdk_pci_device_map_bar(vmd->pci, bar, vaddr, &unused, size);
	if (rc != 0) {
		return rc;
	}

	/* Depending on the IOVA configuration, the physical address of the BAR returned by
	 * spdk_pci_device_map_bar() can be either an actual physical address or a virtual one (if
	 * IOMMU is enabled).  Since we do need an actual physical address to fill out the
	 * base/limit registers and the BARs of the devices behind the VMD, read the config space to
	 * get the correct address, regardless of IOVA configuration. */
	rc = spdk_pci_device_cfg_read(vmd->pci, paddr, sizeof(*paddr),
				      PCI_BAR0_OFFSET + bar * PCI_BAR_SIZE);
	if (rc != 0) {
		return rc;
	}

	*paddr &= PCI_BAR_MEMORY_ADDR_OFFSET;

	return 0;
}

static int
vmd_domain_map_bars(struct vmd_adapter *vmd)
{
	int rc;

	rc = vmd_domain_map_bar(vmd, 0, (void **)&vmd->cfg_vaddr,
				&vmd->cfgbar, &vmd->cfgbar_size);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to map config bar: %s\n", spdk_strerror(-rc));
		return rc;
	}

	rc = vmd_domain_map_bar(vmd, 2, (void **)&vmd->mem_vaddr,
				&vmd->membar, &vmd->membar_size);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to map memory bar: %s\n", spdk_strerror(-rc));
		return rc;
	}

	vmd->physical_addr = vmd->membar;
	vmd->current_addr_size = vmd->membar_size;

	return 0;
}

static void
vmd_set_starting_bus_number(struct vmd_adapter *vmd, uint8_t *bus_start,
			    uint8_t *max_bus)
{
	uint32_t vmd_cap = 0, vmd_config = 0;
	uint8_t bus_restrict_cap, bus_restrictions;

	spdk_pci_device_cfg_read32(vmd->pci, &vmd_cap, PCI_VMD_VMCAP);
	spdk_pci_device_cfg_read32(vmd->pci, &vmd_config, PCI_VMD_VMCONFIG);

	bus_restrict_cap = vmd_cap & 0x1; /* bit 0 */
	bus_restrictions = (vmd_config >> 8) & 0x3; /* bits 8-9 */
	if ((bus_restrict_cap == 0x1) && (bus_restrictions == 0x1)) {
		*bus_start = 128;
		*max_bus = 255;
	} else {
		*bus_start = 0;
		*max_bus = 127;
	}
}
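
/*
 * Resulting bus ranges (illustrative): with the bus restriction capability
 * present (VMCAP bit 0 == 1) and restrictions set to 0x1 (VMCONFIG bits
 * 8-9), the VMD decodes buses 128-255; in every other case it decodes
 * buses 0-127.
 */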

static int
vmd_enumerate_devices(struct vmd_adapter *vmd)
{
	uint8_t max_bus, bus_start;

	vmd->vmd_bus.vmd = vmd;
	vmd->vmd_bus.domain = vmd->pci->addr.domain;

	if (vmd->pci->id.device_id == PCI_DEVICE_ID_INTEL_VMD_ICX) {
		vmd_set_starting_bus_number(vmd, &bus_start, &max_bus);
		vmd->vmd_bus.bus_start = bus_start;
		vmd->vmd_bus.secondary_bus = vmd->vmd_bus.subordinate_bus = vmd->vmd_bus.bus_start;
		vmd->vmd_bus.primary_bus = vmd->vmd_bus.bus_number = vmd->vmd_bus.bus_start;
		vmd->max_pci_bus = max_bus;
	} else {
		vmd->vmd_bus.bus_start = 0;
		vmd->vmd_bus.secondary_bus = vmd->vmd_bus.subordinate_bus = 0;
		vmd->vmd_bus.primary_bus = vmd->vmd_bus.bus_number = 0;
		vmd->max_pci_bus = PCI_MAX_BUS_NUMBER;
	}

	return vmd_scan_pcibus(&vmd->vmd_bus);
}

struct vmd_pci_device *
vmd_find_device(const struct spdk_pci_addr *addr)
{
	struct vmd_pci_bus *bus;
	struct vmd_pci_device *dev;
	uint32_t i;

	for (i = 0; i < g_vmd_container.count; ++i) {
		TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
			if (bus->self) {
				if (spdk_pci_addr_compare(&bus->self->pci.addr, addr) == 0) {
					return bus->self;
				}
			}

			TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
				if (spdk_pci_addr_compare(&dev->pci.addr, addr) == 0) {
					return dev;
				}
			}
		}
	}

	return NULL;
}

static int
vmd_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
{
	uint32_t cmd_reg = 0;
	char bdf[32] = {0};
	struct vmd_container *vmd_c = ctx;
	struct vmd_adapter *vmd = &vmd_c->vmd[vmd_c->count];

	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
	cmd_reg |= 0x6;                      /* PCI bus master/memory enable. */
	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);

	spdk_pci_addr_fmt(bdf, sizeof(bdf), &pci_dev->addr);
	SPDK_INFOLOG(vmd, "Found a VMD[ %d ] at %s\n", vmd_c->count, bdf);

	/* map vmd bars */
	vmd->pci = pci_dev;
	vmd->vmd_index = vmd_c->count;
	vmd->domain = (pci_dev->addr.bus << 16) | (pci_dev->addr.dev << 8) | pci_dev->addr.func;
	TAILQ_INIT(&vmd->bus_list);

	if (vmd_domain_map_bars(vmd) != 0) {
		return -1;
	}

	SPDK_INFOLOG(vmd, "vmd config bar(%p) vaddr(%p) size(%x)\n",
		     (void *)vmd->cfgbar, (void *)vmd->cfg_vaddr,
		     (uint32_t)vmd->cfgbar_size);
	SPDK_INFOLOG(vmd, "vmd mem bar(%p) vaddr(%p) size(%x)\n",
		     (void *)vmd->membar, (void *)vmd->mem_vaddr,
		     (uint32_t)vmd->membar_size);

	vmd_c->count++;
	vmd_enumerate_devices(vmd);

	return 0;
}

int
spdk_vmd_pci_device_list(struct spdk_pci_addr vmd_addr, struct spdk_pci_device *nvme_list)
{
	int cnt = 0;
	struct vmd_pci_bus *bus;
	struct vmd_pci_device *dev;
	uint32_t i;

	if (!nvme_list) {
		return -1;
	}

	for (i = 0; i < g_vmd_container.count; ++i) {
		if (spdk_pci_addr_compare(&vmd_addr, &g_vmd_container.vmd[i].pci->addr) == 0) {
			TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
				TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
					nvme_list[cnt++] = dev->pci;
					if (!dev->is_hooked) {
						vmd_dev_init(dev);
						dev->is_hooked = 1;
					}
				}
			}
		}
	}

	return cnt;
}

static void
vmd_clear_hotplug_status(struct vmd_pci_bus *bus)
{
	struct vmd_pci_device *device = bus->self;
	uint16_t status __attribute__((unused));

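	/*
	 * The slot and link status registers are RW1C (write-1-to-clear):
	 * writing back the value just read clears the event bits that were
	 * set, and the trailing read flushes the posted config write.
	 */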
	status = device->pcie_cap->slot_status.as_uint16_t;
	device->pcie_cap->slot_status.as_uint16_t = status;
	status = device->pcie_cap->slot_status.as_uint16_t;

	status = device->pcie_cap->link_status.as_uint16_t;
	device->pcie_cap->link_status.as_uint16_t = status;
	status = device->pcie_cap->link_status.as_uint16_t;
}

static void
vmd_bus_handle_hotplug(struct vmd_pci_bus *bus)
{
	uint8_t num_devices, sleep_count;

	for (sleep_count = 0; sleep_count < 20; ++sleep_count) {
		/* Scan until a new device is found */
		num_devices = vmd_scan_single_bus(bus, bus->self, true);
		if (num_devices > 0) {
			break;
		}

		spdk_delay_us(200000);
	}

	if (num_devices == 0) {
		SPDK_ERRLOG("Timed out while scanning for hotplugged devices\n");
	}
}

static void
vmd_remove_device(struct vmd_pci_device *device)
{
	device->pci.internal.pending_removal = true;

	/* If the device isn't attached, remove it immediately */
	if (!device->pci.internal.attached) {
		vmd_dev_detach(&device->pci);
	}
}

static void
vmd_bus_handle_hotremove(struct vmd_pci_bus *bus)
{
	struct vmd_pci_device *device, *tmpdev;

	TAILQ_FOREACH_SAFE(device, &bus->dev_list, tailq, tmpdev) {
		if (!vmd_bus_device_present(bus, device->devfn)) {
			vmd_remove_device(device);
		}
	}
}

int
spdk_vmd_hotplug_monitor(void)
{
	struct vmd_pci_bus *bus;
	struct vmd_pci_device *device;
	int num_hotplugs = 0;
	uint32_t i;

	for (i = 0; i < g_vmd_container.count; ++i) {
		TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
			device = bus->self;
			if (device == NULL || !device->hotplug_capable) {
				continue;
			}

			if (device->pcie_cap->slot_status.bit_field.datalink_state_changed != 1) {
				continue;
			}

			if (device->pcie_cap->link_status.bit_field.datalink_layer_active == 1) {
				SPDK_INFOLOG(vmd, "Device hotplug detected on bus "
					     "%"PRIu32"\n", bus->bus_number);
				vmd_bus_handle_hotplug(bus);
			} else {
				SPDK_INFOLOG(vmd, "Device hotremove detected on bus "
					     "%"PRIu32"\n", bus->bus_number);
				vmd_bus_handle_hotremove(bus);
			}

			vmd_clear_hotplug_status(bus);
			num_hotplugs++;
		}
	}

	return num_hotplugs;
}

int
spdk_vmd_remove_device(const struct spdk_pci_addr *addr)
{
	struct vmd_pci_device *device;

	device = vmd_find_device(addr);
	if (device == NULL) {
		return -ENODEV;
	}

	assert(strcmp(spdk_pci_device_get_type(&device->pci), "vmd") == 0);
	vmd_remove_device(device);

	return 0;
}

int
spdk_vmd_rescan(void)
{
	struct vmd_pci_bus *bus;
	uint32_t i;
	int rc = 0;

	for (i = 0; i < g_vmd_container.count; ++i) {
		TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
			rc += vmd_scan_single_bus(bus, bus->self, true);
		}
	}

	return rc;
}

static int
vmd_attach_device(const struct spdk_pci_addr *addr)
{
	struct vmd_pci_bus *bus;
	struct vmd_adapter *vmd;
	struct vmd_pci_device *dev;
	uint32_t i;
	int rc;

	/* VMD always sets function to zero */
	if (addr->func != 0) {
		return -ENODEV;
	}

	for (i = 0; i < g_vmd_container.count; ++i) {
		vmd = &g_vmd_container.vmd[i];
		if (vmd->domain != addr->domain) {
			continue;
		}

		TAILQ_FOREACH(bus, &vmd->bus_list, tailq) {
			if (bus->bus_number != addr->bus) {
				continue;
			}

			dev = vmd_alloc_dev(bus, addr->dev);
			if (dev == NULL) {
				return -ENODEV;
			}

			/* Only allow attaching endpoint devices */
			if (dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) {
				free(dev);
				return -ENODEV;
			}

			rc = vmd_init_end_device(dev);
			if (rc != 0) {
				free(dev);
				return -ENODEV;
			}

			return 0;
		}
	}

	return -ENODEV;
}

static void
vmd_detach_device(struct spdk_pci_device *pci_dev)
{
	struct vmd_pci_device *dev = SPDK_CONTAINEROF(pci_dev, struct vmd_pci_device, pci);

	assert(strcmp(spdk_pci_device_get_type(pci_dev), "vmd") == 0);
	assert(vmd_find_device(&pci_dev->addr) != NULL);

	vmd_remove_device(dev);
}

static struct spdk_pci_device_provider g_vmd_device_provider = {
	.name = "vmd",
	.attach_cb = vmd_attach_device,
	.detach_cb = vmd_detach_device,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(vmd, &g_vmd_device_provider);

int
spdk_vmd_init(void)
{
	return spdk_pci_enumerate(spdk_pci_vmd_get_driver(), vmd_enum_cb, &g_vmd_container);
}

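/*
 * Typical usage (illustrative sketch, not part of this driver): an
 * application enumerates VMD domains once at startup, probes the NVMe
 * devices behind them, and periodically polls for hotplug events:
 *
 *	if (spdk_vmd_init() == 0) {
 *		... probe NVMe devices, e.g. via spdk_nvme_probe() ...
 *		for (;;) {
 *			spdk_vmd_hotplug_monitor();
 *			... service I/O, check for shutdown ...
 *		}
 *		spdk_vmd_fini();
 *	}
 */
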
void
spdk_vmd_fini(void)
{
	uint32_t i;

	for (i = 0; i < g_vmd_container.count; ++i) {
		spdk_pci_device_detach(g_vmd_container.vmd[i].pci);
	}
}

SPDK_LOG_REGISTER_COMPONENT(vmd)