xref: /spdk/lib/env_dpdk/pci.c (revision 8bb0ded3e55c182cea67af1f6790f8de5f38c05f)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "env_internal.h"
35 
36 #include <rte_alarm.h>
37 #include <rte_devargs.h>
38 #include "spdk/env.h"
39 #include "spdk/log.h"
40 
/* sysfs path of the PCI drivers directory. */
#define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"

/* Compatibility for DPDK versions < 20.11: map the allow/block identifiers
 * used in this file onto the older whitelist/blacklist names.
 */
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
#define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
#define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
#define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
#endif

/* Offset at which spdk_pci_device_get_serial_number() starts walking the
 * extended capability list.
 */
#define PCI_CFG_SIZE		256
/* Extended capability ID matched when looking up the device serial number. */
#define PCI_EXT_CAP_ID_SN	0x03

/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
 * might cause the internal IPC to misbehave. Just retry in such a case.
 */
#define DPDK_HOTPLUG_RETRY_COUNT 4
57 
/* Taken around accesses to the device lists below and to the per-device
 * internal.attached / pending_removal / removed flags. Presumably needed
 * because the probe/remove and hot-remove callbacks in this file may run on
 * the DPDK alarm/interrupt thread - confirm against DPDK.
 */
static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
/* devices known to SPDK; populated from g_pci_hotplugged_devices by
 * cleanup_pci_devices() and directly by spdk_pci_hook_device()
 */
static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
/* devices hotplugged on a dpdk thread */
static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
/* drivers added through spdk_pci_driver_register() */
static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
65 
66 static int
67 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
68 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
69 {
70 	struct rte_pci_device *dev = device->dev_handle;
71 
72 	*mapped_addr = dev->mem_resource[bar].addr;
73 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
74 	*size = (uint64_t)dev->mem_resource[bar].len;
75 
76 	return 0;
77 }
78 
/*
 * unmap_bar callback for DPDK-backed devices.
 *
 * Nothing to undo, since map_bar_rte() only reports existing values.
 * Always returns 0.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	const int rc = 0;

	return rc;
}
84 
85 static int
86 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
87 {
88 	int rc;
89 
90 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
91 
92 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
93 }
94 
95 static int
96 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
97 {
98 	int rc;
99 
100 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
101 
102 #ifdef __FreeBSD__
103 	/* DPDK returns 0 on success and -1 on failure */
104 	return rc;
105 #endif
106 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
107 }
108 
109 static void
110 remove_rte_dev(struct rte_pci_device *rte_dev)
111 {
112 	char bdf[32];
113 	int i = 0, rc;
114 
115 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
116 	do {
117 		rc = rte_eal_hotplug_remove("pci", bdf);
118 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
119 }
120 
/* Alarm callback scheduled by detach_rte(); _dev is the rte_pci_device to remove. */
static void
detach_rte_cb(void *_dev)
{
	struct rte_pci_device *rte_dev = _dev;

	remove_rte_dev(rte_dev);
}
126 
/*
 * Detach a DPDK-backed device.
 *
 * In a non-primary process the device is removed from DPDK directly.
 * In the primary process the removal is scheduled through
 * rte_eal_alarm_set() and this function polls internal.removed (set by
 * pci_device_fini()) until it becomes true or the wait expires. A timeout
 * is logged but not reported to the caller.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	if (!spdk_process_is_primary()) {
		remove_rte_dev(rte_dev);
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	/* pci_device_fini() refuses to remove a device that is still attached */
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* run the actual removal from the DPDK alarm callback */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute (2000 polls, 1000us apart) */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    rte_dev->name);
		/* If we reach this state, then the device couldn't be removed and most
		 * likely a subsequent hot add of a device in the same BDF will fail.
		 */
	}
}
181 
182 void
183 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
184 {
185 	struct spdk_pci_driver *driver;
186 
187 	driver = calloc(1, sizeof(*driver));
188 	if (!driver) {
189 		/* we can't do any better than bailing atm */
190 		return;
191 	}
192 
193 	driver->name = name;
194 	driver->id_table = id_table;
195 	driver->drv_flags = flags;
196 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
197 }
198 
/* Look up the driver registered under the name "nvme"; NULL if there is none. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	struct spdk_pci_driver *nvme_driver = spdk_pci_get_driver("nvme");

	return nvme_driver;
}
204 
205 struct spdk_pci_driver *
206 spdk_pci_get_driver(const char *name)
207 {
208 	struct spdk_pci_driver *driver;
209 
210 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
211 		if (strcmp(driver->name, name) == 0) {
212 			return driver;
213 		}
214 	}
215 
216 	return NULL;
217 }
218 
219 static void
220 pci_device_rte_dev_event(const char *device_name,
221 			 enum rte_dev_event_type event,
222 			 void *cb_arg)
223 {
224 	struct spdk_pci_device *dev;
225 	bool can_detach = false;
226 
227 	switch (event) {
228 	default:
229 	case RTE_DEV_EVENT_ADD:
230 		/* Nothing to do here yet. */
231 		break;
232 	case RTE_DEV_EVENT_REMOVE:
233 		pthread_mutex_lock(&g_pci_mutex);
234 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
235 			struct rte_pci_device *rte_dev = dev->dev_handle;
236 
237 			if (strcmp(rte_dev->name, device_name) == 0 &&
238 			    !dev->internal.pending_removal) {
239 				can_detach = !dev->internal.attached;
240 				/* prevent any further attaches */
241 				dev->internal.pending_removal = true;
242 				break;
243 			}
244 		}
245 		pthread_mutex_unlock(&g_pci_mutex);
246 
247 		if (dev != NULL && can_detach) {
248 			/* if device is not attached we can remove it right away.
249 			* Otherwise it will be removed at detach. */
250 			remove_rte_dev(dev->dev_handle);
251 		}
252 		break;
253 	}
254 }
255 
256 static void
257 cleanup_pci_devices(void)
258 {
259 	struct spdk_pci_device *dev, *tmp;
260 
261 	pthread_mutex_lock(&g_pci_mutex);
262 	/* cleanup removed devices */
263 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
264 		if (!dev->internal.removed) {
265 			continue;
266 		}
267 
268 		vtophys_pci_device_removed(dev->dev_handle);
269 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
270 		free(dev);
271 	}
272 
273 	/* add newly-attached devices */
274 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
275 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
276 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
277 		vtophys_pci_device_added(dev->dev_handle);
278 	}
279 	pthread_mutex_unlock(&g_pci_mutex);
280 }
281 
/* Forward declaration: _pci_env_init() calls scan_pci_bus() before its definition. */
static int scan_pci_bus(bool delay_init);
283 
284 /* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */
285 static int
286 register_rte_driver(struct spdk_pci_driver *driver)
287 {
288 	unsigned pci_id_count = 0;
289 	struct rte_pci_id *rte_id_table;
290 	char *rte_name;
291 	size_t rte_name_len;
292 	uint32_t rte_flags;
293 
294 	assert(driver->id_table);
295 	while (driver->id_table[pci_id_count].vendor_id) {
296 		pci_id_count++;
297 	}
298 	assert(pci_id_count > 0);
299 
300 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
301 	if (!rte_id_table) {
302 		return -ENOMEM;
303 	}
304 
305 	while (pci_id_count > 0) {
306 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
307 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
308 
309 		rte_id->class_id = spdk_id->class_id;
310 		rte_id->vendor_id = spdk_id->vendor_id;
311 		rte_id->device_id = spdk_id->device_id;
312 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
313 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
314 		pci_id_count--;
315 	}
316 
317 	assert(driver->name);
318 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
319 	rte_name = calloc(rte_name_len, 1);
320 	if (!rte_name) {
321 		free(rte_id_table);
322 		return -ENOMEM;
323 	}
324 
325 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
326 	driver->driver.driver.name = rte_name;
327 	driver->driver.id_table = rte_id_table;
328 
329 	rte_flags = 0;
330 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
331 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
332 	}
333 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
334 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
335 	}
336 	driver->driver.drv_flags = rte_flags;
337 
338 	driver->driver.probe = pci_device_init;
339 	driver->driver.remove = pci_device_fini;
340 
341 	rte_pci_register(&driver->driver);
342 	return 0;
343 }
344 
345 static inline void
346 _pci_env_init(void)
347 {
348 	/* We assume devices were present on the bus for more than 2 seconds
349 	 * before initializing SPDK and there's no need to wait more. We scan
350 	 * the bus, but we don't block any devices.
351 	 */
352 	scan_pci_bus(false);
353 
354 	/* Register a single hotremove callback for all devices. */
355 	if (spdk_process_is_primary()) {
356 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
357 	}
358 }
359 
360 void
361 pci_env_init(void)
362 {
363 	struct spdk_pci_driver *driver;
364 
365 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
366 		register_rte_driver(driver);
367 	}
368 
369 	_pci_env_init();
370 }
371 
/*
 * Re-run the common initialization. Drivers are not registered with DPDK
 * again because pci_env_init() has already done so.
 */
void
pci_env_reinit(void)
{
	_pci_env_init();
}
381 
382 void
383 pci_env_fini(void)
384 {
385 	struct spdk_pci_device *dev;
386 	char bdf[32];
387 
388 	cleanup_pci_devices();
389 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
390 		if (dev->internal.attached) {
391 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
392 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
393 		}
394 	}
395 
396 	if (spdk_process_is_primary()) {
397 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
398 	}
399 }
400 
401 int
402 pci_device_init(struct rte_pci_driver *_drv,
403 		struct rte_pci_device *_dev)
404 {
405 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
406 	struct spdk_pci_device *dev;
407 	int rc;
408 
409 	dev = calloc(1, sizeof(*dev));
410 	if (dev == NULL) {
411 		return -1;
412 	}
413 
414 	dev->dev_handle = _dev;
415 
416 	dev->addr.domain = _dev->addr.domain;
417 	dev->addr.bus = _dev->addr.bus;
418 	dev->addr.dev = _dev->addr.devid;
419 	dev->addr.func = _dev->addr.function;
420 	dev->id.class_id = _dev->id.class_id;
421 	dev->id.vendor_id = _dev->id.vendor_id;
422 	dev->id.device_id = _dev->id.device_id;
423 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
424 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
425 	dev->socket_id = _dev->device.numa_node;
426 	dev->type = "pci";
427 
428 	dev->map_bar = map_bar_rte;
429 	dev->unmap_bar = unmap_bar_rte;
430 	dev->cfg_read = cfg_read_rte;
431 	dev->cfg_write = cfg_write_rte;
432 
433 	dev->internal.driver = driver;
434 	dev->internal.claim_fd = -1;
435 
436 	if (driver->cb_fn != NULL) {
437 		rc = driver->cb_fn(driver->cb_arg, dev);
438 		if (rc != 0) {
439 			free(dev);
440 			return rc;
441 		}
442 		dev->internal.attached = true;
443 	}
444 
445 	pthread_mutex_lock(&g_pci_mutex);
446 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
447 	pthread_mutex_unlock(&g_pci_mutex);
448 	return 0;
449 }
450 
451 int
452 pci_device_fini(struct rte_pci_device *_dev)
453 {
454 	struct spdk_pci_device *dev;
455 
456 	pthread_mutex_lock(&g_pci_mutex);
457 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
458 		if (dev->dev_handle == _dev) {
459 			break;
460 		}
461 	}
462 
463 	if (dev == NULL || dev->internal.attached) {
464 		/* The device might be still referenced somewhere in SPDK. */
465 		pthread_mutex_unlock(&g_pci_mutex);
466 		return -1;
467 	}
468 
469 	/* remove our allowed_at option */
470 	if (_dev->device.devargs) {
471 		_dev->device.devargs->data = NULL;
472 	}
473 
474 	assert(!dev->internal.removed);
475 	dev->internal.removed = true;
476 	pthread_mutex_unlock(&g_pci_mutex);
477 	return 0;
478 
479 }
480 
481 void
482 spdk_pci_device_detach(struct spdk_pci_device *dev)
483 {
484 	assert(dev->internal.attached);
485 
486 	if (dev->internal.claim_fd >= 0) {
487 		spdk_pci_device_unclaim(dev);
488 	}
489 
490 	if (strcmp(dev->type, "pci") == 0) {
491 		/* if it's a physical device we need to deal with DPDK on
492 		 * a different process and we can't just unset one flag
493 		 * here. We also want to stop using any device resources
494 		 * so that the device isn't "in use" by the userspace driver
495 		 * once we detach it. This would allow attaching the device
496 		 * to a different process, or to a kernel driver like nvme.
497 		 */
498 		detach_rte(dev);
499 	} else {
500 		dev->internal.attached = false;
501 	}
502 
503 	cleanup_pci_devices();
504 }
505 
506 static int
507 scan_pci_bus(bool delay_init)
508 {
509 	struct spdk_pci_driver *driver;
510 	struct rte_pci_device *rte_dev;
511 	uint64_t now;
512 
513 	rte_bus_scan();
514 	now = spdk_get_ticks();
515 
516 	driver = TAILQ_FIRST(&g_pci_drivers);
517 	if (!driver) {
518 		return 0;
519 	}
520 
521 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
522 		struct rte_devargs *da;
523 
524 		da = rte_dev->device.devargs;
525 		if (!da) {
526 			char devargs_str[128];
527 
528 			/* the device was never blocked or allowed */
529 			da = calloc(1, sizeof(*da));
530 			if (!da) {
531 				return -1;
532 			}
533 
534 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
535 			if (rte_devargs_parse(da, devargs_str) != 0) {
536 				free(da);
537 				return -1;
538 			}
539 
540 			rte_devargs_insert(&da);
541 			rte_dev->device.devargs = da;
542 		}
543 
544 		if (da->data) {
545 			uint64_t allowed_at = (uint64_t)(uintptr_t)da->data;
546 
547 			/* this device was seen by spdk before... */
548 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
549 				da->policy = RTE_DEV_ALLOWED;
550 			}
551 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
552 			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
553 			/* override the policy only if not permanently blocked */
554 
555 			if (delay_init) {
556 				da->policy = RTE_DEV_BLOCKED;
557 				da->data = (void *)(now + 2 * spdk_get_ticks_hz());
558 			} else {
559 				da->policy = RTE_DEV_ALLOWED;
560 				da->data = (void *)(uintptr_t)now;
561 			}
562 		}
563 	}
564 
565 	return 0;
566 }
567 
568 int
569 spdk_pci_device_attach(struct spdk_pci_driver *driver,
570 		       spdk_pci_enum_cb enum_cb,
571 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
572 {
573 	struct spdk_pci_device *dev;
574 	struct rte_pci_device *rte_dev;
575 	struct rte_devargs *da;
576 	int rc;
577 	char bdf[32];
578 
579 	spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
580 
581 	cleanup_pci_devices();
582 
583 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
584 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
585 			break;
586 		}
587 	}
588 
589 	if (dev != NULL && dev->internal.driver == driver) {
590 		pthread_mutex_lock(&g_pci_mutex);
591 		if (dev->internal.attached || dev->internal.pending_removal) {
592 			pthread_mutex_unlock(&g_pci_mutex);
593 			return -1;
594 		}
595 
596 		rc = enum_cb(enum_ctx, dev);
597 		if (rc == 0) {
598 			dev->internal.attached = true;
599 		}
600 		pthread_mutex_unlock(&g_pci_mutex);
601 		return rc;
602 	}
603 
604 	driver->cb_fn = enum_cb;
605 	driver->cb_arg = enum_ctx;
606 
607 	int i = 0;
608 
609 	do {
610 		rc = rte_eal_hotplug_add("pci", bdf, "");
611 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
612 
613 	if (i > 1 && rc == -EEXIST) {
614 		/* Even though the previous request timed out, the device
615 		 * was attached successfully.
616 		 */
617 		rc = 0;
618 	}
619 
620 	driver->cb_arg = NULL;
621 	driver->cb_fn = NULL;
622 
623 	cleanup_pci_devices();
624 
625 	if (rc != 0) {
626 		return -1;
627 	}
628 
629 	/* explicit attach ignores the allowlist, so if we blocked this
630 	 * device before let's enable it now - just for clarity.
631 	 */
632 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
633 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
634 			break;
635 		}
636 	}
637 	assert(dev != NULL);
638 
639 	rte_dev = dev->dev_handle;
640 	da = rte_dev->device.devargs;
641 	if (da && da->data) {
642 		da->data = (void *)(uintptr_t)spdk_get_ticks();
643 		da->policy = RTE_DEV_ALLOWED;
644 	}
645 
646 	return 0;
647 }
648 
649 /* Note: You can call spdk_pci_enumerate from more than one thread
650  *       simultaneously safely, but you cannot call spdk_pci_enumerate
651  *       and rte_eal_pci_probe simultaneously.
652  */
653 int
654 spdk_pci_enumerate(struct spdk_pci_driver *driver,
655 		   spdk_pci_enum_cb enum_cb,
656 		   void *enum_ctx)
657 {
658 	struct spdk_pci_device *dev;
659 	int rc;
660 
661 	cleanup_pci_devices();
662 
663 	pthread_mutex_lock(&g_pci_mutex);
664 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
665 		if (dev->internal.attached ||
666 		    dev->internal.driver != driver ||
667 		    dev->internal.pending_removal) {
668 			continue;
669 		}
670 
671 		rc = enum_cb(enum_ctx, dev);
672 		if (rc == 0) {
673 			dev->internal.attached = true;
674 		} else if (rc < 0) {
675 			pthread_mutex_unlock(&g_pci_mutex);
676 			return -1;
677 		}
678 	}
679 	pthread_mutex_unlock(&g_pci_mutex);
680 
681 	if (scan_pci_bus(true) != 0) {
682 		return -1;
683 	}
684 
685 	driver->cb_fn = enum_cb;
686 	driver->cb_arg = enum_ctx;
687 
688 	if (rte_bus_probe() != 0) {
689 		driver->cb_arg = NULL;
690 		driver->cb_fn = NULL;
691 		return -1;
692 	}
693 
694 	driver->cb_arg = NULL;
695 	driver->cb_fn = NULL;
696 
697 	cleanup_pci_devices();
698 	return 0;
699 }
700 
701 struct spdk_pci_device *
702 spdk_pci_get_first_device(void)
703 {
704 	return TAILQ_FIRST(&g_pci_devices);
705 }
706 
707 struct spdk_pci_device *
708 spdk_pci_get_next_device(struct spdk_pci_device *prev)
709 {
710 	return TAILQ_NEXT(prev, internal.tailq);
711 }
712 
713 int
714 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
715 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
716 {
717 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
718 }
719 
720 int
721 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
722 {
723 	return dev->unmap_bar(dev, bar, addr);
724 }
725 
726 uint32_t
727 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
728 {
729 	return dev->addr.domain;
730 }
731 
732 uint8_t
733 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
734 {
735 	return dev->addr.bus;
736 }
737 
738 uint8_t
739 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
740 {
741 	return dev->addr.dev;
742 }
743 
744 uint8_t
745 spdk_pci_device_get_func(struct spdk_pci_device *dev)
746 {
747 	return dev->addr.func;
748 }
749 
750 uint16_t
751 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
752 {
753 	return dev->id.vendor_id;
754 }
755 
756 uint16_t
757 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
758 {
759 	return dev->id.device_id;
760 }
761 
762 uint16_t
763 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
764 {
765 	return dev->id.subvendor_id;
766 }
767 
768 uint16_t
769 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
770 {
771 	return dev->id.subdevice_id;
772 }
773 
774 struct spdk_pci_id
775 spdk_pci_device_get_id(struct spdk_pci_device *dev)
776 {
777 	return dev->id;
778 }
779 
780 int
781 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
782 {
783 	return dev->socket_id;
784 }
785 
786 int
787 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
788 {
789 	return dev->cfg_read(dev, value, len, offset);
790 }
791 
792 int
793 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
794 {
795 	return dev->cfg_write(dev, value, len, offset);
796 }
797 
/* Read one byte of config space at offset into *value. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
803 
/* Write one byte of config space at offset. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
809 
/* Read two bytes of config space at offset into *value. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
815 
/* Write two bytes of config space at offset. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
821 
/* Read four bytes of config space at offset into *value. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
827 
/* Write four bytes of config space at offset. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
833 
834 int
835 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
836 {
837 	int err;
838 	uint32_t pos, header = 0;
839 	uint32_t i, buf[2];
840 
841 	if (len < 17) {
842 		return -1;
843 	}
844 
845 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
846 	if (err || !header) {
847 		return -1;
848 	}
849 
850 	pos = PCI_CFG_SIZE;
851 	while (1) {
852 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
853 			if (pos) {
854 				/* skip the header */
855 				pos += 4;
856 				for (i = 0; i < 2; i++) {
857 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
858 					if (err) {
859 						return -1;
860 					}
861 				}
862 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
863 				return 0;
864 			}
865 		}
866 		pos = (header >> 20) & 0xffc;
867 		/* 0 if no other items exist */
868 		if (pos < PCI_CFG_SIZE) {
869 			return -1;
870 		}
871 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
872 		if (err) {
873 			return -1;
874 		}
875 	}
876 	return -1;
877 }
878 
879 struct spdk_pci_addr
880 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
881 {
882 	return dev->addr;
883 }
884 
885 bool
886 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
887 {
888 	return dev->internal.pending_removal;
889 }
890 
891 int
892 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
893 {
894 	if (a1->domain > a2->domain) {
895 		return 1;
896 	} else if (a1->domain < a2->domain) {
897 		return -1;
898 	} else if (a1->bus > a2->bus) {
899 		return 1;
900 	} else if (a1->bus < a2->bus) {
901 		return -1;
902 	} else if (a1->dev > a2->dev) {
903 		return 1;
904 	} else if (a1->dev < a2->dev) {
905 		return -1;
906 	} else if (a1->func > a2->func) {
907 		return 1;
908 	} else if (a1->func < a2->func) {
909 		return -1;
910 	}
911 
912 	return 0;
913 }
914 
915 #ifdef __linux__
916 int
917 spdk_pci_device_claim(struct spdk_pci_device *dev)
918 {
919 	int dev_fd;
920 	char dev_name[64];
921 	int pid;
922 	void *dev_map;
923 	struct flock pcidev_lock = {
924 		.l_type = F_WRLCK,
925 		.l_whence = SEEK_SET,
926 		.l_start = 0,
927 		.l_len = 0,
928 	};
929 
930 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
931 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
932 
933 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
934 	if (dev_fd == -1) {
935 		SPDK_ERRLOG("could not open %s\n", dev_name);
936 		return -errno;
937 	}
938 
939 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
940 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
941 		close(dev_fd);
942 		return -errno;
943 	}
944 
945 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
946 		       MAP_SHARED, dev_fd, 0);
947 	if (dev_map == MAP_FAILED) {
948 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
949 		close(dev_fd);
950 		return -errno;
951 	}
952 
953 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
954 		pid = *(int *)dev_map;
955 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
956 			    " process %d has claimed it\n", dev_name, pid);
957 		munmap(dev_map, sizeof(int));
958 		close(dev_fd);
959 		/* F_SETLK returns unspecified errnos, normalize them */
960 		return -EACCES;
961 	}
962 
963 	*(int *)dev_map = (int)getpid();
964 	munmap(dev_map, sizeof(int));
965 	dev->internal.claim_fd = dev_fd;
966 	/* Keep dev_fd open to maintain the lock. */
967 	return 0;
968 }
969 
970 void
971 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
972 {
973 	char dev_name[64];
974 
975 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
976 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
977 
978 	close(dev->internal.claim_fd);
979 	dev->internal.claim_fd = -1;
980 	unlink(dev_name);
981 }
982 #endif /* __linux__ */
983 
984 #ifdef __FreeBSD__
/* FreeBSD: claiming is not implemented yet; always reports success. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;

	return 0;
}
991 
/* FreeBSD: unclaiming is not implemented yet; does nothing. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;
}
997 #endif /* __FreeBSD__ */
998 
999 int
1000 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1001 {
1002 	unsigned domain, bus, dev, func;
1003 
1004 	if (addr == NULL || bdf == NULL) {
1005 		return -EINVAL;
1006 	}
1007 
1008 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1009 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1010 		/* Matched a full address - all variables are initialized */
1011 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1012 		func = 0;
1013 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1014 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1015 		domain = 0;
1016 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1017 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1018 		domain = 0;
1019 		func = 0;
1020 	} else {
1021 		return -EINVAL;
1022 	}
1023 
1024 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1025 		return -EINVAL;
1026 	}
1027 
1028 	addr->domain = domain;
1029 	addr->bus = bus;
1030 	addr->dev = dev;
1031 	addr->func = func;
1032 
1033 	return 0;
1034 }
1035 
1036 int
1037 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1038 {
1039 	int rc;
1040 
1041 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1042 		      addr->domain, addr->bus,
1043 		      addr->dev, addr->func);
1044 
1045 	if (rc > 0 && (size_t)rc < sz) {
1046 		return 0;
1047 	}
1048 
1049 	return -1;
1050 }
1051 
1052 void
1053 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1054 {
1055 	assert(dev->map_bar != NULL);
1056 	assert(dev->unmap_bar != NULL);
1057 	assert(dev->cfg_read != NULL);
1058 	assert(dev->cfg_write != NULL);
1059 	dev->internal.driver = drv;
1060 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1061 }
1062 
1063 void
1064 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1065 {
1066 	assert(!dev->internal.attached);
1067 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1068 }
1069 
1070 const char *
1071 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1072 {
1073 	return dev->type;
1074 }
1075