1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 
8 #include <rte_alarm.h>
9 #include <rte_devargs.h>
10 #include "spdk/env.h"
11 #include "spdk/log.h"
12 #include "spdk/string.h"
13 
14 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
15 
16 /* Compatibility for versions < 20.11 */
17 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
18 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
19 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
20 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
21 #endif
22 
23 #define PCI_CFG_SIZE		256
24 #define PCI_EXT_CAP_ID_SN	0x03
25 
26 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
27  * might cause the internal IPC to misbehave. Just retry in such a case.
28  */
29 #define DPDK_HOTPLUG_RETRY_COUNT 4
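/* remove_rte_dev() and pci_attach_rte() below retry their hotplug calls up to
 * this many times when DPDK's IPC returns -ENOMSG.
 */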
30 
31 /* The mutex and lists below are also accessed from the DPDK alarm/interrupt thread. */
32 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
33 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
34 /* devices hotplugged on a dpdk thread */
35 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
36 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
37 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
38 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
39 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
40 
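/* Per-device bookkeeping for rte_devargs policy overrides. allowed_at is the
 * tick count at which the device may be probed; 0 means SPDK has not seen the
 * device yet (see get_allowed_at()/set_allowed_at() and scan_pci_bus()).
 */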
41 struct env_devargs {
42 	struct rte_bus	*bus;
43 	char		name[128];
44 	uint64_t	allowed_at;
45 	TAILQ_ENTRY(env_devargs) link;
46 };
47 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
48 
49 static struct env_devargs *
50 find_env_devargs(struct rte_bus *bus, const char *name)
51 {
52 	struct env_devargs *da;
53 
54 	TAILQ_FOREACH(da, &g_env_devargs, link) {
55 		if (bus == da->bus && !strcmp(name, da->name)) {
56 			return da;
57 		}
58 	}
59 
60 	return NULL;
61 }
62 
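/* For devices probed through DPDK, BAR mappings are created by DPDK itself
 * during probe (drivers are registered with RTE_PCI_DRV_NEED_MAPPING when
 * SPDK_PCI_DRIVER_NEED_MAPPING is set), so mapping a BAR just returns the
 * addresses cached in the rte_pci_device and unmapping is a no-op.
 */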
63 static int
64 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
65 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
66 {
67 	struct rte_pci_device *dev = device->dev_handle;
68 
69 	*mapped_addr = dev->mem_resource[bar].addr;
70 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
71 	*size = (uint64_t)dev->mem_resource[bar].len;
72 
73 	return 0;
74 }
75 
76 static int
77 unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
78 {
79 	return 0;
80 }
81 
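/* On Linux, rte_pci_read_config()/rte_pci_write_config() return the number of
 * bytes transferred, so success means rc == len. FreeBSD's write path returns
 * 0 on success and -1 on failure, hence the #ifdef in cfg_write_rte() below.
 */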
82 static int
83 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
84 {
85 	int rc;
86 
87 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
88 
89 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
90 }
91 
92 static int
93 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
94 {
95 	int rc;
96 
97 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
98 
99 #ifdef __FreeBSD__
100 	/* DPDK returns 0 on success and -1 on failure */
101 	return rc;
102 #endif
103 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
104 }
105 
106 static void
107 remove_rte_dev(struct rte_pci_device *rte_dev)
108 {
109 	char bdf[32];
110 	int i = 0, rc;
111 
112 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
113 	do {
114 		rc = rte_eal_hotplug_remove("pci", bdf);
115 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
116 }
117 
118 static void
119 detach_rte_cb(void *_dev)
120 {
121 	remove_rte_dev(_dev);
122 }
123 
124 /* If it's a physical device, we need to coordinate with DPDK in
125  * a different process, so we can't just unset one flag here. We
126  * also want to stop using any device resources so that the device
127  * isn't "in use" by the userspace driver once we detach it. This
128  * allows attaching the device to a different process, or to a
129  * kernel driver like nvme.
130  */
131 static void
132 detach_rte(struct spdk_pci_device *dev)
133 {
134 	struct rte_pci_device *rte_dev = dev->dev_handle;
135 	int i;
136 	bool removed;
137 
138 	if (!spdk_process_is_primary()) {
139 		remove_rte_dev(rte_dev);
140 		return;
141 	}
142 
143 	pthread_mutex_lock(&g_pci_mutex);
144 	dev->internal.attached = false;
145 	/* prevent the hotremove notification from removing this device */
146 	dev->internal.pending_removal = true;
147 	pthread_mutex_unlock(&g_pci_mutex);
148 
149 	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
150 
151 	/* wait up to 2s for the cb to execute */
152 	for (i = 2000; i > 0; i--) {
153 
154 		spdk_delay_us(1000);
155 		pthread_mutex_lock(&g_pci_mutex);
156 		removed = dev->internal.removed;
157 		pthread_mutex_unlock(&g_pci_mutex);
158 
159 		if (removed) {
160 			break;
161 		}
162 	}
163 
164 	/* besides checking the removed flag, we also need to wait
165 	 * for the dpdk detach function to unwind, as it's doing some
166 	 * operations even after calling our detach callback. Simply
167 	 * cancel the alarm - if it started executing already, this
168 	 * call will block and wait for it to finish.
169 	 */
170 	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
171 
172 	/* the device could have been finally removed, so just check
173 	 * it again.
174 	 */
175 	pthread_mutex_lock(&g_pci_mutex);
176 	removed = dev->internal.removed;
177 	pthread_mutex_unlock(&g_pci_mutex);
178 	if (!removed) {
179 		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
180 			    rte_dev->name);
181 		/* If we reach this state, then the device couldn't be removed and most likely
182 		   a subsequent hot add of a device at the same BDF will fail */
183 	}
184 }
185 
186 void
187 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
188 {
189 	struct spdk_pci_driver *driver;
190 
191 	driver = calloc(1, sizeof(*driver));
192 	if (!driver) {
193 		/* we can't do any better than bailing out at the moment */
194 		return;
195 	}
196 
197 	driver->name = name;
198 	driver->id_table = id_table;
199 	driver->drv_flags = flags;
200 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
201 }
202 
203 struct spdk_pci_driver *
204 spdk_pci_nvme_get_driver(void)
205 {
206 	return spdk_pci_get_driver("nvme");
207 }
208 
209 struct spdk_pci_driver *
210 spdk_pci_get_driver(const char *name)
211 {
212 	struct spdk_pci_driver *driver;
213 
214 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
215 		if (strcmp(driver->name, name) == 0) {
216 			return driver;
217 		}
218 	}
219 
220 	return NULL;
221 }
222 
223 static void
224 pci_device_rte_dev_event(const char *device_name,
225 			 enum rte_dev_event_type event,
226 			 void *cb_arg)
227 {
228 	struct spdk_pci_device *dev;
229 	bool can_detach = false;
230 
231 	switch (event) {
232 	default:
233 	case RTE_DEV_EVENT_ADD:
234 		/* Nothing to do here yet. */
235 		break;
236 	case RTE_DEV_EVENT_REMOVE:
237 		pthread_mutex_lock(&g_pci_mutex);
238 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
239 			struct rte_pci_device *rte_dev = dev->dev_handle;
240 
241 			if (strcmp(rte_dev->name, device_name) == 0 &&
242 			    !dev->internal.pending_removal) {
243 				can_detach = !dev->internal.attached;
244 				/* prevent any further attaches */
245 				dev->internal.pending_removal = true;
246 				break;
247 			}
248 		}
249 		pthread_mutex_unlock(&g_pci_mutex);
250 
251 		if (dev != NULL && can_detach) {
252 			/* If the device is not attached, we can remove it right away.
253 			 * Otherwise it will be removed at detach.
254 			 *
255 			 * Because the user's callback is invoked from within an EAL
256 			 * interrupt callback, that interrupt callback must be allowed
257 			 * to finish before it can be unregistered as part of detaching
258 			 * the device. So we finish the callback quickly here and defer
259 			 * the actual removal to an alarm. This is a workaround; once
260 			 * device detaching is moved into the EAL, the deferred removal
261 			 * can be dropped.
262 			 */
263 			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
264 		}
265 		break;
266 	}
267 }
268 
269 static void
270 cleanup_pci_devices(void)
271 {
272 	struct spdk_pci_device *dev, *tmp;
273 
274 	pthread_mutex_lock(&g_pci_mutex);
275 	/* cleanup removed devices */
276 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
277 		if (!dev->internal.removed) {
278 			continue;
279 		}
280 
281 		vtophys_pci_device_removed(dev->dev_handle);
282 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
283 		free(dev);
284 	}
285 
286 	/* add newly-attached devices */
287 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
288 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
289 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
290 		vtophys_pci_device_added(dev->dev_handle);
291 	}
292 	pthread_mutex_unlock(&g_pci_mutex);
293 }
294 
295 static int scan_pci_bus(bool delay_init);
296 
297 /* Translate an spdk_pci_driver to an rte_pci_driver and register it with DPDK. */
298 static int
299 register_rte_driver(struct spdk_pci_driver *driver)
300 {
301 	unsigned pci_id_count = 0;
302 	struct rte_pci_id *rte_id_table;
303 	char *rte_name;
304 	size_t rte_name_len;
305 	uint32_t rte_flags;
306 
307 	assert(driver->id_table);
308 	while (driver->id_table[pci_id_count].vendor_id) {
309 		pci_id_count++;
310 	}
311 	assert(pci_id_count > 0);
312 
313 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
314 	if (!rte_id_table) {
315 		return -ENOMEM;
316 	}
317 
318 	while (pci_id_count > 0) {
319 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
320 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
321 
322 		rte_id->class_id = spdk_id->class_id;
323 		rte_id->vendor_id = spdk_id->vendor_id;
324 		rte_id->device_id = spdk_id->device_id;
325 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
326 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
327 		pci_id_count--;
328 	}
329 
330 	assert(driver->name);
331 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
332 	rte_name = calloc(rte_name_len, 1);
333 	if (!rte_name) {
334 		free(rte_id_table);
335 		return -ENOMEM;
336 	}
337 
338 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
339 	driver->driver.driver.name = rte_name;
340 	driver->driver.id_table = rte_id_table;
341 
342 	rte_flags = 0;
343 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
344 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
345 	}
346 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
347 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
348 	}
349 	driver->driver.drv_flags = rte_flags;
350 
351 	driver->driver.probe = pci_device_init;
352 	driver->driver.remove = pci_device_fini;
353 
354 	rte_pci_register(&driver->driver);
355 	return 0;
356 }
357 
358 static inline void
359 _pci_env_init(void)
360 {
361 	/* We assume devices were present on the bus for more than 2 seconds
362 	 * before initializing SPDK, so there's no need to wait any longer. We scan
363 	 * the bus, but we don't block any devices.
364 	 */
365 	scan_pci_bus(false);
366 
367 	/* Register a single hotremove callback for all devices. */
368 	if (spdk_process_is_primary()) {
369 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
370 	}
371 }
372 
373 void
374 pci_env_init(void)
375 {
376 	struct spdk_pci_driver *driver;
377 
378 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
379 		register_rte_driver(driver);
380 	}
381 
382 	_pci_env_init();
383 }
384 
385 void
386 pci_env_reinit(void)
387 {
388 	/* There is no need to register pci drivers again, since they were
389 	 * already pre-registered in pci_env_init.
390 	 */
391 
392 	_pci_env_init();
393 }
394 
395 void
396 pci_env_fini(void)
397 {
398 	struct spdk_pci_device *dev;
399 	char bdf[32];
400 
401 	cleanup_pci_devices();
402 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
403 		if (dev->internal.attached) {
404 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
405 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
406 		}
407 	}
408 
409 	if (spdk_process_is_primary()) {
410 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
411 	}
412 }
413 
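/* Probe callback invoked by DPDK for each device matched against one of the
 * drivers registered in register_rte_driver(). Wraps the rte_pci_device in an
 * spdk_pci_device and queues it on g_pci_hotplugged_devices; the device is
 * moved to g_pci_devices on the next cleanup_pci_devices() call.
 */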
414 int
415 pci_device_init(struct rte_pci_driver *_drv,
416 		struct rte_pci_device *_dev)
417 {
418 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
419 	struct spdk_pci_device *dev;
420 	int rc;
421 
422 	dev = calloc(1, sizeof(*dev));
423 	if (dev == NULL) {
424 		return -1;
425 	}
426 
427 	dev->dev_handle = _dev;
428 
429 	dev->addr.domain = _dev->addr.domain;
430 	dev->addr.bus = _dev->addr.bus;
431 	dev->addr.dev = _dev->addr.devid;
432 	dev->addr.func = _dev->addr.function;
433 	dev->id.class_id = _dev->id.class_id;
434 	dev->id.vendor_id = _dev->id.vendor_id;
435 	dev->id.device_id = _dev->id.device_id;
436 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
437 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
438 	dev->socket_id = _dev->device.numa_node;
439 	dev->type = "pci";
440 
441 	dev->map_bar = map_bar_rte;
442 	dev->unmap_bar = unmap_bar_rte;
443 	dev->cfg_read = cfg_read_rte;
444 	dev->cfg_write = cfg_write_rte;
445 
446 	dev->internal.driver = driver;
447 	dev->internal.claim_fd = -1;
448 
449 	if (driver->cb_fn != NULL) {
450 		rc = driver->cb_fn(driver->cb_arg, dev);
451 		if (rc != 0) {
452 			free(dev);
453 			return rc;
454 		}
455 		dev->internal.attached = true;
456 	}
457 
458 	pthread_mutex_lock(&g_pci_mutex);
459 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
460 	pthread_mutex_unlock(&g_pci_mutex);
461 	return 0;
462 }
463 
464 static void
465 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
466 {
467 	struct env_devargs *env_da;
468 
469 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
470 	if (env_da == NULL) {
471 		env_da = calloc(1, sizeof(*env_da));
472 		if (env_da == NULL) {
473 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
474 			return;
475 		}
476 		env_da->bus = rte_da->bus;
477 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
478 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
479 	}
480 
481 	env_da->allowed_at = tsc;
482 }
483 
484 static uint64_t
485 get_allowed_at(struct rte_devargs *rte_da)
486 {
487 	struct env_devargs *env_da;
488 
489 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
490 	if (env_da) {
491 		return env_da->allowed_at;
492 	} else {
493 		return 0;
494 	}
495 }
496 
497 int
498 pci_device_fini(struct rte_pci_device *_dev)
499 {
500 	struct spdk_pci_device *dev;
501 
502 	pthread_mutex_lock(&g_pci_mutex);
503 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
504 		if (dev->dev_handle == _dev) {
505 			break;
506 		}
507 	}
508 
509 	if (dev == NULL || dev->internal.attached) {
510 		/* The device might still be referenced somewhere in SPDK. */
511 		pthread_mutex_unlock(&g_pci_mutex);
512 		return -EBUSY;
513 	}
514 
515 	/* remove our allowed_at option */
516 	if (_dev->device.devargs) {
517 		set_allowed_at(_dev->device.devargs, 0);
518 	}
519 
520 	/* It is possible that the removed flag was already set when there is a
521 	 * race between the remove notification for this process and another
522 	 * process that is also detaching from this same device (for example,
523 	 * when using the nvme driver in multi-process mode).  So do not assert
524 	 * here.  See #2456 for additional details.
525 	 */
526 	dev->internal.removed = true;
527 	pthread_mutex_unlock(&g_pci_mutex);
528 	return 0;
529 
530 }
531 
532 void
533 spdk_pci_device_detach(struct spdk_pci_device *dev)
534 {
535 	struct spdk_pci_device_provider *provider;
536 
537 	assert(dev->internal.attached);
538 
539 	if (dev->internal.claim_fd >= 0) {
540 		spdk_pci_device_unclaim(dev);
541 	}
542 
543 	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
544 		if (strcmp(dev->type, provider->name) == 0) {
545 			break;
546 		}
547 	}
548 
549 	assert(provider != NULL);
550 	dev->internal.attached = false;
551 	provider->detach_cb(dev);
552 
553 	cleanup_pci_devices();
554 }
555 
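/* Scan the PCI bus and set a devargs policy for every device that SPDK has
 * not seen before. With delay_init, such devices are blocked and only allowed
 * after a ~2 second window (tracked via set_allowed_at()); without it they
 * are allowed immediately.
 */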
556 static int
557 scan_pci_bus(bool delay_init)
558 {
559 	struct spdk_pci_driver *driver;
560 	struct rte_pci_device *rte_dev;
561 	uint64_t now;
562 
563 	rte_bus_scan();
564 	now = spdk_get_ticks();
565 
566 	driver = TAILQ_FIRST(&g_pci_drivers);
567 	if (!driver) {
568 		return 0;
569 	}
570 
571 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
572 		struct rte_devargs *da;
573 
574 		da = rte_dev->device.devargs;
575 		if (!da) {
576 			char devargs_str[128];
577 
578 			/* the device was never blocked or allowed */
579 			da = calloc(1, sizeof(*da));
580 			if (!da) {
581 				return -1;
582 			}
583 
584 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
585 			if (rte_devargs_parse(da, devargs_str) != 0) {
586 				free(da);
587 				return -1;
588 			}
589 
590 			rte_devargs_insert(&da);
591 			rte_dev->device.devargs = da;
592 		}
593 
594 		if (get_allowed_at(da)) {
595 			uint64_t allowed_at = get_allowed_at(da);
596 
597 			/* this device was seen by spdk before... */
598 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
599 				da->policy = RTE_DEV_ALLOWED;
600 			}
601 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
602 			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
603 			/* override the policy only if not permanently blocked */
604 
605 			if (delay_init) {
606 				da->policy = RTE_DEV_BLOCKED;
607 				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
608 			} else {
609 				da->policy = RTE_DEV_ALLOWED;
610 				set_allowed_at(da, now);
611 			}
612 		}
613 	}
614 
615 	return 0;
616 }
617 
618 static int
619 pci_attach_rte(const struct spdk_pci_addr *addr)
620 {
621 	char bdf[32];
622 	int rc, i = 0;
623 
624 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
625 
626 	do {
627 		rc = rte_eal_hotplug_add("pci", bdf, "");
628 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
629 
630 	if (i > 1 && rc == -EEXIST) {
631 		/* Even though the previous request timed out, the device
632 		 * was attached successfully.
633 		 */
634 		rc = 0;
635 	}
636 
637 	return rc;
638 }
639 
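/* Default device provider: attaches and detaches "pci" type devices through
 * DPDK's hotplug API. Other providers can be registered with
 * spdk_pci_register_device_provider().
 */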
640 static struct spdk_pci_device_provider g_pci_rte_provider = {
641 	.name = "pci",
642 	.attach_cb = pci_attach_rte,
643 	.detach_cb = detach_rte,
644 };
645 
646 SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
647 
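/* Attach a single device by address. If the device is already enumerated,
 * enum_cb is called directly; otherwise each registered provider is asked to
 * hot-add it, which ends up invoking enum_cb through pci_device_init().
 */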
648 int
649 spdk_pci_device_attach(struct spdk_pci_driver *driver,
650 		       spdk_pci_enum_cb enum_cb,
651 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
652 {
653 	struct spdk_pci_device *dev;
654 	struct spdk_pci_device_provider *provider;
655 	struct rte_pci_device *rte_dev;
656 	struct rte_devargs *da;
657 	int rc;
658 
659 	cleanup_pci_devices();
660 
661 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
662 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
663 			break;
664 		}
665 	}
666 
667 	if (dev != NULL && dev->internal.driver == driver) {
668 		pthread_mutex_lock(&g_pci_mutex);
669 		if (dev->internal.attached || dev->internal.pending_removal) {
670 			pthread_mutex_unlock(&g_pci_mutex);
671 			return -1;
672 		}
673 
674 		rc = enum_cb(enum_ctx, dev);
675 		if (rc == 0) {
676 			dev->internal.attached = true;
677 		}
678 		pthread_mutex_unlock(&g_pci_mutex);
679 		return rc;
680 	}
681 
682 	driver->cb_fn = enum_cb;
683 	driver->cb_arg = enum_ctx;
684 
685 	rc = -ENODEV;
686 	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
687 		rc = provider->attach_cb(pci_address);
688 		if (rc == 0) {
689 			break;
690 		}
691 	}
692 
693 	driver->cb_arg = NULL;
694 	driver->cb_fn = NULL;
695 
696 	cleanup_pci_devices();
697 
698 	if (rc != 0) {
699 		return -1;
700 	}
701 
702 	/* An explicit attach ignores the allowlist, so if we blocked this
703 	 * device before, allow it now - just for clarity.
704 	 */
705 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
706 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
707 			break;
708 		}
709 	}
710 	assert(dev != NULL);
711 
712 	rte_dev = dev->dev_handle;
713 	if (rte_dev != NULL) {
714 		da = rte_dev->device.devargs;
715 		if (da && get_allowed_at(da)) {
716 			set_allowed_at(da, spdk_get_ticks());
717 			da->policy = RTE_DEV_ALLOWED;
718 		}
719 	}
720 
721 	return 0;
722 }
723 
724 /* Note: You can call spdk_pci_enumerate from more than one thread
725  *       simultaneously safely, but you cannot call spdk_pci_enumerate
726  *       and DPDK's bus probe (rte_bus_probe()) simultaneously.
727  */
728 int
729 spdk_pci_enumerate(struct spdk_pci_driver *driver,
730 		   spdk_pci_enum_cb enum_cb,
731 		   void *enum_ctx)
732 {
733 	struct spdk_pci_device *dev;
734 	int rc;
735 
736 	cleanup_pci_devices();
737 
738 	pthread_mutex_lock(&g_pci_mutex);
739 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
740 		if (dev->internal.attached ||
741 		    dev->internal.driver != driver ||
742 		    dev->internal.pending_removal) {
743 			continue;
744 		}
745 
746 		rc = enum_cb(enum_ctx, dev);
747 		if (rc == 0) {
748 			dev->internal.attached = true;
749 		} else if (rc < 0) {
750 			pthread_mutex_unlock(&g_pci_mutex);
751 			return -1;
752 		}
753 	}
754 	pthread_mutex_unlock(&g_pci_mutex);
755 
756 	if (scan_pci_bus(true) != 0) {
757 		return -1;
758 	}
759 
760 	driver->cb_fn = enum_cb;
761 	driver->cb_arg = enum_ctx;
762 
763 	if (rte_bus_probe() != 0) {
764 		driver->cb_arg = NULL;
765 		driver->cb_fn = NULL;
766 		return -1;
767 	}
768 
769 	driver->cb_arg = NULL;
770 	driver->cb_fn = NULL;
771 
772 	cleanup_pci_devices();
773 	return 0;
774 }
775 
776 void
777 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
778 {
779 	struct spdk_pci_device *dev, *tmp;
780 
781 	pthread_mutex_lock(&g_pci_mutex);
782 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
783 		fn(ctx, dev);
784 	}
785 	pthread_mutex_unlock(&g_pci_mutex);
786 }
787 
788 int
789 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
790 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
791 {
792 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
793 }
794 
795 int
796 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
797 {
798 	return dev->unmap_bar(dev, bar, addr);
799 }
800 
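/* DPDK 21.11 changed rte_pci_device's intr_handle from an embedded struct to
 * an opaque pointer, hence the RTE_VERSION conditionals below.
 */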
801 int
802 spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
803 {
804 	struct rte_pci_device *rte_dev = dev->dev_handle;
805 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
806 	return rte_intr_enable(&rte_dev->intr_handle);
807 #else
808 	return rte_intr_enable(rte_dev->intr_handle);
809 #endif
810 }
811 
812 int
813 spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
814 {
815 	struct rte_pci_device *rte_dev = dev->dev_handle;
816 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
817 	return rte_intr_disable(&rte_dev->intr_handle);
818 #else
819 	return rte_intr_disable(rte_dev->intr_handle);
820 #endif
821 }
822 
823 int
824 spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
825 {
826 	struct rte_pci_device *rte_dev = dev->dev_handle;
827 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
828 	return rte_dev->intr_handle.fd;
829 #else
830 	return rte_intr_fd_get(rte_dev->intr_handle);
831 #endif
832 }
833 
834 uint32_t
835 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
836 {
837 	return dev->addr.domain;
838 }
839 
840 uint8_t
841 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
842 {
843 	return dev->addr.bus;
844 }
845 
846 uint8_t
847 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
848 {
849 	return dev->addr.dev;
850 }
851 
852 uint8_t
853 spdk_pci_device_get_func(struct spdk_pci_device *dev)
854 {
855 	return dev->addr.func;
856 }
857 
858 uint16_t
859 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
860 {
861 	return dev->id.vendor_id;
862 }
863 
864 uint16_t
865 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
866 {
867 	return dev->id.device_id;
868 }
869 
870 uint16_t
871 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
872 {
873 	return dev->id.subvendor_id;
874 }
875 
876 uint16_t
877 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
878 {
879 	return dev->id.subdevice_id;
880 }
881 
882 struct spdk_pci_id
883 spdk_pci_device_get_id(struct spdk_pci_device *dev)
884 {
885 	return dev->id;
886 }
887 
888 int
889 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
890 {
891 	return dev->socket_id;
892 }
893 
894 int
895 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
896 {
897 	return dev->cfg_read(dev, value, len, offset);
898 }
899 
900 int
901 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
902 {
903 	return dev->cfg_write(dev, value, len, offset);
904 }
905 
906 int
907 spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
908 {
909 	return spdk_pci_device_cfg_read(dev, value, 1, offset);
910 }
911 
912 int
913 spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
914 {
915 	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
916 }
917 
918 int
919 spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
920 {
921 	return spdk_pci_device_cfg_read(dev, value, 2, offset);
922 }
923 
924 int
925 spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
926 {
927 	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
928 }
929 
930 int
931 spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
932 {
933 	return spdk_pci_device_cfg_read(dev, value, 4, offset);
934 }
935 
936 int
937 spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
938 {
939 	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
940 }
941 
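/* Walk the PCIe extended capability list (starting at config offset 0x100)
 * looking for the Device Serial Number capability (ID 0x03) and format its
 * two dwords as a 16-character hex string; len must therefore be at least 17.
 */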
942 int
943 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
944 {
945 	int err;
946 	uint32_t pos, header = 0;
947 	uint32_t i, buf[2];
948 
949 	if (len < 17) {
950 		return -1;
951 	}
952 
953 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
954 	if (err || !header) {
955 		return -1;
956 	}
957 
958 	pos = PCI_CFG_SIZE;
959 	while (1) {
960 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
961 			if (pos) {
962 				/* skip the header */
963 				pos += 4;
964 				for (i = 0; i < 2; i++) {
965 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
966 					if (err) {
967 						return -1;
968 					}
969 				}
970 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
971 				return 0;
972 			}
973 		}
974 		pos = (header >> 20) & 0xffc;
975 		/* 0 if no other items exist */
976 		if (pos < PCI_CFG_SIZE) {
977 			return -1;
978 		}
979 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
980 		if (err) {
981 			return -1;
982 		}
983 	}
984 	return -1;
985 }
986 
987 struct spdk_pci_addr
988 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
989 {
990 	return dev->addr;
991 }
992 
993 bool
994 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
995 {
996 	return dev->internal.pending_removal;
997 }
998 
999 int
1000 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
1001 {
1002 	if (a1->domain > a2->domain) {
1003 		return 1;
1004 	} else if (a1->domain < a2->domain) {
1005 		return -1;
1006 	} else if (a1->bus > a2->bus) {
1007 		return 1;
1008 	} else if (a1->bus < a2->bus) {
1009 		return -1;
1010 	} else if (a1->dev > a2->dev) {
1011 		return 1;
1012 	} else if (a1->dev < a2->dev) {
1013 		return -1;
1014 	} else if (a1->func > a2->func) {
1015 		return 1;
1016 	} else if (a1->func < a2->func) {
1017 		return -1;
1018 	}
1019 
1020 	return 0;
1021 }
1022 
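/* Device claiming is implemented with an advisory write lock (fcntl F_SETLK)
 * on a per-BDF file under /var/tmp. The PID of the claiming process is stored
 * in the file so that a conflicting claim can report which process owns the
 * device.
 */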
1023 #ifdef __linux__
1024 int
1025 spdk_pci_device_claim(struct spdk_pci_device *dev)
1026 {
1027 	int dev_fd;
1028 	char dev_name[64];
1029 	int pid;
1030 	void *dev_map;
1031 	struct flock pcidev_lock = {
1032 		.l_type = F_WRLCK,
1033 		.l_whence = SEEK_SET,
1034 		.l_start = 0,
1035 		.l_len = 0,
1036 	};
1037 
1038 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1039 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1040 
1041 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1042 	if (dev_fd == -1) {
1043 		SPDK_ERRLOG("could not open %s\n", dev_name);
1044 		return -errno;
1045 	}
1046 
1047 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1048 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1049 		close(dev_fd);
1050 		return -errno;
1051 	}
1052 
1053 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1054 		       MAP_SHARED, dev_fd, 0);
1055 	if (dev_map == MAP_FAILED) {
1056 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1057 		close(dev_fd);
1058 		return -errno;
1059 	}
1060 
1061 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1062 		pid = *(int *)dev_map;
1063 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1064 			    " process %d has claimed it\n", dev_name, pid);
1065 		munmap(dev_map, sizeof(int));
1066 		close(dev_fd);
1067 		/* F_SETLK returns unspecified errnos, normalize them */
1068 		return -EACCES;
1069 	}
1070 
1071 	*(int *)dev_map = (int)getpid();
1072 	munmap(dev_map, sizeof(int));
1073 	dev->internal.claim_fd = dev_fd;
1074 	/* Keep dev_fd open to maintain the lock. */
1075 	return 0;
1076 }
1077 
1078 void
1079 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1080 {
1081 	char dev_name[64];
1082 
1083 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1084 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1085 
1086 	close(dev->internal.claim_fd);
1087 	dev->internal.claim_fd = -1;
1088 	unlink(dev_name);
1089 }
1090 #else /* !__linux__ */
1091 int
1092 spdk_pci_device_claim(struct spdk_pci_device *dev)
1093 {
1094 	/* TODO */
1095 	return 0;
1096 }
1097 
1098 void
1099 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1100 {
1101 	/* TODO */
1102 }
1103 #endif /* __linux__ */
1104 
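/* Parse a BDF string into an spdk_pci_addr. Accepted formats include, for
 * example: "0000:01:00.0", "0000.01.00.0", "0000:01:00", "01:00.0", "01.00.0",
 * "01:00" and "01.00"; an omitted domain or function defaults to 0.
 */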
1105 int
1106 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1107 {
1108 	unsigned domain, bus, dev, func;
1109 
1110 	if (addr == NULL || bdf == NULL) {
1111 		return -EINVAL;
1112 	}
1113 
1114 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1115 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1116 		/* Matched a full address - all variables are initialized */
1117 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1118 		func = 0;
1119 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1120 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1121 		domain = 0;
1122 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1123 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1124 		domain = 0;
1125 		func = 0;
1126 	} else {
1127 		return -EINVAL;
1128 	}
1129 
1130 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1131 		return -EINVAL;
1132 	}
1133 
1134 	addr->domain = domain;
1135 	addr->bus = bus;
1136 	addr->dev = dev;
1137 	addr->func = func;
1138 
1139 	return 0;
1140 }
1141 
1142 int
1143 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1144 {
1145 	int rc;
1146 
1147 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1148 		      addr->domain, addr->bus,
1149 		      addr->dev, addr->func);
1150 
1151 	if (rc > 0 && (size_t)rc < sz) {
1152 		return 0;
1153 	}
1154 
1155 	return -1;
1156 }
1157 
1158 int
1159 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1160 {
1161 	int rc;
1162 
1163 	assert(dev->map_bar != NULL);
1164 	assert(dev->unmap_bar != NULL);
1165 	assert(dev->cfg_read != NULL);
1166 	assert(dev->cfg_write != NULL);
1167 	dev->internal.driver = drv;
1168 
1169 	if (drv->cb_fn != NULL) {
1170 		rc = drv->cb_fn(drv->cb_arg, dev);
1171 		if (rc != 0) {
1172 			return -ECANCELED;
1173 		}
1174 
1175 		dev->internal.attached = true;
1176 	}
1177 
1178 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1179 
1180 	return 0;
1181 }
1182 
1183 void
1184 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1185 {
1186 	assert(!dev->internal.attached);
1187 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1188 }
1189 
1190 void
1191 spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
1192 {
1193 	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
1194 }
1195 
1196 const char *
1197 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1198 {
1199 	return dev->type;
1200 }
1201 
1202 int
1203 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1204 {
1205 	struct rte_devargs *da;
1206 	char devargs_str[128];
1207 
1208 	da = calloc(1, sizeof(*da));
1209 	if (da == NULL) {
1210 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1211 		return -ENOMEM;
1212 	}
1213 
1214 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1215 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1216 	if (rte_devargs_parse(da, devargs_str) != 0) {
1217 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1218 		free(da);
1219 		return -EINVAL;
1220 	}
1221 	da->policy = RTE_DEV_ALLOWED;
1222 	/* Note: if a devargs already exists for this device address, it just gets
1223 	 * overridden.  So we do not need to check if the devargs already exists.
1224 	 * DPDK will take care of memory management for the devargs structure after
1225 	 * it has been inserted, so there's nothing SPDK needs to track.
1226 	 */
1227 	if (rte_devargs_insert(&da) != 0) {
1228 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1229 		free(da);
1230 		return -EINVAL;
1231 	}
1232 
1233 	return 0;
1234 }
1235