xref: /spdk/lib/env_dpdk/pci.c (revision 4a9209bf1db1fc02a00f683aeb3c2754fe8ef99b)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 
8 #include <rte_alarm.h>
9 #include <rte_bus_pci.h>
10 #include <rte_devargs.h>
11 #include "spdk/env.h"
12 #include "spdk/log.h"
13 #include "spdk/string.h"
14 
15 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
16 
17 /* Compatibility for versions < 20.11 */
18 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
19 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
20 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
21 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
22 #endif
23 
24 #define PCI_CFG_SIZE		256
25 #define PCI_EXT_CAP_ID_SN	0x03
26 
27 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
28  * might cause the internal IPC to misbehave. Just retry in such case.
29  */
30 #define DPDK_HOTPLUG_RETRY_COUNT 4
31 
/* Protects all of the device lists below.  Taken both from SPDK threads
 * and from the DPDK alarm/interrupt thread (see detach_rte and
 * pci_device_rte_dev_event).
 */
static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
/* All PCI devices currently known to SPDK. */
static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
/* devices hotplugged on a dpdk thread; moved onto g_pci_devices by
 * cleanup_pci_devices() */
static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
/* SPDK drivers added via spdk_pci_driver_register(). */
static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
/* Device providers; the local-rte "pci" provider is registered below. */
static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
41 
/* SPDK wrapper around a DPDK PCI driver.  The rte_pci_driver member must
 * remain the first field: DPDK probe/remove callbacks receive a
 * struct rte_pci_driver pointer that pci_device_init() casts directly to
 * struct spdk_pci_driver.
 */
struct spdk_pci_driver {
	struct rte_pci_driver		driver;

	const char                      *name;
	const struct spdk_pci_id	*id_table;
	uint32_t			drv_flags;

	/* Enumeration callback/context, set only for the duration of
	 * spdk_pci_enumerate()/spdk_pci_device_attach() calls. */
	spdk_pci_enum_cb		cb_fn;
	void				*cb_arg;
	TAILQ_ENTRY(spdk_pci_driver)	tailq;
};
53 
54 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
55 int pci_device_fini(struct rte_pci_device *device);
56 
/* Per-device record tracking when SPDK last allowed a device.  Used by
 * scan_pci_bus() to delay probing of newly hotplugged devices.
 */
struct env_devargs {
	struct rte_bus	*bus;
	char		name[128];
	uint64_t	allowed_at;	/* tick count at/after which the device may be allowed */
	TAILQ_ENTRY(env_devargs) link;
};
static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
64 
65 static struct env_devargs *
66 find_env_devargs(struct rte_bus *bus, const char *name)
67 {
68 	struct env_devargs *da;
69 
70 	TAILQ_FOREACH(da, &g_env_devargs, link) {
71 		if (bus == da->bus && !strcmp(name, da->name)) {
72 			return da;
73 		}
74 	}
75 
76 	return NULL;
77 }
78 
79 static int
80 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
81 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
82 {
83 	struct rte_pci_device *dev = device->dev_handle;
84 
85 	*mapped_addr = dev->mem_resource[bar].addr;
86 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
87 	*size = (uint64_t)dev->mem_resource[bar].len;
88 
89 	return 0;
90 }
91 
/* Nothing to do here: DPDK unmaps BARs itself when a device is removed. */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}
97 
98 static int
99 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
100 {
101 	int rc;
102 
103 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
104 
105 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
106 }
107 
108 static int
109 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
110 {
111 	int rc;
112 
113 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
114 
115 #ifdef __FreeBSD__
116 	/* DPDK returns 0 on success and -1 on failure */
117 	return rc;
118 #endif
119 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
120 }
121 
122 static void
123 remove_rte_dev(struct rte_pci_device *rte_dev)
124 {
125 	char bdf[32];
126 	int i = 0, rc;
127 
128 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
129 	do {
130 		rc = rte_eal_hotplug_remove("pci", bdf);
131 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
132 }
133 
/* rte_eal_alarm callback: hot-remove the given rte_pci_device. */
static void
detach_rte_cb(void *_dev)
{
	struct rte_pci_device *rte_dev = _dev;

	remove_rte_dev(rte_dev);
}
139 
/* if it's a physical device we need to deal with DPDK on
 * a different process and we can't just unset one flag
 * here. We also want to stop using any device resources
 * so that the device isn't "in use" by the userspace driver
 * once we detach it. This would allow attaching the device
 * to a different process, or to a kernel driver like nvme.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Secondary processes just issue the hot-remove directly; the
	 * deferred-removal dance below is only needed in the primary.
	 */
	if (!spdk_process_is_primary()) {
		remove_rte_dev(rte_dev);
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Defer the actual removal to the DPDK alarm/interrupt thread. */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    rte_dev->name);
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
201 
202 void
203 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
204 {
205 	struct spdk_pci_driver *driver;
206 
207 	driver = calloc(1, sizeof(*driver));
208 	if (!driver) {
209 		/* we can't do any better than bailing atm */
210 		return;
211 	}
212 
213 	driver->name = name;
214 	driver->id_table = id_table;
215 	driver->drv_flags = flags;
216 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
217 }
218 
/* Convenience lookup for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}
224 
225 struct spdk_pci_driver *
226 spdk_pci_get_driver(const char *name)
227 {
228 	struct spdk_pci_driver *driver;
229 
230 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
231 		if (strcmp(driver->name, name) == 0) {
232 			return driver;
233 		}
234 	}
235 
236 	return NULL;
237 }
238 
/* DPDK device event callback, registered in _pci_env_init().  Invoked by
 * DPDK from its interrupt thread.  Only hot-remove events are handled.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		/* Find the device by name; dev ends up NULL if no entry matched. */
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			if (strcmp(rte_dev->name, device_name) == 0 &&
			    !dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (dev != NULL && can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
284 
285 static void
286 cleanup_pci_devices(void)
287 {
288 	struct spdk_pci_device *dev, *tmp;
289 
290 	pthread_mutex_lock(&g_pci_mutex);
291 	/* cleanup removed devices */
292 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
293 		if (!dev->internal.removed) {
294 			continue;
295 		}
296 
297 		vtophys_pci_device_removed(dev->dev_handle);
298 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
299 		free(dev);
300 	}
301 
302 	/* add newly-attached devices */
303 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
304 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
305 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
306 		vtophys_pci_device_added(dev->dev_handle);
307 	}
308 	pthread_mutex_unlock(&g_pci_mutex);
309 }
310 
311 static int scan_pci_bus(bool delay_init);
312 
313 /* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */
314 static int
315 register_rte_driver(struct spdk_pci_driver *driver)
316 {
317 	unsigned pci_id_count = 0;
318 	struct rte_pci_id *rte_id_table;
319 	char *rte_name;
320 	size_t rte_name_len;
321 	uint32_t rte_flags;
322 
323 	assert(driver->id_table);
324 	while (driver->id_table[pci_id_count].vendor_id) {
325 		pci_id_count++;
326 	}
327 	assert(pci_id_count > 0);
328 
329 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
330 	if (!rte_id_table) {
331 		return -ENOMEM;
332 	}
333 
334 	while (pci_id_count > 0) {
335 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
336 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
337 
338 		rte_id->class_id = spdk_id->class_id;
339 		rte_id->vendor_id = spdk_id->vendor_id;
340 		rte_id->device_id = spdk_id->device_id;
341 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
342 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
343 		pci_id_count--;
344 	}
345 
346 	assert(driver->name);
347 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
348 	rte_name = calloc(rte_name_len, 1);
349 	if (!rte_name) {
350 		free(rte_id_table);
351 		return -ENOMEM;
352 	}
353 
354 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
355 	driver->driver.driver.name = rte_name;
356 	driver->driver.id_table = rte_id_table;
357 
358 	rte_flags = 0;
359 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
360 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
361 	}
362 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
363 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
364 	}
365 	driver->driver.drv_flags = rte_flags;
366 
367 	driver->driver.probe = pci_device_init;
368 	driver->driver.remove = pci_device_fini;
369 
370 	rte_pci_register(&driver->driver);
371 	return 0;
372 }
373 
374 static inline void
375 _pci_env_init(void)
376 {
377 	/* We assume devices were present on the bus for more than 2 seconds
378 	 * before initializing SPDK and there's no need to wait more. We scan
379 	 * the bus, but we don't block any devices.
380 	 */
381 	scan_pci_bus(false);
382 
383 	/* Register a single hotremove callback for all devices. */
384 	if (spdk_process_is_primary()) {
385 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
386 	}
387 }
388 
389 void
390 pci_env_init(void)
391 {
392 	struct spdk_pci_driver *driver;
393 
394 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
395 		register_rte_driver(driver);
396 	}
397 
398 	_pci_env_init();
399 }
400 
/* Re-initialize the PCI env: re-scan the bus and re-register the
 * hotremove callback (see _pci_env_init()).
 */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
410 
411 void
412 pci_env_fini(void)
413 {
414 	struct spdk_pci_device *dev;
415 	char bdf[32];
416 
417 	cleanup_pci_devices();
418 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
419 		if (dev->internal.attached) {
420 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
421 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
422 		}
423 	}
424 
425 	if (spdk_process_is_primary()) {
426 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
427 	}
428 }
429 
/* DPDK probe callback for SPDK-registered drivers.  Wraps the
 * rte_pci_device in a new spdk_pci_device and, when an enumerate/attach
 * is in progress (driver->cb_fn set), runs the user callback immediately.
 * The new device is queued on g_pci_hotplugged_devices and published by
 * the next cleanup_pci_devices() call.
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	/* Safe cast: rte_pci_driver is the first member of spdk_pci_driver. */
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Mirror the DPDK address and IDs into SPDK's representation. */
	dev->addr.domain = _dev->addr.domain;
	dev->addr.bus = _dev->addr.bus;
	dev->addr.dev = _dev->addr.devid;
	dev->addr.func = _dev->addr.function;
	dev->id.class_id = _dev->id.class_id;
	dev->id.vendor_id = _dev->id.vendor_id;
	dev->id.device_id = _dev->id.device_id;
	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
	dev->id.subdevice_id = _dev->id.subsystem_device_id;
	dev->socket_id = _dev->device.numa_node;
	dev->type = "pci";

	/* Local rte devices use the rte_* access helpers defined above. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
479 
480 static void
481 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
482 {
483 	struct env_devargs *env_da;
484 
485 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
486 	if (env_da == NULL) {
487 		env_da = calloc(1, sizeof(*env_da));
488 		if (env_da == NULL) {
489 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
490 			return;
491 		}
492 		env_da->bus = rte_da->bus;
493 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
494 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
495 	}
496 
497 	env_da->allowed_at = tsc;
498 }
499 
500 static uint64_t
501 get_allowed_at(struct rte_devargs *rte_da)
502 {
503 	struct env_devargs *env_da;
504 
505 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
506 	if (env_da) {
507 		return env_da->allowed_at;
508 	} else {
509 		return 0;
510 	}
511 }
512 
/* DPDK "remove" callback for SPDK-probed devices.  Marks the matching
 * spdk_pci_device as removed so cleanup_pci_devices() can free it later.
 * Returns -EBUSY while the device is unknown or still attached in SPDK.
 */
int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	/* dev is NULL after the loop if no entry matched _dev. */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	if (dev == NULL || dev->internal.attached) {
		/* The device might be still referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (_dev->device.devargs) {
		set_allowed_at(_dev->device.devargs, 0);
	}

	/* It is possible that removed flag was already set when there is a race
	 * between the remove notification for this process, and another process
	 * that is also detaching from this same device (for example, when using
	 * nvme driver in multi-process mode.  So do not assert here.  See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;

}
547 
548 void
549 spdk_pci_device_detach(struct spdk_pci_device *dev)
550 {
551 	struct spdk_pci_device_provider *provider;
552 
553 	assert(dev->internal.attached);
554 
555 	if (dev->internal.claim_fd >= 0) {
556 		spdk_pci_device_unclaim(dev);
557 	}
558 
559 	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
560 		if (strcmp(dev->type, provider->name) == 0) {
561 			break;
562 		}
563 	}
564 
565 	assert(provider != NULL);
566 	dev->internal.attached = false;
567 	provider->detach_cb(dev);
568 
569 	cleanup_pci_devices();
570 }
571 
/* Scan the PCI bus and adjust the allow/block policy on each device's
 * devargs.  With delay_init set, devices SPDK has never seen before are
 * blocked and only allowed ~2 seconds later (tracked via allowed_at),
 * so freshly hotplugged devices are not grabbed immediately.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	rte_bus_scan();
	now = spdk_get_ticks();

	/* No registered SPDK drivers - nothing would probe these devices. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = rte_dev->devargs;
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->name);
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			/* DPDK owns the devargs memory after insertion. */
			rte_devargs_insert(&da);
			rte_dev->devargs = da;
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((rte_dev->bus->conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				/* block now, allow again 2 seconds from now */
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}
632 
633 static int
634 pci_attach_rte(const struct spdk_pci_addr *addr)
635 {
636 	char bdf[32];
637 	int rc, i = 0;
638 
639 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
640 
641 	do {
642 		rc = rte_eal_hotplug_add("pci", bdf, "");
643 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
644 
645 	if (i > 1 && rc == -EEXIST) {
646 		/* Even though the previous request timed out, the device
647 		 * was attached successfully.
648 		 */
649 		rc = 0;
650 	}
651 
652 	return rc;
653 }
654 
/* Default device provider backed by DPDK's local PCI bus.  Its name must
 * match the "pci" type assigned in pci_device_init().
 */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
662 
/* Attach the device at pci_address using enum_cb.  If the device is
 * already known locally and owned by this driver, the callback runs
 * directly; otherwise each registered provider is asked to attach it.
 * Returns 0 on success, negative on failure (including when the device
 * is already attached or has a removal pending).
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	/* dev is NULL after the loop if the address is not known yet. */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* The provider's attach path triggers a probe, which lands in
	 * pci_device_init() where cb_fn/cb_arg are consumed.
	 */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = rte_dev->device.devargs;
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
738 
/* Note: You can call spdk_pci_enumerate from more than one thread
 *       simultaneously safely, but you cannot call spdk_pci_enumerate
 *       and rte_eal_pci_probe simultaneously.
 */
/* Enumerate all unattached devices owned by this driver, invoking
 * enum_cb on each.  A callback return of 0 attaches the device, > 0
 * skips it, < 0 aborts enumeration.  Afterwards the bus is re-scanned
 * and probed so newly hotplugged devices are picked up as well.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	/* First pass: devices that are already known locally. */
	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	/* Second pass: re-scan with delay_init so brand-new devices are
	 * held back ~2s (see scan_pci_bus), then probe the bus.  Probing
	 * calls pci_device_init(), which consumes cb_fn/cb_arg.
	 */
	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (rte_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
790 
791 void
792 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
793 {
794 	struct spdk_pci_device *dev, *tmp;
795 
796 	pthread_mutex_lock(&g_pci_mutex);
797 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
798 		fn(ctx, dev);
799 	}
800 	pthread_mutex_unlock(&g_pci_mutex);
801 }
802 
/* Map a device BAR via the device's provider-specific callback. */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
}

/* Release a BAR mapping obtained from spdk_pci_device_map_bar(). */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
	return dev->unmap_bar(dev, bar, addr);
}
815 
/* Enable the device interrupt via DPDK.  Before DPDK 21.11 intr_handle
 * was embedded in rte_pci_device; since 21.11 it is accessed through a
 * pointer and accessor functions.
 */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
	return rte_intr_enable(&rte_dev->intr_handle);
#else
	return rte_intr_enable(rte_dev->intr_handle);
#endif
}

/* Disable the device interrupt via DPDK. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
	return rte_intr_disable(&rte_dev->intr_handle);
#else
	return rte_intr_disable(rte_dev->intr_handle);
#endif
}

/* Return the file descriptor backing the device interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
#if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
	return rte_dev->intr_handle.fd;
#else
	return rte_intr_fd_get(rte_dev->intr_handle);
#endif
}
848 
/* Simple accessors for the cached PCI address and ID fields, all copied
 * from the underlying device at init time (see pci_device_init).
 */

/* PCI domain (segment) number. */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}

/* PCI bus number. */
uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}

/* PCI device (slot) number. */
uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}

/* PCI function number. */
uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}

/* Vendor ID from PCI config. */
uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}

/* Device ID from PCI config. */
uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}

/* Subsystem vendor ID from PCI config. */
uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}

/* Subsystem device ID from PCI config. */
uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}

/* Full set of PCI IDs, returned by value. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}

/* NUMA node of the device (from DPDK's numa_node). */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	return dev->socket_id;
}
908 
/* Config-space access, dispatched to the device's provider callbacks.
 * All return 0 on success, negative on failure.
 */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}

int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}

/* Fixed-width convenience wrappers around the two calls above. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}

int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}

int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}

int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}

int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}

int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
956 
957 int
958 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
959 {
960 	int err;
961 	uint32_t pos, header = 0;
962 	uint32_t i, buf[2];
963 
964 	if (len < 17) {
965 		return -1;
966 	}
967 
968 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
969 	if (err || !header) {
970 		return -1;
971 	}
972 
973 	pos = PCI_CFG_SIZE;
974 	while (1) {
975 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
976 			if (pos) {
977 				/* skip the header */
978 				pos += 4;
979 				for (i = 0; i < 2; i++) {
980 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
981 					if (err) {
982 						return -1;
983 					}
984 				}
985 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
986 				return 0;
987 			}
988 		}
989 		pos = (header >> 20) & 0xffc;
990 		/* 0 if no other items exist */
991 		if (pos < PCI_CFG_SIZE) {
992 			return -1;
993 		}
994 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
995 		if (err) {
996 			return -1;
997 		}
998 	}
999 	return -1;
1000 }
1001 
/* Return a by-value copy of the device's PCI address. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}
1007 
/* Despite the name, this reports whether a removal is *pending* for the
 * device (a hotremove event was seen or a detach was started), not
 * whether the removal has fully completed.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
1013 
1014 int
1015 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
1016 {
1017 	if (a1->domain > a2->domain) {
1018 		return 1;
1019 	} else if (a1->domain < a2->domain) {
1020 		return -1;
1021 	} else if (a1->bus > a2->bus) {
1022 		return 1;
1023 	} else if (a1->bus < a2->bus) {
1024 		return -1;
1025 	} else if (a1->dev > a2->dev) {
1026 		return 1;
1027 	} else if (a1->dev < a2->dev) {
1028 		return -1;
1029 	} else if (a1->func > a2->func) {
1030 		return 1;
1031 	} else if (a1->func < a2->func) {
1032 		return -1;
1033 	}
1034 
1035 	return 0;
1036 }
1037 
1038 #ifdef __linux__
/* Claim cross-process exclusive ownership of a device by taking a write
 * lock (fcntl F_SETLK) on a per-BDF file under /var/tmp.  The owning PID
 * is stored in the file so a conflicting claimer can be reported.
 * Returns 0 on success, -EACCES if another process holds the lock, or
 * -errno for open/truncate/mmap failures.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	int dev_fd;
	char dev_name[64];
	int pid;
	void *dev_map;
	struct flock pcidev_lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (dev_fd == -1) {
		SPDK_ERRLOG("could not open %s\n", dev_name);
		return -errno;
	}

	/* Make sure the file is big enough to hold the owner's PID. */
	if (ftruncate(dev_fd, sizeof(int)) != 0) {
		SPDK_ERRLOG("could not truncate %s\n", dev_name);
		close(dev_fd);
		return -errno;
	}

	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
		       MAP_SHARED, dev_fd, 0);
	if (dev_map == MAP_FAILED) {
		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
		close(dev_fd);
		return -errno;
	}

	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
		/* Lock already held - report the PID recorded in the file. */
		pid = *(int *)dev_map;
		SPDK_ERRLOG("Cannot create lock on device %s, probably"
			    " process %d has claimed it\n", dev_name, pid);
		munmap(dev_map, sizeof(int));
		close(dev_fd);
		/* F_SETLK returns unspecified errnos, normalize them */
		return -EACCES;
	}

	*(int *)dev_map = (int)getpid();
	munmap(dev_map, sizeof(int));
	dev->internal.claim_fd = dev_fd;
	/* Keep dev_fd open to maintain the lock. */
	return 0;
}
1092 
1093 void
1094 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1095 {
1096 	char dev_name[64];
1097 
1098 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1099 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1100 
1101 	close(dev->internal.claim_fd);
1102 	dev->internal.claim_fd = -1;
1103 	unlink(dev_name);
1104 }
#else /* !__linux__ */
/* Claiming is not implemented on non-Linux platforms: claims always
 * "succeed" without providing any cross-process exclusion.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}

void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
#endif /* __linux__ */
1119 
/* Parse a PCI address string in any of the accepted forms:
 *   DDDD:BB:DD.F   DDDD.BB.DD.F   DDDD:BB:DD (func defaults to 0)
 *   BB:DD.F        BB.DD.F        (domain defaults to 0)
 *   BB:DD          BB.DD          (domain and func default to 0)
 * All fields are hexadecimal.  Returns 0 on success, -EINVAL on malformed
 * input or out-of-range bus/dev/func.  The sscanf patterns are tried in
 * this exact order - do not reorder them.
 */
int
spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
{
	unsigned domain, bus, dev, func;

	if (addr == NULL || bdf == NULL) {
		return -EINVAL;
	}

	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
		/* Matched a full address - all variables are initialized */
	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
		func = 0;
	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
		domain = 0;
	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
		domain = 0;
		func = 0;
	} else {
		return -EINVAL;
	}

	if (bus > 0xFF || dev > 0x1F || func > 7) {
		return -EINVAL;
	}

	addr->domain = domain;
	addr->bus = bus;
	addr->dev = dev;
	addr->func = func;

	return 0;
}
1156 
1157 int
1158 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1159 {
1160 	int rc;
1161 
1162 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1163 		      addr->domain, addr->bus,
1164 		      addr->dev, addr->func);
1165 
1166 	if (rc > 0 && (size_t)rc < sz) {
1167 		return 0;
1168 	}
1169 
1170 	return -1;
1171 }
1172 
1173 int
1174 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1175 {
1176 	int rc;
1177 
1178 	assert(dev->map_bar != NULL);
1179 	assert(dev->unmap_bar != NULL);
1180 	assert(dev->cfg_read != NULL);
1181 	assert(dev->cfg_write != NULL);
1182 	dev->internal.driver = drv;
1183 
1184 	if (drv->cb_fn != NULL) {
1185 		rc = drv->cb_fn(drv->cb_arg, dev);
1186 		if (rc != 0) {
1187 			return -ECANCELED;
1188 		}
1189 
1190 		dev->internal.attached = true;
1191 	}
1192 
1193 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1194 
1195 	return 0;
1196 }
1197 
/* Remove a manually hooked device from the global list.  The device must
 * already be detached; its memory is not freed here.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1204 
/* Register a device provider; spdk_pci_device_attach() tries each
 * provider's attach_cb in registration order.
 */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1210 
/* Return the provider type name of the device ("pci" for local rte
 * devices, see pci_device_init).
 */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1216 
/* Explicitly allow a device in DPDK's devargs (overriding any earlier
 * block policy).  Returns 0 on success, -ENOMEM or -EINVAL on failure.
 */
int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden.  So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}
1250 
1251 uint64_t
1252 dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr)
1253 {
1254 	struct rte_mem_resource *res;
1255 	uint64_t paddr;
1256 	unsigned r;
1257 
1258 	for (r = 0; r < PCI_MAX_RESOURCE; r++) {
1259 		res = &dev->mem_resource[r];
1260 		if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
1261 		    vaddr < (uint64_t)res->addr + res->len) {
1262 			paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1263 			return paddr;
1264 		}
1265 	}
1266 
1267 	return SPDK_VTOPHYS_ERROR;
1268 }
1269