xref: /spdk/lib/env_dpdk/pci.c (revision 927f1fd57bd004df581518466ec4c1b8083e5d23)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "env_internal.h"
35 
36 #include <rte_alarm.h>
37 #include <rte_devargs.h>
38 #include "spdk/env.h"
39 #include "spdk/log.h"
40 #include "spdk/string.h"
41 
42 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
43 
44 /* Compatibility for versions < 20.11 */
45 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
46 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
47 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
48 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
49 #endif
50 
51 #define PCI_CFG_SIZE		256
52 #define PCI_EXT_CAP_ID_SN	0x03
53 
54 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
55  * might cause the internal IPC to misbehave. Just retry in such a case.
56  */
57 #define DPDK_HOTPLUG_RETRY_COUNT 4
58 
59 /* DPDK alarm/interrupt thread */
60 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
61 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
62 /* devices hotplugged on a dpdk thread */
63 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
64 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
65 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
66 
67 struct env_devargs {
68 	struct rte_bus	*bus;
69 	char		name[128];
70 	uint64_t	allowed_at;
71 	TAILQ_ENTRY(env_devargs) link;
72 };
73 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
74 
75 static struct env_devargs *
76 find_env_devargs(struct rte_bus *bus, const char *name)
77 {
78 	struct env_devargs *da;
79 
80 	TAILQ_FOREACH(da, &g_env_devargs, link) {
81 		if (bus == da->bus && !strcmp(name, da->name)) {
82 			return da;
83 		}
84 	}
85 
86 	return NULL;
87 }
88 
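/* BAR helpers for rte_pci_device-backed devices. DPDK maps the BARs itself
 * when probing a device for a driver with RTE_PCI_DRV_NEED_MAPPING set
 * (see register_rte_driver() below), so map_bar_rte() only hands out the
 * addresses cached in rte_dev->mem_resource[] and unmap_bar_rte() has
 * nothing to do.
 */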
89 static int
90 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
91 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
92 {
93 	struct rte_pci_device *dev = device->dev_handle;
94 
95 	*mapped_addr = dev->mem_resource[bar].addr;
96 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
97 	*size = (uint64_t)dev->mem_resource[bar].len;
98 
99 	return 0;
100 }
101 
102 static int
103 unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
104 {
105 	return 0;
106 }
107 
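/* PCI config space accessors. On Linux, rte_pci_read_config() and
 * rte_pci_write_config() return the number of bytes transferred, so the
 * call succeeded only if rc == len. FreeBSD's write path instead returns
 * 0 on success and -1 on failure, which is handled separately below.
 */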
108 static int
109 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
110 {
111 	int rc;
112 
113 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
114 
115 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
116 }
117 
118 static int
119 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
120 {
121 	int rc;
122 
123 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
124 
125 #ifdef __FreeBSD__
126 	/* DPDK returns 0 on success and -1 on failure */
127 	return rc;
128 #endif
129 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
130 }
131 
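/* Ask DPDK to hot-remove a device. The EAL multi-process IPC may return
 * -ENOMSG when several processes are starting up at the same time, so
 * retry up to DPDK_HOTPLUG_RETRY_COUNT times (see the note at the top of
 * this file).
 */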
132 static void
133 remove_rte_dev(struct rte_pci_device *rte_dev)
134 {
135 	char bdf[32];
136 	int i = 0, rc;
137 
138 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
139 	do {
140 		rc = rte_eal_hotplug_remove("pci", bdf);
141 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
142 }
143 
144 static void
145 detach_rte_cb(void *_dev)
146 {
147 	remove_rte_dev(_dev);
148 }
149 
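/* Detach a physical device from DPDK. In a secondary process the hotplug
 * remove is issued directly. In the primary process the removal is
 * scheduled on the DPDK alarm (interrupt) thread and we poll for up to
 * 2 seconds for the removed flag to be set by pci_device_fini().
 */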
150 static void
151 detach_rte(struct spdk_pci_device *dev)
152 {
153 	struct rte_pci_device *rte_dev = dev->dev_handle;
154 	int i;
155 	bool removed;
156 
157 	if (!spdk_process_is_primary()) {
158 		remove_rte_dev(rte_dev);
159 		return;
160 	}
161 
162 	pthread_mutex_lock(&g_pci_mutex);
163 	dev->internal.attached = false;
164 	/* prevent the hotremove notification from removing this device */
165 	dev->internal.pending_removal = true;
166 	pthread_mutex_unlock(&g_pci_mutex);
167 
168 	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
169 
170 	/* wait up to 2s for the cb to execute */
171 	for (i = 2000; i > 0; i--) {
172 
173 		spdk_delay_us(1000);
174 		pthread_mutex_lock(&g_pci_mutex);
175 		removed = dev->internal.removed;
176 		pthread_mutex_unlock(&g_pci_mutex);
177 
178 		if (removed) {
179 			break;
180 		}
181 	}
182 
183 	/* Besides checking the removed flag, we also need to wait
184 	 * for the DPDK detach function to unwind, as it performs some
185 	 * operations even after calling our detach callback. Simply
186 	 * cancel the alarm - if it has already started executing, this
187 	 * call will block and wait for it to finish.
188 	 */
189 	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
190 
191 	/* the device could have been removed by now, so just check
192 	 * it again.
193 	 */
194 	pthread_mutex_lock(&g_pci_mutex);
195 	removed = dev->internal.removed;
196 	pthread_mutex_unlock(&g_pci_mutex);
197 	if (!removed) {
198 		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
199 			    rte_dev->name);
200 		/* If we reach this state, then the device couldn't be removed and most likely
201 		   a subsequent hot add of a device at the same BDF will fail. */
202 	}
203 }
204 
205 void
206 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
207 {
208 	struct spdk_pci_driver *driver;
209 
210 	driver = calloc(1, sizeof(*driver));
211 	if (!driver) {
212 		/* we can't do any better than bailing out for now */
213 		return;
214 	}
215 
216 	driver->name = name;
217 	driver->id_table = id_table;
218 	driver->drv_flags = flags;
219 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
220 }
221 
222 struct spdk_pci_driver *
223 spdk_pci_nvme_get_driver(void)
224 {
225 	return spdk_pci_get_driver("nvme");
226 }
227 
228 struct spdk_pci_driver *
229 spdk_pci_get_driver(const char *name)
230 {
231 	struct spdk_pci_driver *driver;
232 
233 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
234 		if (strcmp(driver->name, name) == 0) {
235 			return driver;
236 		}
237 	}
238 
239 	return NULL;
240 }
241 
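/* Device event callback registered with rte_dev_event_callback_register().
 * It runs on the DPDK interrupt thread whenever the EAL reports a device
 * add or remove event.
 */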
242 static void
243 pci_device_rte_dev_event(const char *device_name,
244 			 enum rte_dev_event_type event,
245 			 void *cb_arg)
246 {
247 	struct spdk_pci_device *dev;
248 	bool can_detach = false;
249 
250 	switch (event) {
251 	default:
252 	case RTE_DEV_EVENT_ADD:
253 		/* Nothing to do here yet. */
254 		break;
255 	case RTE_DEV_EVENT_REMOVE:
256 		pthread_mutex_lock(&g_pci_mutex);
257 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
258 			struct rte_pci_device *rte_dev = dev->dev_handle;
259 
260 			if (strcmp(rte_dev->name, device_name) == 0 &&
261 			    !dev->internal.pending_removal) {
262 				can_detach = !dev->internal.attached;
263 				/* prevent any further attaches */
264 				dev->internal.pending_removal = true;
265 				break;
266 			}
267 		}
268 		pthread_mutex_unlock(&g_pci_mutex);
269 
270 		if (dev != NULL && can_detach) {
271 			/* if the device is not attached we can remove it right away.
272 			 * Otherwise it will be removed at detach.
273 			 *
274 			 * Because the user's callback is invoked from the EAL interrupt
275 			 * callback, that interrupt callback must finish before it can be
276 			 * unregistered when detaching the device. So we finish the
277 			 * callback quickly here and use a deferred removal to detach the
278 			 * device instead. This is a workaround; once device detaching is
279 			 * moved into the EAL in the future, the deferred removal can be
280 			 * deleted.
281 			 */
282 			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
283 		}
284 		break;
285 	}
286 }
287 
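/* Reconcile the global device list: free devices that DPDK has already
 * removed and move devices hotplugged on a DPDK thread from the staging
 * list onto g_pci_devices, updating the vtophys mappings accordingly.
 */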
288 static void
289 cleanup_pci_devices(void)
290 {
291 	struct spdk_pci_device *dev, *tmp;
292 
293 	pthread_mutex_lock(&g_pci_mutex);
294 	/* cleanup removed devices */
295 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
296 		if (!dev->internal.removed) {
297 			continue;
298 		}
299 
300 		vtophys_pci_device_removed(dev->dev_handle);
301 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
302 		free(dev);
303 	}
304 
305 	/* add newly-attached devices */
306 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
307 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
308 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
309 		vtophys_pci_device_added(dev->dev_handle);
310 	}
311 	pthread_mutex_unlock(&g_pci_mutex);
312 }
313 
314 static int scan_pci_bus(bool delay_init);
315 
316 /* translate spdk_pci_driver to an rte_pci_driver and register it with DPDK */
317 static int
318 register_rte_driver(struct spdk_pci_driver *driver)
319 {
320 	unsigned pci_id_count = 0;
321 	struct rte_pci_id *rte_id_table;
322 	char *rte_name;
323 	size_t rte_name_len;
324 	uint32_t rte_flags;
325 
326 	assert(driver->id_table);
327 	while (driver->id_table[pci_id_count].vendor_id) {
328 		pci_id_count++;
329 	}
330 	assert(pci_id_count > 0);
331 
332 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
333 	if (!rte_id_table) {
334 		return -ENOMEM;
335 	}
336 
337 	while (pci_id_count > 0) {
338 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
339 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
340 
341 		rte_id->class_id = spdk_id->class_id;
342 		rte_id->vendor_id = spdk_id->vendor_id;
343 		rte_id->device_id = spdk_id->device_id;
344 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
345 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
346 		pci_id_count--;
347 	}
348 
349 	assert(driver->name);
350 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
351 	rte_name = calloc(rte_name_len, 1);
352 	if (!rte_name) {
353 		free(rte_id_table);
354 		return -ENOMEM;
355 	}
356 
357 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
358 	driver->driver.driver.name = rte_name;
359 	driver->driver.id_table = rte_id_table;
360 
361 	rte_flags = 0;
362 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
363 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
364 	}
365 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
366 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
367 	}
368 	driver->driver.drv_flags = rte_flags;
369 
370 	driver->driver.probe = pci_device_init;
371 	driver->driver.remove = pci_device_fini;
372 
373 	rte_pci_register(&driver->driver);
374 	return 0;
375 }
376 
377 static inline void
378 _pci_env_init(void)
379 {
380 	/* We assume devices were present on the bus for more than 2 seconds
381 	 * before initializing SPDK and there's no need to wait more. We scan
382 	 * the bus, but we don't block any devices.
383 	 */
384 	scan_pci_bus(false);
385 
386 	/* Register a single hotremove callback for all devices. */
387 	if (spdk_process_is_primary()) {
388 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
389 	}
390 }
391 
392 void
393 pci_env_init(void)
394 {
395 	struct spdk_pci_driver *driver;
396 
397 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
398 		register_rte_driver(driver);
399 	}
400 
401 	_pci_env_init();
402 }
403 
404 void
405 pci_env_reinit(void)
406 {
407 	/* There is no need to register pci drivers again, since they were
408 	 * already pre-registered in pci_env_init.
409 	 */
410 
411 	_pci_env_init();
412 }
413 
414 void
415 pci_env_fini(void)
416 {
417 	struct spdk_pci_device *dev;
418 	char bdf[32];
419 
420 	cleanup_pci_devices();
421 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
422 		if (dev->internal.attached) {
423 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
424 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
425 		}
426 	}
427 
428 	if (spdk_process_is_primary()) {
429 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
430 	}
431 }
432 
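/* Probe callback installed in register_rte_driver(). DPDK invokes it for
 * every matching PCI device; we wrap the rte_pci_device in an
 * spdk_pci_device, optionally hand it to the driver's attach callback,
 * and queue it on the hotplugged list until cleanup_pci_devices() runs.
 */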
433 int
434 pci_device_init(struct rte_pci_driver *_drv,
435 		struct rte_pci_device *_dev)
436 {
437 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
438 	struct spdk_pci_device *dev;
439 	int rc;
440 
441 	dev = calloc(1, sizeof(*dev));
442 	if (dev == NULL) {
443 		return -1;
444 	}
445 
446 	dev->dev_handle = _dev;
447 
448 	dev->addr.domain = _dev->addr.domain;
449 	dev->addr.bus = _dev->addr.bus;
450 	dev->addr.dev = _dev->addr.devid;
451 	dev->addr.func = _dev->addr.function;
452 	dev->id.class_id = _dev->id.class_id;
453 	dev->id.vendor_id = _dev->id.vendor_id;
454 	dev->id.device_id = _dev->id.device_id;
455 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
456 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
457 	dev->socket_id = _dev->device.numa_node;
458 	dev->type = "pci";
459 
460 	dev->map_bar = map_bar_rte;
461 	dev->unmap_bar = unmap_bar_rte;
462 	dev->cfg_read = cfg_read_rte;
463 	dev->cfg_write = cfg_write_rte;
464 
465 	dev->internal.driver = driver;
466 	dev->internal.claim_fd = -1;
467 
468 	if (driver->cb_fn != NULL) {
469 		rc = driver->cb_fn(driver->cb_arg, dev);
470 		if (rc != 0) {
471 			free(dev);
472 			return rc;
473 		}
474 		dev->internal.attached = true;
475 	}
476 
477 	pthread_mutex_lock(&g_pci_mutex);
478 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
479 	pthread_mutex_unlock(&g_pci_mutex);
480 	return 0;
481 }
482 
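/* Bookkeeping for the per-device "allowed_at" timestamp. scan_pci_bus()
 * uses it to temporarily block freshly hotplugged devices (blocked for
 * roughly 2 seconds when delay_init is true) and to remember which
 * devices SPDK has already seen and allowed.
 */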
483 static void
484 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
485 {
486 	struct env_devargs *env_da;
487 
488 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
489 	if (env_da == NULL) {
490 		env_da = calloc(1, sizeof(*env_da));
491 		if (env_da == NULL) {
492 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
493 			return;
494 		}
495 		env_da->bus = rte_da->bus;
496 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
497 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
498 	}
499 
500 	env_da->allowed_at = tsc;
501 }
502 
503 static uint64_t
504 get_allowed_at(struct rte_devargs *rte_da)
505 {
506 	struct env_devargs *env_da;
507 
508 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
509 	if (env_da) {
510 		return env_da->allowed_at;
511 	} else {
512 		return 0;
513 	}
514 }
515 
516 int
517 pci_device_fini(struct rte_pci_device *_dev)
518 {
519 	struct spdk_pci_device *dev;
520 
521 	pthread_mutex_lock(&g_pci_mutex);
522 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
523 		if (dev->dev_handle == _dev) {
524 			break;
525 		}
526 	}
527 
528 	if (dev == NULL || dev->internal.attached) {
529 		/* The device might be still referenced somewhere in SPDK. */
530 		pthread_mutex_unlock(&g_pci_mutex);
531 		return -EBUSY;
532 	}
533 
534 	/* remove our allowed_at option */
535 	if (_dev->device.devargs) {
536 		set_allowed_at(_dev->device.devargs, 0);
537 	}
538 
539 	/* It is possible that the removed flag was already set when there is a race
540 	 * between the remove notification for this process, and another process
541 	 * that is also detaching from this same device (for example, when using
542 	 * the nvme driver in multi-process mode).  So do not assert here.  See
543 	 * #2456 for additional details.
544 	 */
545 	dev->internal.removed = true;
546 	pthread_mutex_unlock(&g_pci_mutex);
547 	return 0;
548 
549 }
550 
551 void
552 spdk_pci_device_detach(struct spdk_pci_device *dev)
553 {
554 	assert(dev->internal.attached);
555 
556 	if (dev->internal.claim_fd >= 0) {
557 		spdk_pci_device_unclaim(dev);
558 	}
559 
560 	dev->internal.attached = false;
561 	if (strcmp(dev->type, "pci") == 0) {
562 		/* If it's a physical device we need to deal with DPDK in
563 		 * a different process and we can't just unset one flag
564 		 * here. We also want to stop using any device resources
565 		 * so that the device isn't "in use" by the userspace driver
566 		 * once we detach it. This would allow attaching the device
567 		 * to a different process, or to a kernel driver like nvme.
568 		 */
569 		detach_rte(dev);
570 	}
571 
572 	cleanup_pci_devices();
573 }
574 
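/* Scan the PCI bus and set the devargs policy for devices SPDK has not
 * dealt with before. With delay_init == true a new device is blocked and
 * only allowed ~2 seconds later (on a subsequent scan); with false it is
 * allowed immediately. Devices permanently blocked by the user keep
 * their policy.
 */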
575 static int
576 scan_pci_bus(bool delay_init)
577 {
578 	struct spdk_pci_driver *driver;
579 	struct rte_pci_device *rte_dev;
580 	uint64_t now;
581 
582 	rte_bus_scan();
583 	now = spdk_get_ticks();
584 
585 	driver = TAILQ_FIRST(&g_pci_drivers);
586 	if (!driver) {
587 		return 0;
588 	}
589 
590 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
591 		struct rte_devargs *da;
592 
593 		da = rte_dev->device.devargs;
594 		if (!da) {
595 			char devargs_str[128];
596 
597 			/* the device was never blocked or allowed */
598 			da = calloc(1, sizeof(*da));
599 			if (!da) {
600 				return -1;
601 			}
602 
603 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
604 			if (rte_devargs_parse(da, devargs_str) != 0) {
605 				free(da);
606 				return -1;
607 			}
608 
609 			rte_devargs_insert(&da);
610 			rte_dev->device.devargs = da;
611 		}
612 
613 		if (get_allowed_at(da)) {
614 			uint64_t allowed_at = get_allowed_at(da);
615 
616 			/* this device was seen by spdk before... */
617 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
618 				da->policy = RTE_DEV_ALLOWED;
619 			}
620 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
621 			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
622 			/* override the policy only if not permanently blocked */
623 
624 			if (delay_init) {
625 				da->policy = RTE_DEV_BLOCKED;
626 				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
627 			} else {
628 				da->policy = RTE_DEV_ALLOWED;
629 				set_allowed_at(da, now);
630 			}
631 		}
632 	}
633 
634 	return 0;
635 }
636 
637 int
638 spdk_pci_device_attach(struct spdk_pci_driver *driver,
639 		       spdk_pci_enum_cb enum_cb,
640 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
641 {
642 	struct spdk_pci_device *dev;
643 	struct rte_pci_device *rte_dev;
644 	struct rte_devargs *da;
645 	int rc;
646 	char bdf[32];
647 
648 	spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
649 
650 	cleanup_pci_devices();
651 
652 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
653 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
654 			break;
655 		}
656 	}
657 
658 	if (dev != NULL && dev->internal.driver == driver) {
659 		pthread_mutex_lock(&g_pci_mutex);
660 		if (dev->internal.attached || dev->internal.pending_removal) {
661 			pthread_mutex_unlock(&g_pci_mutex);
662 			return -1;
663 		}
664 
665 		rc = enum_cb(enum_ctx, dev);
666 		if (rc == 0) {
667 			dev->internal.attached = true;
668 		}
669 		pthread_mutex_unlock(&g_pci_mutex);
670 		return rc;
671 	}
672 
673 	driver->cb_fn = enum_cb;
674 	driver->cb_arg = enum_ctx;
675 
676 	int i = 0;
677 
678 	do {
679 		rc = rte_eal_hotplug_add("pci", bdf, "");
680 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
681 
682 	if (i > 1 && rc == -EEXIST) {
683 		/* Even though the previous request timed out, the device
684 		 * was attached successfully.
685 		 */
686 		rc = 0;
687 	}
688 
689 	driver->cb_arg = NULL;
690 	driver->cb_fn = NULL;
691 
692 	cleanup_pci_devices();
693 
694 	if (rc != 0) {
695 		return -1;
696 	}
697 
698 	/* Explicit attach ignores the allowlist, so if we blocked this
699 	 * device before, allow it now - just for clarity.
700 	 */
701 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
702 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
703 			break;
704 		}
705 	}
706 	assert(dev != NULL);
707 
708 	rte_dev = dev->dev_handle;
709 	da = rte_dev->device.devargs;
710 	if (da && get_allowed_at(da)) {
711 		set_allowed_at(da, spdk_get_ticks());
712 		da->policy = RTE_DEV_ALLOWED;
713 	}
714 
715 	return 0;
716 }
717 
718 /* Note: You can call spdk_pci_enumerate from more than one thread
719  *       simultaneously safely, but you cannot call spdk_pci_enumerate
720  *       and rte_eal_pci_probe simultaneously.
721  */
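/* Illustrative usage sketch, kept as a comment only - the callback and
 * variable names below are hypothetical, not part of this file:
 *
 *	static int
 *	my_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
 *	{
 *		struct spdk_pci_addr addr = spdk_pci_device_get_addr(pci_dev);
 *		char bdf[32];
 *
 *		spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
 *		SPDK_NOTICELOG("found NVMe controller at %s\n", bdf);
 *		return 0;	// 0 attaches the device, > 0 skips it, < 0 aborts
 *	}
 *
 *	// ... after the environment has been initialized:
 *	spdk_pci_enumerate(spdk_pci_nvme_get_driver(), my_enum_cb, NULL);
 */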
722 int
723 spdk_pci_enumerate(struct spdk_pci_driver *driver,
724 		   spdk_pci_enum_cb enum_cb,
725 		   void *enum_ctx)
726 {
727 	struct spdk_pci_device *dev;
728 	int rc;
729 
730 	cleanup_pci_devices();
731 
732 	pthread_mutex_lock(&g_pci_mutex);
733 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
734 		if (dev->internal.attached ||
735 		    dev->internal.driver != driver ||
736 		    dev->internal.pending_removal) {
737 			continue;
738 		}
739 
740 		rc = enum_cb(enum_ctx, dev);
741 		if (rc == 0) {
742 			dev->internal.attached = true;
743 		} else if (rc < 0) {
744 			pthread_mutex_unlock(&g_pci_mutex);
745 			return -1;
746 		}
747 	}
748 	pthread_mutex_unlock(&g_pci_mutex);
749 
750 	if (scan_pci_bus(true) != 0) {
751 		return -1;
752 	}
753 
754 	driver->cb_fn = enum_cb;
755 	driver->cb_arg = enum_ctx;
756 
757 	if (rte_bus_probe() != 0) {
758 		driver->cb_arg = NULL;
759 		driver->cb_fn = NULL;
760 		return -1;
761 	}
762 
763 	driver->cb_arg = NULL;
764 	driver->cb_fn = NULL;
765 
766 	cleanup_pci_devices();
767 	return 0;
768 }
769 
770 void
771 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
772 {
773 	struct spdk_pci_device *dev;
774 
775 	pthread_mutex_lock(&g_pci_mutex);
776 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
777 		fn(ctx, dev);
778 	}
779 	pthread_mutex_unlock(&g_pci_mutex);
780 }
781 
782 int
783 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
784 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
785 {
786 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
787 }
788 
789 int
790 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
791 {
792 	return dev->unmap_bar(dev, bar, addr);
793 }
794 
795 int
796 spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
797 {
798 	struct rte_pci_device *rte_dev = dev->dev_handle;
799 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
800 	return rte_intr_enable(&rte_dev->intr_handle);
801 #else
802 	return rte_intr_enable(rte_dev->intr_handle);
803 #endif
804 }
805 
806 int
807 spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
808 {
809 	struct rte_pci_device *rte_dev = dev->dev_handle;
810 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
811 	return rte_intr_disable(&rte_dev->intr_handle);
812 #else
813 	return rte_intr_disable(rte_dev->intr_handle);
814 #endif
815 }
816 
817 int
818 spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
819 {
820 	struct rte_pci_device *rte_dev = dev->dev_handle;
821 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
822 	return rte_dev->intr_handle.fd;
823 #else
824 	return rte_intr_fd_get(rte_dev->intr_handle);
825 #endif
826 }
827 
828 uint32_t
829 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
830 {
831 	return dev->addr.domain;
832 }
833 
834 uint8_t
835 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
836 {
837 	return dev->addr.bus;
838 }
839 
840 uint8_t
841 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
842 {
843 	return dev->addr.dev;
844 }
845 
846 uint8_t
847 spdk_pci_device_get_func(struct spdk_pci_device *dev)
848 {
849 	return dev->addr.func;
850 }
851 
852 uint16_t
853 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
854 {
855 	return dev->id.vendor_id;
856 }
857 
858 uint16_t
859 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
860 {
861 	return dev->id.device_id;
862 }
863 
864 uint16_t
865 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
866 {
867 	return dev->id.subvendor_id;
868 }
869 
870 uint16_t
871 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
872 {
873 	return dev->id.subdevice_id;
874 }
875 
876 struct spdk_pci_id
877 spdk_pci_device_get_id(struct spdk_pci_device *dev)
878 {
879 	return dev->id;
880 }
881 
882 int
883 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
884 {
885 	return dev->socket_id;
886 }
887 
888 int
889 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
890 {
891 	return dev->cfg_read(dev, value, len, offset);
892 }
893 
894 int
895 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
896 {
897 	return dev->cfg_write(dev, value, len, offset);
898 }
899 
900 int
901 spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
902 {
903 	return spdk_pci_device_cfg_read(dev, value, 1, offset);
904 }
905 
906 int
907 spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
908 {
909 	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
910 }
911 
912 int
913 spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
914 {
915 	return spdk_pci_device_cfg_read(dev, value, 2, offset);
916 }
917 
918 int
919 spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
920 {
921 	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
922 }
923 
924 int
925 spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
926 {
927 	return spdk_pci_device_cfg_read(dev, value, 4, offset);
928 }
929 
930 int
931 spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
932 {
933 	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
934 }
935 
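/* Read the device serial number by walking the PCI Express extended
 * capability list, which starts right after the 256-byte legacy config
 * space (PCI_CFG_SIZE), looking for the Device Serial Number capability
 * (PCI_EXT_CAP_ID_SN). The 64-bit serial is formatted as a 16-character
 * hex string, so the caller must provide at least 17 bytes.
 */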
936 int
937 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
938 {
939 	int err;
940 	uint32_t pos, header = 0;
941 	uint32_t i, buf[2];
942 
943 	if (len < 17) {
944 		return -1;
945 	}
946 
947 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
948 	if (err || !header) {
949 		return -1;
950 	}
951 
952 	pos = PCI_CFG_SIZE;
953 	while (1) {
954 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
955 			if (pos) {
956 				/* skip the header */
957 				pos += 4;
958 				for (i = 0; i < 2; i++) {
959 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
960 					if (err) {
961 						return -1;
962 					}
963 				}
964 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
965 				return 0;
966 			}
967 		}
968 		pos = (header >> 20) & 0xffc;
969 		/* 0 if no other items exist */
970 		if (pos < PCI_CFG_SIZE) {
971 			return -1;
972 		}
973 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
974 		if (err) {
975 			return -1;
976 		}
977 	}
978 	return -1;
979 }
980 
981 struct spdk_pci_addr
982 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
983 {
984 	return dev->addr;
985 }
986 
987 bool
988 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
989 {
990 	return dev->internal.pending_removal;
991 }
992 
993 int
994 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
995 {
996 	if (a1->domain > a2->domain) {
997 		return 1;
998 	} else if (a1->domain < a2->domain) {
999 		return -1;
1000 	} else if (a1->bus > a2->bus) {
1001 		return 1;
1002 	} else if (a1->bus < a2->bus) {
1003 		return -1;
1004 	} else if (a1->dev > a2->dev) {
1005 		return 1;
1006 	} else if (a1->dev < a2->dev) {
1007 		return -1;
1008 	} else if (a1->func > a2->func) {
1009 		return 1;
1010 	} else if (a1->func < a2->func) {
1011 		return -1;
1012 	}
1013 
1014 	return 0;
1015 }
1016 
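/* Device claiming. On Linux a device is claimed by taking an advisory
 * write lock (fcntl F_SETLK) on a per-BDF file under /var/tmp; the PID of
 * the owner is stored in the file so a failed claim can report who holds
 * it. The lock is released in spdk_pci_device_unclaim() or when the
 * process exits. Other platforms currently have no claim implementation.
 */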
1017 #ifdef __linux__
1018 int
1019 spdk_pci_device_claim(struct spdk_pci_device *dev)
1020 {
1021 	int dev_fd;
1022 	char dev_name[64];
1023 	int pid;
1024 	void *dev_map;
1025 	struct flock pcidev_lock = {
1026 		.l_type = F_WRLCK,
1027 		.l_whence = SEEK_SET,
1028 		.l_start = 0,
1029 		.l_len = 0,
1030 	};
1031 
1032 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1033 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1034 
1035 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1036 	if (dev_fd == -1) {
1037 		SPDK_ERRLOG("could not open %s\n", dev_name);
1038 		return -errno;
1039 	}
1040 
1041 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1042 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1043 		close(dev_fd);
1044 		return -errno;
1045 	}
1046 
1047 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1048 		       MAP_SHARED, dev_fd, 0);
1049 	if (dev_map == MAP_FAILED) {
1050 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1051 		close(dev_fd);
1052 		return -errno;
1053 	}
1054 
1055 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1056 		pid = *(int *)dev_map;
1057 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1058 			    " process %d has claimed it\n", dev_name, pid);
1059 		munmap(dev_map, sizeof(int));
1060 		close(dev_fd);
1061 		/* F_SETLK returns unspecified errnos, normalize them */
1062 		return -EACCES;
1063 	}
1064 
1065 	*(int *)dev_map = (int)getpid();
1066 	munmap(dev_map, sizeof(int));
1067 	dev->internal.claim_fd = dev_fd;
1068 	/* Keep dev_fd open to maintain the lock. */
1069 	return 0;
1070 }
1071 
1072 void
1073 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1074 {
1075 	char dev_name[64];
1076 
1077 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1078 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1079 
1080 	close(dev->internal.claim_fd);
1081 	dev->internal.claim_fd = -1;
1082 	unlink(dev_name);
1083 }
1084 #else /* !__linux__ */
1085 int
1086 spdk_pci_device_claim(struct spdk_pci_device *dev)
1087 {
1088 	/* TODO */
1089 	return 0;
1090 }
1091 
1092 void
1093 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1094 {
1095 	/* TODO */
1096 }
1097 #endif /* __linux__ */
1098 
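/* Parse a PCI address string, e.g. spdk_pci_addr_parse(&addr, "0000:af:00.0").
 * Accepted forms, per the sscanf() calls below: DDDD:BB:DD.F, DDDD.BB.DD.F,
 * DDDD:BB:DD (function defaults to 0), BB:DD.F, BB.DD.F (domain defaults
 * to 0), and BB:DD or BB.DD (domain and function default to 0). All
 * fields are hexadecimal.
 */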
1099 int
1100 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1101 {
1102 	unsigned domain, bus, dev, func;
1103 
1104 	if (addr == NULL || bdf == NULL) {
1105 		return -EINVAL;
1106 	}
1107 
1108 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1109 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1110 		/* Matched a full address - all variables are initialized */
1111 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1112 		func = 0;
1113 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1114 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1115 		domain = 0;
1116 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1117 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1118 		domain = 0;
1119 		func = 0;
1120 	} else {
1121 		return -EINVAL;
1122 	}
1123 
1124 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1125 		return -EINVAL;
1126 	}
1127 
1128 	addr->domain = domain;
1129 	addr->bus = bus;
1130 	addr->dev = dev;
1131 	addr->func = func;
1132 
1133 	return 0;
1134 }
1135 
1136 int
1137 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1138 {
1139 	int rc;
1140 
1141 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1142 		      addr->domain, addr->bus,
1143 		      addr->dev, addr->func);
1144 
1145 	if (rc > 0 && (size_t)rc < sz) {
1146 		return 0;
1147 	}
1148 
1149 	return -1;
1150 }
1151 
1152 void
1153 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1154 {
1155 	assert(dev->map_bar != NULL);
1156 	assert(dev->unmap_bar != NULL);
1157 	assert(dev->cfg_read != NULL);
1158 	assert(dev->cfg_write != NULL);
1159 	dev->internal.driver = drv;
1160 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1161 }
1162 
1163 void
1164 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1165 {
1166 	assert(!dev->internal.attached);
1167 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1168 }
1169 
1170 const char *
1171 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1172 {
1173 	return dev->type;
1174 }
1175 
1176 int
1177 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1178 {
1179 	struct rte_devargs *da;
1180 	char devargs_str[128];
1181 
1182 	da = calloc(1, sizeof(*da));
1183 	if (da == NULL) {
1184 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1185 		return -ENOMEM;
1186 	}
1187 
1188 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1189 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1190 	if (rte_devargs_parse(da, devargs_str) != 0) {
1191 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1192 		free(da);
1193 		return -EINVAL;
1194 	}
1195 	da->policy = RTE_DEV_ALLOWED;
1196 	/* Note: if a devargs already exists for this device address, it just gets
1197 	 * overridden.  So we do not need to check if the devargs already exists.
1198 	 * DPDK will take care of memory management for the devargs structure after
1199 	 * it has been inserted, so there's nothing SPDK needs to track.
1200 	 */
1201 	if (rte_devargs_insert(&da) != 0) {
1202 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1203 		free(da);
1204 		return -EINVAL;
1205 	}
1206 
1207 	return 0;
1208 }
1209