xref: /spdk/lib/env_dpdk/pci.c (revision e0d7428b482257aa6999b8b4cc44159dcc292df9)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2015 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 #include "pci_dpdk.h"
8 
9 #include <rte_alarm.h>
10 #include <rte_devargs.h>
11 #include <rte_pci.h>
12 #include "spdk/env.h"
13 #include "spdk/log.h"
14 #include "spdk/string.h"
15 #include "spdk/memory.h"
16 
17 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
18 
19 #define PCI_CFG_SIZE		256
20 #define PCI_EXT_CAP_ID_SN	0x03
21 
22 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
23  * might cause the internal IPC to misbehave. Just retry in such case.
24  */
25 #define DPDK_HOTPLUG_RETRY_COUNT 4
26 
27 /* DPDK alarm/interrupt thread */
28 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
29 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
30 /* devices hotplugged on a dpdk thread */
31 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
32 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
33 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
34 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
35 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
36 
37 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
38 int pci_device_fini(struct rte_pci_device *device);
39 
40 struct env_devargs {
41 	struct rte_bus	*bus;
42 	char		name[128];
43 	uint64_t	allowed_at;
44 	TAILQ_ENTRY(env_devargs) link;
45 };
46 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
47 
48 static struct env_devargs *
49 find_env_devargs(struct rte_bus *bus, const char *name)
50 {
51 	struct env_devargs *da;
52 
53 	TAILQ_FOREACH(da, &g_env_devargs, link) {
54 		if (bus == da->bus && !strcmp(name, da->name)) {
55 			return da;
56 		}
57 	}
58 
59 	return NULL;
60 }
61 
62 static int
63 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
64 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
65 {
66 	struct rte_mem_resource *res;
67 
68 	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
69 	*mapped_addr = res->addr;
70 	*phys_addr = (uint64_t)res->phys_addr;
71 	*size = (uint64_t)res->len;
72 
73 	return 0;
74 }
75 
/* No-op: for rte-backed devices DPDK owns the BAR mapping lifetime. */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}
81 
/* Read `len` bytes of PCI config space at `offset` via the DPDK shim. */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}
87 
/* Write `len` bytes of PCI config space at `offset` via the DPDK shim. */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}
93 
94 static void
95 remove_rte_dev(struct rte_pci_device *rte_dev)
96 {
97 	char bdf[32];
98 	int i = 0, rc;
99 
100 	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
101 	do {
102 		rc = rte_eal_hotplug_remove("pci", bdf);
103 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
104 }
105 
/* rte_eal_alarm callback wrapper so remove_rte_dev() can run on the
 * DPDK alarm/interrupt thread.
 */
static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}
111 
/* if it's a physical device we need to deal with DPDK on
 * a different process and we can't just unset one flag
 * here. We also want to stop using any device resources
 * so that the device isn't "in use" by the userspace driver
 * once we detach it. This would allow attaching the device
 * to a different process, or to a kernel driver like nvme.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process performs the actual DPDK hot-remove. */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Defer the removal to the DPDK alarm/interrupt thread. */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
172 
173 void
174 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
175 {
176 	struct spdk_pci_driver *driver;
177 
178 	driver = calloc(1, sizeof(*driver));
179 	if (!driver) {
180 		/* we can't do any better than bailing atm */
181 		return;
182 	}
183 
184 	driver->name = name;
185 	driver->id_table = id_table;
186 	driver->drv_flags = flags;
187 	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
188 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
189 }
190 
/* Convenience accessor for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}
196 
197 struct spdk_pci_driver *
198 spdk_pci_get_driver(const char *name)
199 {
200 	struct spdk_pci_driver *driver;
201 
202 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
203 		if (strcmp(driver->name, name) == 0) {
204 			return driver;
205 		}
206 	}
207 
208 	return NULL;
209 }
210 
/* DPDK device-event callback, invoked on the DPDK interrupt thread.
 * On REMOVE: mark the matching SPDK device as pending removal and, if no
 * SPDK driver currently has it attached, schedule a deferred hot-remove.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			/* Match on the DPDK device name (BDF string). */
			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name)) {
				continue;
			}

			/* Note: these ERRLOGs are useful for triaging issue #2983. */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			if (!dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			assert(dev != NULL);
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
267 
268 static void
269 cleanup_pci_devices(void)
270 {
271 	struct spdk_pci_device *dev, *tmp;
272 
273 	pthread_mutex_lock(&g_pci_mutex);
274 	/* cleanup removed devices */
275 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
276 		if (!dev->internal.removed) {
277 			continue;
278 		}
279 
280 		vtophys_pci_device_removed(dev->dev_handle);
281 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
282 		free(dev);
283 	}
284 
285 	/* add newly-attached devices */
286 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
287 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
288 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
289 		vtophys_pci_device_added(dev->dev_handle);
290 	}
291 	pthread_mutex_unlock(&g_pci_mutex);
292 }
293 
294 static int scan_pci_bus(bool delay_init);
295 
/* Common PCI environment setup shared by pci_env_init() and
 * pci_env_reinit(): scan the bus once and, in the primary process,
 * register the global hotremove callback.
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}
310 
311 int
312 pci_env_init(void)
313 {
314 	struct spdk_pci_driver *driver;
315 	int rc;
316 
317 	rc = dpdk_pci_init();
318 	if (rc) {
319 		return rc;
320 	}
321 
322 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
323 		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
324 	}
325 
326 	_pci_env_init();
327 	return 0;
328 }
329 
/* Re-run the common PCI environment setup after an env reinit. */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
339 
340 void
341 pci_env_fini(void)
342 {
343 	struct spdk_pci_device *dev;
344 	char bdf[32];
345 
346 	cleanup_pci_devices();
347 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
348 		if (dev->internal.attached) {
349 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
350 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
351 		}
352 	}
353 
354 	if (spdk_process_is_primary()) {
355 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
356 	}
357 }
358 
/* DPDK probe callback: wrap the rte_pci_device in a struct spdk_pci_device,
 * invoke the owning driver's enum callback (if one is set), and queue the
 * device on the hotplugged list for cleanup_pci_devices() to publish.
 * Returns 0 on success, -1 on allocation failure, or the callback's error.
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Copy the DPDK address/ID info into the SPDK representation. */
	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->numa_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	/* rte-backed access ops. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
414 
415 static void
416 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
417 {
418 	struct env_devargs *env_da;
419 
420 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
421 	if (env_da == NULL) {
422 		env_da = calloc(1, sizeof(*env_da));
423 		if (env_da == NULL) {
424 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
425 			return;
426 		}
427 		env_da->bus = rte_da->bus;
428 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
429 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
430 	}
431 
432 	env_da->allowed_at = tsc;
433 }
434 
435 static uint64_t
436 get_allowed_at(struct rte_devargs *rte_da)
437 {
438 	struct env_devargs *env_da;
439 
440 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
441 	if (env_da) {
442 		return env_da->allowed_at;
443 	} else {
444 		return 0;
445 	}
446 }
447 
448 int
449 pci_device_fini(struct rte_pci_device *_dev)
450 {
451 	struct spdk_pci_device *dev;
452 
453 	pthread_mutex_lock(&g_pci_mutex);
454 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
455 		if (dev->dev_handle == _dev) {
456 			break;
457 		}
458 	}
459 
460 	if (dev == NULL || dev->internal.attached) {
461 		/* The device might be still referenced somewhere in SPDK. */
462 		pthread_mutex_unlock(&g_pci_mutex);
463 		return -EBUSY;
464 	}
465 
466 	/* remove our allowed_at option */
467 	if (dpdk_pci_device_get_devargs(_dev)) {
468 		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
469 	}
470 
471 	/* It is possible that removed flag was already set when there is a race
472 	 * between the remove notification for this process, and another process
473 	 * that is also detaching from this same device (for example, when using
474 	 * nvme driver in multi-process mode.  So do not assert here.  See
475 	 * #2456 for additional details.
476 	 */
477 	dev->internal.removed = true;
478 	pthread_mutex_unlock(&g_pci_mutex);
479 	return 0;
480 
481 }
482 
/* Detach a device that was previously attached via spdk_pci_device_attach()
 * or spdk_pci_enumerate(): release any claim lock, then dispatch to the
 * provider ("pci" for rte-backed devices) that knows how to detach it.
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Find the provider that owns this device type. */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	/* Every device type must have a registered provider. */
	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}
506 
/* Scan the PCI bus and manage per-device allow/block policy.
 *
 * Devices never seen before get a devargs entry. With delay_init set,
 * fresh devices are temporarily blocked and only allowed ~2 seconds
 * later (to let e.g. kernel driver unbind settle - see the allowed_at
 * bookkeeping); with delay_init false they are allowed immediately.
 * Returns 0 on success, -1 on allocation/parse failure.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	/* Without any registered drivers there is nothing to police. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}
567 
568 static int
569 pci_attach_rte(const struct spdk_pci_addr *addr)
570 {
571 	char bdf[32];
572 	int rc, i = 0;
573 
574 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
575 
576 	do {
577 		rc = rte_eal_hotplug_add("pci", bdf, "");
578 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
579 
580 	if (i > 1 && rc == -EEXIST) {
581 		/* Even though the previous request timed out, the device
582 		 * was attached successfully.
583 		 */
584 		rc = 0;
585 	}
586 
587 	return rc;
588 }
589 
/* Default device provider, backed by DPDK's rte hotplug API; owns all
 * devices whose type string is "pci".
 */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
597 
/* Attach a single device by address on behalf of `driver`.
 *
 * If the device is already known and owned by this driver, just invoke
 * the enum callback. Otherwise ask each registered provider to attach it
 * (the provider's probe path will call back into pci_device_init()).
 * Returns 0 on success, -1 on failure (or the enum callback's rc).
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	/* Device already known and owned by this driver: attach in place. */
	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* The provider's probe path reads these; clear them when done. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
673 
/* Note: You can call spdk_pci_enumerate from more than one thread
 *       simultaneously safely, but you cannot call spdk_pci_enumerate
 *       and rte_eal_pci_probe simultaneously.
 */
/* Enumerate all unattached devices owned by `driver`, invoking enum_cb on
 * each already-known device and then probing the bus so new devices reach
 * the callback via pci_device_init(). Returns 0 on success, -1 on error.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	/* First pass: devices already on the list. */
	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	/* Re-scan with delayed init so fresh devices are probed later. */
	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
725 
726 void
727 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
728 {
729 	struct spdk_pci_device *dev, *tmp;
730 
731 	pthread_mutex_lock(&g_pci_mutex);
732 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
733 		fn(ctx, dev);
734 	}
735 	pthread_mutex_unlock(&g_pci_mutex);
736 }
737 
/* Map a device BAR and, when VFIO with the IOMMU enabled is in use, also
 * program the IOMMU so the BAR can be a DMA target. The iova chosen
 * matches DPDK's iova mode (virtual address in VA mode, physical address
 * otherwise). Returns 0 on success, negative errno on failure.
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		/* In VA mode the "physical" address reported is the iova. */
		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}
775 
/* Undo spdk_pci_device_map_bar(): tear down the IOMMU mapping first (when
 * VFIO with the IOMMU is in use), then unmap the BAR itself.
 * Returns 0 on success, negative errno on failure.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}
792 
/* Enable the device's (single) interrupt via the DPDK shim. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}
798 
/* Disable the device's (single) interrupt via the DPDK shim. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}
804 
/* Return the eventfd backing the device's primary interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}
810 
/* Enable multi-vector (MSI-X) interrupts: create `efd_count` eventfds and
 * bind them to the device's interrupt vectors.
 * Returns 0 on success, -EINVAL/-ENOTSUP, or the shim's error code; on a
 * bind failure the created eventfds are cleaned up.
 */
int
spdk_pci_device_enable_interrupts(struct spdk_pci_device *dev, uint32_t efd_count)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int rc;

	if (efd_count == 0) {
		SPDK_ERRLOG("Invalid efd_count (%u)\n", efd_count);
		return -EINVAL;
	}

	/* Detect if device has MSI-X capability */
	if (dpdk_pci_device_interrupt_cap_multi(rte_dev) != 1) {
		SPDK_ERRLOG("VFIO MSI-X capability not present for device %s\n",
			    dpdk_pci_device_get_name(rte_dev));
		return -ENOTSUP;
	}

	/* Create event file descriptors */
	rc = dpdk_pci_device_create_interrupt_efds(rte_dev, efd_count);
	if (rc) {
		SPDK_ERRLOG("Can't setup eventfd (%u)\n", efd_count);
		return rc;
	}

	/* Bind each event fd to each interrupt vector */
	rc = dpdk_pci_device_enable_interrupt(rte_dev);
	if (rc) {
		SPDK_ERRLOG("Failed to enable interrupt for PCI device %s\n",
			    dpdk_pci_device_get_name(rte_dev));
		dpdk_pci_device_delete_interrupt_efds(rte_dev);
		return rc;
	}

	return 0;
}
847 
848 int
849 spdk_pci_device_disable_interrupts(struct spdk_pci_device *dev)
850 {
851 	struct rte_pci_device *rte_dev = dev->dev_handle;
852 	int rc;
853 
854 	rc = dpdk_pci_device_disable_interrupt(rte_dev);
855 	if (rc) {
856 		SPDK_ERRLOG("Failed to disable interrupt for PCI device %s\n",
857 			    dpdk_pci_device_get_name(rte_dev));
858 		return rc;
859 	}
860 
861 	dpdk_pci_device_delete_interrupt_efds(rte_dev);
862 
863 	return 0;
864 }
865 
866 int
867 spdk_pci_device_get_interrupt_efd_by_index(struct spdk_pci_device *dev, uint32_t index)
868 {
869 	if (index == 0) {
870 		return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
871 	} else {
872 		/* Note: The interrupt vector offset starts from 1, and in DPDK these
873 		 * are mapped to efd index 0 onwards.
874 		 */
875 		return dpdk_pci_device_get_interrupt_efd_by_index(dev->dev_handle, index - 1);
876 	}
877 }
878 
/* Return the PCI domain number of the device's address. */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}
884 
/* Return the PCI bus number of the device's address. */
uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}
890 
/* Return the PCI device (slot) number of the device's address. */
uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}
896 
/* Return the PCI function number of the device's address. */
uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}
902 
/* Return the device's PCI vendor ID. */
uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}
908 
/* Return the device's PCI device ID. */
uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}
914 
/* Return the device's PCI subsystem vendor ID. */
uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}
920 
/* Return the device's PCI subsystem device ID. */
uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}
926 
/* Return the device's full PCI ID structure (by value). */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}
932 
/* Return the NUMA node the device is attached to. */
int
spdk_pci_device_get_numa_id(struct spdk_pci_device *dev)
{
	return dev->numa_id;
}
938 
SPDK_LOG_DEPRECATION_REGISTER(pci_device_socket_id, "spdk_pci_device_get_socket_id", "v25.05", 0);

/* Deprecated alias for spdk_pci_device_get_numa_id(); logs a deprecation
 * notice on each call.
 */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	SPDK_LOG_DEPRECATED(pci_device_socket_id);
	return spdk_pci_device_get_numa_id(dev);
}
947 
/* Read `len` bytes of PCI config space via the device's access op. */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}
953 
/* Write `len` bytes of PCI config space via the device's access op. */
int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}
959 
/* Read one byte of PCI config space at `offset`. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}
965 
/* Write one byte of PCI config space at `offset`. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}
971 
/* Read a 16-bit value from PCI config space at `offset`. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}
977 
/* Write a 16-bit value to PCI config space at `offset`. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}
983 
/* Read a 32-bit value from PCI config space at `offset`. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}
989 
/* Write a 32-bit value to PCI config space at `offset`. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
995 
996 int
997 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
998 {
999 	int err;
1000 	uint32_t pos, header = 0;
1001 	uint32_t i, buf[2];
1002 
1003 	if (len < 17) {
1004 		return -1;
1005 	}
1006 
1007 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
1008 	if (err || !header) {
1009 		return -1;
1010 	}
1011 
1012 	pos = PCI_CFG_SIZE;
1013 	while (1) {
1014 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
1015 			if (pos) {
1016 				/* skip the header */
1017 				pos += 4;
1018 				for (i = 0; i < 2; i++) {
1019 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
1020 					if (err) {
1021 						return -1;
1022 					}
1023 				}
1024 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
1025 				return 0;
1026 			}
1027 		}
1028 		pos = (header >> 20) & 0xffc;
1029 		/* 0 if no other items exist */
1030 		if (pos < PCI_CFG_SIZE) {
1031 			return -1;
1032 		}
1033 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
1034 		if (err) {
1035 			return -1;
1036 		}
1037 	}
1038 	return -1;
1039 }
1040 
/* Return the device's PCI address (by value). */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}
1046 
/* Report whether the device has a removal pending. Note this reads the
 * pending_removal flag (set as soon as a hot-remove event is seen), not
 * the removed flag - i.e. it returns true while removal is in progress.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
1052 
1053 int
1054 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
1055 {
1056 	if (a1->domain > a2->domain) {
1057 		return 1;
1058 	} else if (a1->domain < a2->domain) {
1059 		return -1;
1060 	} else if (a1->bus > a2->bus) {
1061 		return 1;
1062 	} else if (a1->bus < a2->bus) {
1063 		return -1;
1064 	} else if (a1->dev > a2->dev) {
1065 		return 1;
1066 	} else if (a1->dev < a2->dev) {
1067 		return -1;
1068 	} else if (a1->func > a2->func) {
1069 		return 1;
1070 	} else if (a1->func < a2->func) {
1071 		return -1;
1072 	}
1073 
1074 	return 0;
1075 }
1076 
1077 #ifdef __linux__
/* Claim exclusive ownership of a PCI device across processes using an
 * fcntl write lock on a per-BDF lock file in /var/tmp. The claiming
 * process's PID is written into the file so a conflicting claimer can be
 * reported. The lock fd is kept open in dev->internal.claim_fd for the
 * lifetime of the claim (fcntl locks are released when the fd closes).
 * Returns 0 on success, -errno on file/mmap errors, -EACCES if another
 * process holds the lock.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	int dev_fd;
	char dev_name[64];
	int pid;
	void *dev_map;
	struct flock pcidev_lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (dev_fd == -1) {
		SPDK_ERRLOG("could not open %s\n", dev_name);
		return -errno;
	}

	/* Size the file to hold one int (the owner PID) before mmapping it. */
	if (ftruncate(dev_fd, sizeof(int)) != 0) {
		SPDK_ERRLOG("could not truncate %s\n", dev_name);
		close(dev_fd);
		return -errno;
	}

	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
		       MAP_SHARED, dev_fd, 0);
	if (dev_map == MAP_FAILED) {
		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
		close(dev_fd);
		return -errno;
	}

	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
		/* Lock held elsewhere - report the PID recorded by the owner. */
		pid = *(int *)dev_map;
		SPDK_ERRLOG("Cannot create lock on device %s, probably"
			    " process %d has claimed it\n", dev_name, pid);
		munmap(dev_map, sizeof(int));
		close(dev_fd);
		/* F_SETLK returns unspecified errnos, normalize them */
		return -EACCES;
	}

	*(int *)dev_map = (int)getpid();
	munmap(dev_map, sizeof(int));
	dev->internal.claim_fd = dev_fd;
	/* Keep dev_fd open to maintain the lock. */
	return 0;
}
1131 
/* Release a claim taken by spdk_pci_device_claim(): closing the fd drops
 * the fcntl lock, then the lock file itself is removed.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
1144 #else /* !__linux__ */
/* Non-Linux stub: claiming is not implemented; always reports success. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}
1151 
/* Non-Linux stub: nothing to release. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
1157 #endif /* __linux__ */
1158 
/* Parse a PCI address string into *addr. Accepted formats (hex fields):
 *   "dddd:bb:dd.f" / "dddd.bb.dd.f" - full address
 *   "dddd:bb:dd"                    - function defaults to 0
 *   "bb:dd.f" / "bb.dd.f"           - domain defaults to 0
 *   "bb:dd" / "bb.dd"               - domain and function default to 0
 * The sscanf alternatives are ordered most-specific first; do not reorder.
 * Returns 0 on success, -EINVAL on a malformed string or out-of-range
 * bus (> 0xFF), device (> 0x1F), or function (> 7).
 */
int
spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
{
	unsigned domain, bus, dev, func;

	if (addr == NULL || bdf == NULL) {
		return -EINVAL;
	}

	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
		/* Matched a full address - all variables are initialized */
	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
		func = 0;
	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
		domain = 0;
	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
		domain = 0;
		func = 0;
	} else {
		return -EINVAL;
	}

	if (bus > 0xFF || dev > 0x1F || func > 7) {
		return -EINVAL;
	}

	addr->domain = domain;
	addr->bus = bus;
	addr->dev = dev;
	addr->func = func;

	return 0;
}
1195 
1196 int
1197 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1198 {
1199 	int rc;
1200 
1201 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1202 		      addr->domain, addr->bus,
1203 		      addr->dev, addr->func);
1204 
1205 	if (rc > 0 && (size_t)rc < sz) {
1206 		return 0;
1207 	}
1208 
1209 	return -1;
1210 }
1211 
/* Register an externally-managed (non-rte) device with the PCI layer.
 * The caller must have filled in all four access ops. If the driver has
 * an enum callback pending, it is invoked immediately; -ECANCELED is
 * returned if the callback rejects the device. Returns 0 on success.
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}
1236 
/* Remove a device previously registered with spdk_pci_hook_device().
 * The device must already be detached; its memory stays owned by the
 * caller.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1243 
/* Register a device provider (e.g. the built-in rte "pci" provider) that
 * can attach/detach devices of its type.
 */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1249 
/* Return the device's provider type string (e.g. "pci"). */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1255 
1256 int
1257 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1258 {
1259 	struct rte_devargs *da;
1260 	char devargs_str[128];
1261 
1262 	da = calloc(1, sizeof(*da));
1263 	if (da == NULL) {
1264 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1265 		return -ENOMEM;
1266 	}
1267 
1268 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1269 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1270 	if (rte_devargs_parse(da, devargs_str) != 0) {
1271 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1272 		free(da);
1273 		return -EINVAL;
1274 	}
1275 	da->policy = RTE_DEV_ALLOWED;
1276 	/* Note: if a devargs already exists for this device address, it just gets
1277 	 * overridden.  So we do not need to check if the devargs already exists.
1278 	 * DPDK will take care of memory management for the devargs structure after
1279 	 * it has been inserted, so there's nothing SPDK needs to track.
1280 	 */
1281 	if (rte_devargs_insert(&da) != 0) {
1282 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1283 		free(da);
1284 		return -EINVAL;
1285 	}
1286 
1287 	return 0;
1288 }
1289