xref: /spdk/lib/env_dpdk/pci.c (revision 4586880f596e61c5a599d0766bb47c004bbd2dd6)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2015 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 #include "pci_dpdk.h"
8 
9 #include <rte_alarm.h>
10 #include <rte_devargs.h>
11 #include <rte_pci.h>
12 #include "spdk/env.h"
13 #include "spdk/log.h"
14 #include "spdk/string.h"
15 #include "spdk/memory.h"
16 
17 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
18 
19 #define PCI_CFG_SIZE		256
20 #define PCI_EXT_CAP_ID_SN	0x03
21 
22 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
23  * might cause the internal IPC to misbehave. Just retry in such case.
24  */
25 #define DPDK_HOTPLUG_RETRY_COUNT 4
26 
27 /* DPDK alarm/interrupt thread */
28 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
29 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
30 /* devices hotplugged on a dpdk thread */
31 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
32 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
33 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
34 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
35 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
36 
37 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
38 int pci_device_fini(struct rte_pci_device *device);
39 
/* Tracks, per DPDK devargs entry, the tick at which SPDK is allowed to use
 * the device. Used by scan_pci_bus() to delay-init newly hotplugged devices.
 */
struct env_devargs {
	struct rte_bus	*bus;		/* bus the devargs entry belongs to */
	char		name[128];	/* devargs name, matched with strcmp */
	uint64_t	allowed_at;	/* tick count after which the device may be probed; 0 = never seen */
	TAILQ_ENTRY(env_devargs) link;
};
static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
47 
48 static struct env_devargs *
49 find_env_devargs(struct rte_bus *bus, const char *name)
50 {
51 	struct env_devargs *da;
52 
53 	TAILQ_FOREACH(da, &g_env_devargs, link) {
54 		if (bus == da->bus && !strcmp(name, da->name)) {
55 			return da;
56 		}
57 	}
58 
59 	return NULL;
60 }
61 
62 static int
63 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
64 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
65 {
66 	struct rte_mem_resource *res;
67 
68 	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
69 	*mapped_addr = res->addr;
70 	*phys_addr = (uint64_t)res->phys_addr;
71 	*size = (uint64_t)res->len;
72 
73 	return 0;
74 }
75 
/* unmap_bar callback for rte-backed devices: nothing to do, DPDK owns the
 * mapping for the device's whole lifetime.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}
81 
/* cfg_read callback: read `len` bytes of PCI config space at `offset`. */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}
87 
/* cfg_write callback: write `len` bytes of PCI config space at `offset`. */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}
93 
94 static void
95 remove_rte_dev(struct rte_pci_device *rte_dev)
96 {
97 	char bdf[32];
98 	int i = 0, rc;
99 
100 	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
101 	do {
102 		rc = rte_eal_hotplug_remove("pci", bdf);
103 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
104 }
105 
/* rte_eal_alarm-compatible trampoline around remove_rte_dev(). */
static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}
111 
112 /* if it's a physical device we need to deal with DPDK on
113  * a different process and we can't just unset one flag
114  * here. We also want to stop using any device resources
115  * so that the device isn't "in use" by the userspace driver
116  * once we detach it. This would allow attaching the device
117  * to a different process, or to a kernel driver like nvme.
118  */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process owns the DPDK hotplug state; secondaries
	 * just return and leave the rte device alone. */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Schedule the removal on the DPDK interrupt thread (fires in 1 us). */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
172 
173 void
174 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
175 {
176 	struct spdk_pci_driver *driver;
177 
178 	driver = calloc(1, sizeof(*driver));
179 	if (!driver) {
180 		/* we can't do any better than bailing atm */
181 		return;
182 	}
183 
184 	driver->name = name;
185 	driver->id_table = id_table;
186 	driver->drv_flags = flags;
187 	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
188 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
189 }
190 
/* Convenience lookup for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}
196 
197 struct spdk_pci_driver *
198 spdk_pci_get_driver(const char *name)
199 {
200 	struct spdk_pci_driver *driver;
201 
202 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
203 		if (strcmp(driver->name, name) == 0) {
204 			return driver;
205 		}
206 	}
207 
208 	return NULL;
209 }
210 
/* DPDK device-event callback (runs on the EAL interrupt thread). On a REMOVE
 * event, mark the matching SPDK device as pending removal and, if no SPDK
 * consumer holds it, schedule a deferred hot-remove.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			/* Match on the DPDK device name (BDF string). */
			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name)) {
				continue;
			}

			/* Note: these ERRLOGs are useful for triaging issue #2983. */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			if (!dev->internal.pending_removal) {
				/* Only detach right away if nobody attached it. */
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			assert(dev != NULL);
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
267 
/* Reconcile the global device list under g_pci_mutex: free devices that DPDK
 * already removed, then promote devices hotplugged on a DPDK thread into the
 * main list, keeping vtophys translation tables in sync in both directions.
 */
static void
cleanup_pci_devices(void)
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	/* cleanup removed devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		if (!dev->internal.removed) {
			continue;
		}

		vtophys_pci_device_removed(dev->dev_handle);
		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
		free(dev);
	}

	/* add newly-attached devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
		vtophys_pci_device_added(dev->dev_handle);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
293 
294 static int scan_pci_bus(bool delay_init);
295 
/* Shared tail of pci_env_init()/pci_env_reinit(): scan the bus without
 * delaying any device, and install the global hotremove callback (primary
 * process only).
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}
310 
311 int
312 pci_env_init(void)
313 {
314 	struct spdk_pci_driver *driver;
315 	int rc;
316 
317 	rc = dpdk_pci_init();
318 	if (rc) {
319 		return rc;
320 	}
321 
322 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
323 		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
324 	}
325 
326 	_pci_env_init();
327 	return 0;
328 }
329 
/* Re-initialize the PCI environment after an env reinit (e.g. fork). */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
339 
340 void
341 pci_env_fini(void)
342 {
343 	struct spdk_pci_device *dev;
344 	char bdf[32];
345 
346 	cleanup_pci_devices();
347 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
348 		if (dev->internal.attached) {
349 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
350 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
351 		}
352 	}
353 
354 	if (spdk_process_is_primary()) {
355 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
356 	}
357 }
358 
/* DPDK probe callback: wrap a freshly probed rte_pci_device in an
 * spdk_pci_device, invoke the driver's enum callback (if an enumeration is in
 * progress), and queue the device on the hotplugged list for later promotion
 * by cleanup_pci_devices(). Returns 0 on success, non-zero to reject the
 * device (DPDK then keeps it unbound from this driver).
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Copy the DPDK address/ID info into SPDK's own representation. */
	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->numa_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	/* rte-backed accessor callbacks. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	/* cb_fn is only set while spdk_pci_enumerate()/attach() is running. */
	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
414 
415 static void
416 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
417 {
418 	struct env_devargs *env_da;
419 
420 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
421 	if (env_da == NULL) {
422 		env_da = calloc(1, sizeof(*env_da));
423 		if (env_da == NULL) {
424 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
425 			return;
426 		}
427 		env_da->bus = rte_da->bus;
428 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
429 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
430 	}
431 
432 	env_da->allowed_at = tsc;
433 }
434 
435 static uint64_t
436 get_allowed_at(struct rte_devargs *rte_da)
437 {
438 	struct env_devargs *env_da;
439 
440 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
441 	if (env_da) {
442 		return env_da->allowed_at;
443 	} else {
444 		return 0;
445 	}
446 }
447 
/* DPDK remove callback: mark the matching SPDK device as removed so that
 * cleanup_pci_devices() can free it later. Returns -EBUSY if the device is
 * unknown or still attached by an SPDK consumer.
 */
int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	/* dev is NULL after the loop if no entry matched. */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	if (dev == NULL || dev->internal.attached) {
		/* The device might be still referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (dpdk_pci_device_get_devargs(_dev)) {
		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
	}

	/* It is possible that removed flag was already set when there is a race
	 * between the remove notification for this process, and another process
	 * that is also detaching from this same device (for example, when using
	 * nvme driver in multi-process mode.  So do not assert here.  See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;

}
482 
/* Detach a previously attached device: release any claim file lock, dispatch
 * to the provider (matched by dev->type) that originally attached it, and
 * flush the device lists. The device must currently be attached.
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Find the provider that owns this device type ("pci", etc.). */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}
506 
507 static int
508 scan_pci_bus(bool delay_init)
509 {
510 	struct rte_dev_iterator it;
511 	struct rte_device *rte_dev;
512 	uint64_t now;
513 
514 	dpdk_bus_scan();
515 	now = spdk_get_ticks();
516 
517 	if (!TAILQ_FIRST(&g_pci_drivers)) {
518 		return 0;
519 	}
520 
521 	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
522 		struct rte_devargs *da;
523 
524 		da = dpdk_device_get_devargs(rte_dev);
525 		if (!da) {
526 			char devargs_str[128];
527 
528 			/* the device was never blocked or allowed */
529 			da = calloc(1, sizeof(*da));
530 			if (!da) {
531 				return -1;
532 			}
533 
534 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
535 			if (rte_devargs_parse(da, devargs_str) != 0) {
536 				free(da);
537 				return -1;
538 			}
539 
540 			rte_devargs_insert(&da);
541 			dpdk_device_set_devargs(rte_dev, da);
542 		}
543 
544 		if (get_allowed_at(da)) {
545 			uint64_t allowed_at = get_allowed_at(da);
546 
547 			/* this device was seen by spdk before... */
548 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
549 				da->policy = RTE_DEV_ALLOWED;
550 			}
551 		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
552 			   da->policy != RTE_DEV_BLOCKED) {
553 			/* override the policy only if not permanently blocked */
554 
555 			if (delay_init) {
556 				da->policy = RTE_DEV_BLOCKED;
557 				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
558 			} else {
559 				da->policy = RTE_DEV_ALLOWED;
560 				set_allowed_at(da, now);
561 			}
562 		}
563 	}
564 
565 	return 0;
566 }
567 
568 static int
569 pci_attach_rte(const struct spdk_pci_addr *addr)
570 {
571 	char bdf[32];
572 	int rc, i = 0;
573 
574 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
575 
576 	do {
577 		rc = rte_eal_hotplug_add("pci", bdf, "");
578 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
579 
580 	if (i > 1 && rc == -EEXIST) {
581 		/* Even though the previous request timed out, the device
582 		 * was attached successfully.
583 		 */
584 		rc = 0;
585 	}
586 
587 	return rc;
588 }
589 
/* Default device provider backing plain DPDK-managed "pci" devices. */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
597 
/* Attach a single device at `pci_address` for `driver`, invoking `enum_cb`
 * exactly once on success. If the device is already known and bound to this
 * driver, the callback runs directly; otherwise each registered provider is
 * asked to hot-attach it (the callback then fires from pci_device_init()).
 * Returns 0 on success, negative on failure.
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	/* Known device already bound to this driver: attach in place. */
	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* Expose the callback to pci_device_init() for the probe window. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
673 
674 /* Note: You can call spdk_pci_enumerate from more than one thread
675  *       simultaneously safely, but you cannot call spdk_pci_enumerate
676  *       and rte_eal_pci_probe simultaneously.
677  */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	/* First offer already-known, unattached devices of this driver. */
	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
		/* rc > 0: callback skipped this device; keep enumerating. */
	}
	pthread_mutex_unlock(&g_pci_mutex);

	/* Rescan with delay_init=true so brand-new devices get a settle window. */
	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	/* Expose the callback to pci_device_init() for the probe window. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
725 
/* Invoke `fn` on every known PCI device while holding g_pci_mutex. The safe
 * iterator lets `fn` remove the current device from the list, but `fn` must
 * not take g_pci_mutex itself.
 */
void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
737 
/* Map a PCI BAR into the process and, when VFIO with an enabled IOMMU is in
 * use, also program a DMA mapping for it so the BAR is reachable via DMA.
 * The iova chosen matches DPDK's IOVA mode: the virtual address in VA mode,
 * the physical address otherwise. Returns 0 on success, negative on failure.
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		/* In VA mode the "physical" address reported is the vaddr. */
		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}
775 
/* Undo spdk_pci_device_map_bar(): drop the IOMMU DMA mapping (if one was
 * created) and then unmap the BAR through the device's callback.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}
792 
/* Enable the device's interrupt through the DPDK backend. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}

/* Disable the device's interrupt through the DPDK backend. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}

/* Return the eventfd backing the device's interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}
810 
/* PCI address accessors (values captured at probe time in pci_device_init). */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}

uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}

uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}

uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}

/* PCI ID accessors. */
uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}

uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}

uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}

uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}

/* Return the full ID structure by value. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}

/* Return the NUMA node the device is attached to. */
int
spdk_pci_device_get_numa_id(struct spdk_pci_device *dev)
{
	return dev->numa_id;
}
870 
SPDK_LOG_DEPRECATION_REGISTER(pci_device_socket_id, "spdk_pci_device_get_socket_id", "v25.05", 0);

/* Deprecated alias for spdk_pci_device_get_numa_id(); logs a deprecation
 * notice on use.
 */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	SPDK_LOG_DEPRECATED(pci_device_socket_id);
	return spdk_pci_device_get_numa_id(dev);
}
879 
/* Read `len` bytes of config space at `offset` via the device's callback. */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}

/* Write `len` bytes of config space at `offset` via the device's callback. */
int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}
891 
/* Fixed-width config-space accessors; thin wrappers over
 * spdk_pci_device_cfg_read()/cfg_write() with the matching length.
 */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}

int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}

int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}

int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}

int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}

int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
927 
/* Walk the PCI extended capability list (starting at offset 256) looking for
 * the Device Serial Number capability, and format it as a 16-hex-digit string
 * into `sn`. `len` must be at least 17 (16 digits + NUL). Returns 0 on
 * success, -1 if the capability is absent or any config read fails.
 */
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	if (len < 17) {
		return -1;
	}

	/* First extended capability header; 0 means no extended config space. */
	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		/* Low 16 bits of the header hold the capability ID. */
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				/* Serial number is two 32-bit words, low word first. */
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		/* Bits 20-31 hold the next-capability offset (dword aligned). */
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	/* Unreachable: the loop only exits via the returns above. */
	return -1;
}
972 
/* Return the device's PCI address by value. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}

/* Report whether removal of the device has been initiated. Note this checks
 * pending_removal (removal requested), not the final removed flag.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
984 
985 int
986 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
987 {
988 	if (a1->domain > a2->domain) {
989 		return 1;
990 	} else if (a1->domain < a2->domain) {
991 		return -1;
992 	} else if (a1->bus > a2->bus) {
993 		return 1;
994 	} else if (a1->bus < a2->bus) {
995 		return -1;
996 	} else if (a1->dev > a2->dev) {
997 		return 1;
998 	} else if (a1->dev < a2->dev) {
999 		return -1;
1000 	} else if (a1->func > a2->func) {
1001 		return 1;
1002 	} else if (a1->func < a2->func) {
1003 		return -1;
1004 	}
1005 
1006 	return 0;
1007 }
1008 
1009 #ifdef __linux__
1010 int
1011 spdk_pci_device_claim(struct spdk_pci_device *dev)
1012 {
1013 	int dev_fd;
1014 	char dev_name[64];
1015 	int pid;
1016 	void *dev_map;
1017 	struct flock pcidev_lock = {
1018 		.l_type = F_WRLCK,
1019 		.l_whence = SEEK_SET,
1020 		.l_start = 0,
1021 		.l_len = 0,
1022 	};
1023 
1024 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1025 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1026 
1027 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1028 	if (dev_fd == -1) {
1029 		SPDK_ERRLOG("could not open %s\n", dev_name);
1030 		return -errno;
1031 	}
1032 
1033 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1034 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1035 		close(dev_fd);
1036 		return -errno;
1037 	}
1038 
1039 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1040 		       MAP_SHARED, dev_fd, 0);
1041 	if (dev_map == MAP_FAILED) {
1042 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1043 		close(dev_fd);
1044 		return -errno;
1045 	}
1046 
1047 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1048 		pid = *(int *)dev_map;
1049 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1050 			    " process %d has claimed it\n", dev_name, pid);
1051 		munmap(dev_map, sizeof(int));
1052 		close(dev_fd);
1053 		/* F_SETLK returns unspecified errnos, normalize them */
1054 		return -EACCES;
1055 	}
1056 
1057 	*(int *)dev_map = (int)getpid();
1058 	munmap(dev_map, sizeof(int));
1059 	dev->internal.claim_fd = dev_fd;
1060 	/* Keep dev_fd open to maintain the lock. */
1061 	return 0;
1062 }
1063 
/* Release the claim taken by spdk_pci_device_claim(): closing the fd drops
 * the fcntl lock, then the lock file itself is unlinked.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
1076 #else /* !__linux__ */
/* Non-Linux stub: claiming always "succeeds" without providing any
 * cross-process exclusion.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}

/* Non-Linux stub: nothing to release. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
1089 #endif /* __linux__ */
1090 
1091 int
1092 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1093 {
1094 	unsigned domain, bus, dev, func;
1095 
1096 	if (addr == NULL || bdf == NULL) {
1097 		return -EINVAL;
1098 	}
1099 
1100 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1101 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1102 		/* Matched a full address - all variables are initialized */
1103 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1104 		func = 0;
1105 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1106 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1107 		domain = 0;
1108 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1109 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1110 		domain = 0;
1111 		func = 0;
1112 	} else {
1113 		return -EINVAL;
1114 	}
1115 
1116 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1117 		return -EINVAL;
1118 	}
1119 
1120 	addr->domain = domain;
1121 	addr->bus = bus;
1122 	addr->dev = dev;
1123 	addr->func = func;
1124 
1125 	return 0;
1126 }
1127 
1128 int
1129 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1130 {
1131 	int rc;
1132 
1133 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1134 		      addr->domain, addr->bus,
1135 		      addr->dev, addr->func);
1136 
1137 	if (rc > 0 && (size_t)rc < sz) {
1138 		return 0;
1139 	}
1140 
1141 	return -1;
1142 }
1143 
/* Insert an externally managed (non-rte) device into the global list. The
 * caller must have populated all four accessor callbacks. If an enumeration
 * is in progress for `drv`, its callback is invoked first; a non-zero return
 * rejects the device with -ECANCELED. Returns 0 on success.
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}
1168 
/* Remove a hooked device from the global list; it must be detached first. */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1175 
/* Register an additional device provider (e.g. vfio-user) for attach/detach. */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1181 
/* Return the provider type string of the device (e.g. "pci"). */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1187 
1188 int
1189 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1190 {
1191 	struct rte_devargs *da;
1192 	char devargs_str[128];
1193 
1194 	da = calloc(1, sizeof(*da));
1195 	if (da == NULL) {
1196 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1197 		return -ENOMEM;
1198 	}
1199 
1200 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1201 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1202 	if (rte_devargs_parse(da, devargs_str) != 0) {
1203 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1204 		free(da);
1205 		return -EINVAL;
1206 	}
1207 	da->policy = RTE_DEV_ALLOWED;
1208 	/* Note: if a devargs already exists for this device address, it just gets
1209 	 * overridden.  So we do not need to check if the devargs already exists.
1210 	 * DPDK will take care of memory management for the devargs structure after
1211 	 * it has been inserted, so there's nothing SPDK needs to track.
1212 	 */
1213 	if (rte_devargs_insert(&da) != 0) {
1214 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1215 		free(da);
1216 		return -EINVAL;
1217 	}
1218 
1219 	return 0;
1220 }
1221