xref: /spdk/lib/env_dpdk/pci.c (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2015 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 #include "pci_dpdk.h"
8 
9 #include <rte_alarm.h>
10 #include <rte_devargs.h>
11 #include <rte_pci.h>
12 #include "spdk/env.h"
13 #include "spdk/log.h"
14 #include "spdk/string.h"
15 #include "spdk/memory.h"
16 
17 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
18 
19 /* Compatibility for versions < 20.11 */
20 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
21 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
22 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
23 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
24 #endif
25 
26 #define PCI_CFG_SIZE		256
27 #define PCI_EXT_CAP_ID_SN	0x03
28 
29 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
30  * might cause the internal IPC to misbehave. Just retry in such case.
31  */
32 #define DPDK_HOTPLUG_RETRY_COUNT 4
33 
34 /* DPDK alarm/interrupt thread */
35 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
36 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
37 /* devices hotplugged on a dpdk thread */
38 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
39 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
40 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
41 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
42 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
43 
44 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
45 int pci_device_fini(struct rte_pci_device *device);
46 
47 struct env_devargs {
48 	struct rte_bus	*bus;
49 	char		name[128];
50 	uint64_t	allowed_at;
51 	TAILQ_ENTRY(env_devargs) link;
52 };
53 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
54 
55 static struct env_devargs *
56 find_env_devargs(struct rte_bus *bus, const char *name)
57 {
58 	struct env_devargs *da;
59 
60 	TAILQ_FOREACH(da, &g_env_devargs, link) {
61 		if (bus == da->bus && !strcmp(name, da->name)) {
62 			return da;
63 		}
64 	}
65 
66 	return NULL;
67 }
68 
/* map_bar callback for DPDK-managed ("pci" type) devices.
 *
 * DPDK maps all BARs when it probes the device, so this only reports the
 * existing mapping: virtual address, physical address and length of the
 * requested BAR.  Always returns 0.
 *
 * NOTE(review): assumes dpdk_pci_device_get_mem_resource() never returns
 * NULL for a valid bar index -- confirm against pci_dpdk.h.
 */
static int
map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	struct rte_mem_resource *res;

	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
	*mapped_addr = res->addr;
	*phys_addr = (uint64_t)res->phys_addr;
	*size = (uint64_t)res->len;

	return 0;
}
82 
/* unmap_bar callback for DPDK-managed devices.  DPDK owns the BAR mappings
 * for the lifetime of the device, so there is nothing to undo here.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	(void)device;
	(void)bar;
	(void)addr;

	return 0;
}
88 
/* Config-space read callback: delegate to the DPDK shim.
 * Reads len bytes at offset into value; returns the shim's status code.
 */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}

/* Config-space write callback: delegate to the DPDK shim.
 * Writes len bytes from value at offset; returns the shim's status code.
 */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}
100 
101 static void
102 remove_rte_dev(struct rte_pci_device *rte_dev)
103 {
104 	char bdf[32];
105 	int i = 0, rc;
106 
107 	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
108 	do {
109 		rc = rte_eal_hotplug_remove("pci", bdf);
110 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
111 }
112 
/* rte_eal_alarm callback; the opaque context is the rte_pci_device to remove.
 * Runs on the DPDK alarm/interrupt thread.
 */
static void
detach_rte_cb(void *_dev)
{
	struct rte_pci_device *rte_dev = _dev;

	remove_rte_dev(rte_dev);
}
118 
/* if it's a physical device we need to deal with DPDK on
 * a different process and we can't just unset one flag
 * here. We also want to stop using any device resources
 * so that the device isn't "in use" by the userspace driver
 * once we detach it. This would allow attaching the device
 * to a different process, or to a kernel driver like nvme.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process drives the actual DPDK hot-remove. */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Schedule the removal on the DPDK alarm thread rather than doing it
	 * inline; the callback fires after 1 us.
	 */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
179 
180 void
181 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
182 {
183 	struct spdk_pci_driver *driver;
184 
185 	driver = calloc(1, sizeof(*driver));
186 	if (!driver) {
187 		/* we can't do any better than bailing atm */
188 		return;
189 	}
190 
191 	driver->name = name;
192 	driver->id_table = id_table;
193 	driver->drv_flags = flags;
194 	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
195 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
196 }
197 
/* Convenience lookup for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	struct spdk_pci_driver *nvme_drv = spdk_pci_get_driver("nvme");

	return nvme_drv;
}
203 
204 struct spdk_pci_driver *
205 spdk_pci_get_driver(const char *name)
206 {
207 	struct spdk_pci_driver *driver;
208 
209 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
210 		if (strcmp(driver->name, name) == 0) {
211 			return driver;
212 		}
213 	}
214 
215 	return NULL;
216 }
217 
/* DPDK device-event callback (registered in _pci_env_init()); runs on the
 * DPDK interrupt thread.  On hot-remove, mark the matching SPDK device as
 * pending removal and, if nothing has it attached, schedule the actual
 * DPDK hot-remove via a deferred alarm.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			/* Note: these ERRLOGs are useful for triaging issue #2983. */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 &&
			    !dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		/* dev is non-NULL here only if the loop broke on a match. */
		if (dev != NULL && can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
270 
/* Reconcile the global device list under g_pci_mutex:
 *  - free devices whose removal completed (and drop their vtophys mappings),
 *  - migrate devices hotplugged on the DPDK thread into g_pci_devices
 *    (registering their vtophys mappings).
 * Called from the app thread around attach/detach/enumerate operations.
 */
static void
cleanup_pci_devices(void)
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	/* cleanup removed devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		if (!dev->internal.removed) {
			continue;
		}

		vtophys_pci_device_removed(dev->dev_handle);
		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
		free(dev);
	}

	/* add newly-attached devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
		vtophys_pci_device_added(dev->dev_handle);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
296 
static int scan_pci_bus(bool delay_init);

/* Common tail of pci_env_init()/pci_env_reinit(): scan the PCI bus and
 * (in the primary process only) hook the hot-remove notification.
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}
313 
314 int
315 pci_env_init(void)
316 {
317 	struct spdk_pci_driver *driver;
318 	int rc;
319 
320 	rc = dpdk_pci_init();
321 	if (rc) {
322 		return rc;
323 	}
324 
325 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
326 		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
327 	}
328 
329 	_pci_env_init();
330 	return 0;
331 }
332 
/* Re-initialize the PCI environment after an env reinit (e.g. when DPDK is
 * torn down and brought back up): rescan the bus and re-register the
 * hot-remove callback.
 */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
342 
/* PCI environment teardown: flush removed/hotplugged device lists, warn
 * about devices the application failed to detach, and unregister the
 * hot-remove callback (primary process only).  Leaked devices are only
 * logged, not forcibly detached.
 */
void
pci_env_fini(void)
{
	struct spdk_pci_device *dev;
	char bdf[32];

	cleanup_pci_devices();
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached) {
			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
		}
	}

	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
	}
}
361 
/* DPDK probe callback, invoked for every device matched to one of our
 * registered drivers.  Wraps the rte_pci_device in a new spdk_pci_device,
 * invokes the driver's attach callback if an enumerate/attach is in
 * progress, and queues the device on g_pci_hotplugged_devices until
 * cleanup_pci_devices() migrates it to the main list.
 *
 * Returns 0 on success, -1 on allocation failure, or the driver callback's
 * non-zero return code (in which case the device is not kept).
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Copy the DPDK address/ID info into our own representation. */
	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->socket_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	/* DPDK-backed devices use the rte_* access callbacks. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	/* cb_fn is only set while spdk_pci_device_attach()/enumerate() runs. */
	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
417 
418 static void
419 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
420 {
421 	struct env_devargs *env_da;
422 
423 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
424 	if (env_da == NULL) {
425 		env_da = calloc(1, sizeof(*env_da));
426 		if (env_da == NULL) {
427 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
428 			return;
429 		}
430 		env_da->bus = rte_da->bus;
431 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
432 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
433 	}
434 
435 	env_da->allowed_at = tsc;
436 }
437 
438 static uint64_t
439 get_allowed_at(struct rte_devargs *rte_da)
440 {
441 	struct env_devargs *env_da;
442 
443 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
444 	if (env_da) {
445 		return env_da->allowed_at;
446 	} else {
447 		return 0;
448 	}
449 }
450 
451 int
452 pci_device_fini(struct rte_pci_device *_dev)
453 {
454 	struct spdk_pci_device *dev;
455 
456 	pthread_mutex_lock(&g_pci_mutex);
457 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
458 		if (dev->dev_handle == _dev) {
459 			break;
460 		}
461 	}
462 
463 	if (dev == NULL || dev->internal.attached) {
464 		/* The device might be still referenced somewhere in SPDK. */
465 		pthread_mutex_unlock(&g_pci_mutex);
466 		return -EBUSY;
467 	}
468 
469 	/* remove our allowed_at option */
470 	if (dpdk_pci_device_get_devargs(_dev)) {
471 		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
472 	}
473 
474 	/* It is possible that removed flag was already set when there is a race
475 	 * between the remove notification for this process, and another process
476 	 * that is also detaching from this same device (for example, when using
477 	 * nvme driver in multi-process mode.  So do not assert here.  See
478 	 * #2456 for additional details.
479 	 */
480 	dev->internal.removed = true;
481 	pthread_mutex_unlock(&g_pci_mutex);
482 	return 0;
483 
484 }
485 
/* Detach a previously-attached device: release any claim lock, clear the
 * attached flag, and invoke the detach callback of the provider that owns
 * this device type ("pci" for DPDK-backed devices).  Finally flush the
 * removed/hotplugged device lists.
 *
 * Precondition: dev->internal.attached must be true (asserted).
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Match the device's type string against the registered providers. */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	/* Every device type must have been registered by a provider. */
	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}
509 
/* Scan the PCI bus and manage per-device allow/block policy.
 *
 * Devices never seen before get a devargs entry created for them.  With
 * delay_init == true, brand-new devices are temporarily blocked and only
 * allowed ~2 seconds later (subsequent scans flip RTE_DEV_BLOCKED back to
 * RTE_DEV_ALLOWED once the recorded tick deadline passes) -- this guards
 * against attaching to devices that are still initializing.  With
 * delay_init == false, new devices are allowed immediately.
 *
 * Returns 0 on success, -1 on allocation/parse failure.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	/* No registered drivers means no policy to manage. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			/* DPDK takes ownership of da after insertion. */
			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}
570 
571 static int
572 pci_attach_rte(const struct spdk_pci_addr *addr)
573 {
574 	char bdf[32];
575 	int rc, i = 0;
576 
577 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
578 
579 	do {
580 		rc = rte_eal_hotplug_add("pci", bdf, "");
581 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
582 
583 	if (i > 1 && rc == -EEXIST) {
584 		/* Even though the previous request timed out, the device
585 		 * was attached successfully.
586 		 */
587 		rc = 0;
588 	}
589 
590 	return rc;
591 }
592 
/* Default device provider backed by DPDK's PCI bus.  Devices created by
 * pci_device_init() carry type "pci" and are matched to this provider in
 * spdk_pci_device_attach()/spdk_pci_device_detach().
 */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
600 
/* Attach a single device by address using the given driver.
 *
 * If the device is already known to SPDK and owned by this driver, invoke
 * enum_cb directly (unless it is attached or pending removal).  Otherwise
 * temporarily install enum_cb as the driver's probe callback and ask each
 * registered provider to attach the address; the callback then fires from
 * pci_device_init() during the hotplug.
 *
 * Returns 0 on success, enum_cb's non-zero return, or -1 on failure.
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	/* Fast path: device already probed and owned by this driver. */
	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* Callbacks are picked up by pci_device_init() during the hotplug. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
676 
/* Note: You can call spdk_pci_enumerate from more than one thread
 *       simultaneously safely, but you cannot call spdk_pci_enumerate
 *       and rte_eal_pci_probe simultaneously.
 *
 * Enumerate all devices owned by the driver: first offer already-probed,
 * unattached devices to enum_cb, then rescan the bus (with the 2-second
 * delayed-allow policy) and re-probe so newly plugged devices hit enum_cb
 * via pci_device_init().  A positive enum_cb return skips the device; a
 * negative return aborts the enumeration.
 *
 * Returns 0 on success, -1 on failure.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	/* Callbacks are picked up by pci_device_init() during the probe. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
728 
/* Invoke fn for every known PCI device, holding g_pci_mutex across the walk.
 * The SAFE iteration variant tolerates fn unlinking the current device, but
 * fn must not call back into APIs that take g_pci_mutex (deadlock).
 */
void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
740 
/* Map a BAR via the device's map_bar callback and, when VFIO with the IOMMU
 * enabled is in use, also program a DMA mapping for the BAR so peer-to-peer
 * access works.  The iova chosen matches DPDK's mode: the virtual address
 * under RTE_IOVA_VA (in which case *phys_addr is rewritten to it), the
 * physical address otherwise.
 *
 * Outputs *mapped_addr, *phys_addr, *size; returns 0 on success or a
 * negative errno on failure (the BAR is unmapped again on IOMMU failure).
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}
778 
/* Undo spdk_pci_device_map_bar(): drop the IOMMU DMA mapping keyed by the
 * BAR's virtual address (VFIO builds only), then invoke the device's
 * unmap_bar callback.  Returns 0 on success or a negative errno.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}
795 
/* Enable the device's interrupt via the DPDK shim; returns its status code. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}

/* Disable the device's interrupt via the DPDK shim; returns its status code. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}

/* Return the eventfd associated with the device's interrupt, as reported by
 * the DPDK shim.
 */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}
813 
/* Accessors for the cached PCI address fields (copied from DPDK at probe). */

/* PCI domain (segment) number. */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}

/* PCI bus number. */
uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}

/* PCI device (slot) number. */
uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}

/* PCI function number. */
uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}

/* Accessors for the cached PCI ID fields. */

uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}

uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}

uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}

uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}

/* Return the full PCI ID struct by value. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}

/* NUMA node the device is attached to (as reported by DPDK at probe). */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	return dev->socket_id;
}
873 
/* Read len bytes of PCI config space at offset via the device's callback.
 * Returns the callback's status code (0 on success).
 */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}

/* Write len bytes of PCI config space at offset via the device's callback.
 * Returns the callback's status code (0 on success).
 */
int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}

/* Fixed-width convenience wrappers around the generic cfg read/write. */

int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}

int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}

int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}

int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}

int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}

int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
921 
/* Read the device serial number from the PCIe extended capability list.
 *
 * Walks the extended-capability chain starting at config offset 256
 * (PCI_CFG_SIZE) looking for the Device Serial Number capability
 * (PCI_EXT_CAP_ID_SN), then formats the two 32-bit DWORDs that follow the
 * capability header as a 16-hex-digit string into sn.
 *
 * len must be at least 17 (16 digits + NUL).  Returns 0 on success, -1 if
 * the buffer is too small, the capability is absent, or a config read fails.
 */
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	if (len < 17) {
		return -1;
	}

	/* header == 0 means the device has no extended capabilities. */
	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		/* Low 16 bits of the header hold the capability ID. */
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				/* buf[1] is the upper DWORD of the serial number. */
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		/* Bits 31:20 hold the next-capability offset (DWORD-aligned). */
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	/* Unreachable: the loop always returns. */
	return -1;
}
966 
/* Return the device's PCI address by value. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}

/* True once a hot-remove has been seen (or initiated) for this device.
 * Note this reports pending_removal, not the final removed flag, so it
 * becomes true as soon as removal starts, before cleanup completes.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
978 
979 int
980 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
981 {
982 	if (a1->domain > a2->domain) {
983 		return 1;
984 	} else if (a1->domain < a2->domain) {
985 		return -1;
986 	} else if (a1->bus > a2->bus) {
987 		return 1;
988 	} else if (a1->bus < a2->bus) {
989 		return -1;
990 	} else if (a1->dev > a2->dev) {
991 		return 1;
992 	} else if (a1->dev < a2->dev) {
993 		return -1;
994 	} else if (a1->func > a2->func) {
995 		return 1;
996 	} else if (a1->func < a2->func) {
997 		return -1;
998 	}
999 
1000 	return 0;
1001 }
1002 
1003 #ifdef __linux__
1004 int
1005 spdk_pci_device_claim(struct spdk_pci_device *dev)
1006 {
1007 	int dev_fd;
1008 	char dev_name[64];
1009 	int pid;
1010 	void *dev_map;
1011 	struct flock pcidev_lock = {
1012 		.l_type = F_WRLCK,
1013 		.l_whence = SEEK_SET,
1014 		.l_start = 0,
1015 		.l_len = 0,
1016 	};
1017 
1018 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1019 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1020 
1021 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1022 	if (dev_fd == -1) {
1023 		SPDK_ERRLOG("could not open %s\n", dev_name);
1024 		return -errno;
1025 	}
1026 
1027 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1028 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1029 		close(dev_fd);
1030 		return -errno;
1031 	}
1032 
1033 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1034 		       MAP_SHARED, dev_fd, 0);
1035 	if (dev_map == MAP_FAILED) {
1036 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1037 		close(dev_fd);
1038 		return -errno;
1039 	}
1040 
1041 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1042 		pid = *(int *)dev_map;
1043 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1044 			    " process %d has claimed it\n", dev_name, pid);
1045 		munmap(dev_map, sizeof(int));
1046 		close(dev_fd);
1047 		/* F_SETLK returns unspecified errnos, normalize them */
1048 		return -EACCES;
1049 	}
1050 
1051 	*(int *)dev_map = (int)getpid();
1052 	munmap(dev_map, sizeof(int));
1053 	dev->internal.claim_fd = dev_fd;
1054 	/* Keep dev_fd open to maintain the lock. */
1055 	return 0;
1056 }
1057 
/* Release a claim taken by spdk_pci_device_claim(): closing the fd drops
 * the advisory lock, then the per-BDF lock file is unlinked.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
#else /* !__linux__ */
/* Device claiming is not implemented on non-Linux platforms; claims always
 * succeed without providing any cross-process exclusion.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}

/* No-op counterpart of the claim stub above. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
#endif /* __linux__ */
1084 
1085 int
1086 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1087 {
1088 	unsigned domain, bus, dev, func;
1089 
1090 	if (addr == NULL || bdf == NULL) {
1091 		return -EINVAL;
1092 	}
1093 
1094 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1095 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1096 		/* Matched a full address - all variables are initialized */
1097 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1098 		func = 0;
1099 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1100 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1101 		domain = 0;
1102 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1103 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1104 		domain = 0;
1105 		func = 0;
1106 	} else {
1107 		return -EINVAL;
1108 	}
1109 
1110 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1111 		return -EINVAL;
1112 	}
1113 
1114 	addr->domain = domain;
1115 	addr->bus = bus;
1116 	addr->dev = dev;
1117 	addr->func = func;
1118 
1119 	return 0;
1120 }
1121 
1122 int
1123 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1124 {
1125 	int rc;
1126 
1127 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1128 		      addr->domain, addr->bus,
1129 		      addr->dev, addr->func);
1130 
1131 	if (rc > 0 && (size_t)rc < sz) {
1132 		return 0;
1133 	}
1134 
1135 	return -1;
1136 }
1137 
/* Register an externally-constructed (non-DPDK) device with SPDK.  The
 * caller must have filled in all four access callbacks (asserted).  If drv
 * currently has an attach in progress (cb_fn set), the device is offered to
 * it immediately.
 *
 * Returns 0 on success, or -ECANCELED if the driver callback rejected the
 * device (in which case it is not added to the list).
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}
1162 
/* Unregister a device previously added with spdk_pci_hook_device().  The
 * device must already be detached (asserted); the caller retains ownership
 * of the spdk_pci_device memory.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1169 
/* Register a device provider (e.g. via SPDK_PCI_REGISTER_DEVICE_PROVIDER).
 * Providers are matched by name against spdk_pci_device::type during
 * attach/detach.
 */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1175 
/* Return the device's type string ("pci" for DPDK-backed devices), which
 * identifies the provider that owns it.
 */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1181 
/* Add (or override) a DPDK devargs entry marking the given PCI address as
 * allowed, so a subsequent bus probe may attach it.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL if DPDK
 * rejects the devargs string.
 */
int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden.  So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}
1215