/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2015 Intel Corporation.
 *   All rights reserved.
 */

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_alarm.h>
#include <rte_devargs.h>
#include <rte_pci.h>
#include "spdk/env.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/memory.h"

#define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"

/* Compatibility for versions < 20.11 */
#if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
#define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
#define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
#define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
#endif

#define PCI_CFG_SIZE		256
#define PCI_EXT_CAP_ID_SN	0x03

/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
 * might cause the internal IPC to misbehave. Just retry in such case.
 */
#define DPDK_HOTPLUG_RETRY_COUNT 4

/* DPDK alarm/interrupt thread */
static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
/* devices hotplugged on a dpdk thread */
static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);

int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
int pci_device_fini(struct rte_pci_device *device);

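/* Per-device devargs state that SPDK tracks on top of DPDK's rte_devargs.
 * allowed_at holds the tick count at which the device may be probed; it is
 * how scan_pci_bus() implements the delayed hotplug attach.
 */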
struct env_devargs {
	struct rte_bus	*bus;
	char		name[128];
	uint64_t	allowed_at;
	TAILQ_ENTRY(env_devargs) link;
};
static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);

static struct env_devargs *
find_env_devargs(struct rte_bus *bus, const char *name)
{
	struct env_devargs *da;

	TAILQ_FOREACH(da, &g_env_devargs, link) {
		if (bus == da->bus && !strcmp(name, da->name)) {
			return da;
		}
	}

	return NULL;
}

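/* BAR access callbacks for devices backed by a DPDK rte_pci_device. DPDK
 * already maps the BARs during probe, so mapping here just looks up the
 * existing mem_resource and unmapping is a no-op.
 */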
static int
map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	struct rte_mem_resource *res;

	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
	*mapped_addr = res->addr;
	*phys_addr = (uint64_t)res->phys_addr;
	*size = (uint64_t)res->len;

	return 0;
}

static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}

static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}

static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}

static void
remove_rte_dev(struct rte_pci_device *rte_dev)
{
	char bdf[32];
	int i = 0, rc;

	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
	do {
		rc = rte_eal_hotplug_remove("pci", bdf);
	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
}

static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}

/* If it's a physical device, we need to coordinate with DPDK in other
 * processes and can't just unset one flag here. We also want to stop
 * using any device resources so that the device isn't "in use" by the
 * userspace driver once we detach it. This allows attaching the device
 * to a different process, or to a kernel driver like nvme.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	if (!spdk_process_is_primary()) {
		remove_rte_dev(rte_dev);
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {
		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}

void
spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
{
	struct spdk_pci_driver *driver;

	driver = calloc(1, sizeof(*driver));
	if (!driver) {
		/* we can't do any better than bailing atm */
		return;
	}

	driver->name = name;
	driver->id_table = id_table;
	driver->drv_flags = flags;
	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
}

struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}

struct spdk_pci_driver *
spdk_pci_get_driver(const char *name)
{
	struct spdk_pci_driver *driver;

	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
		if (strcmp(driver->name, name) == 0) {
			return driver;
		}
	}

	return NULL;
}

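/* Device event callback registered with rte_dev_event_callback_register();
 * DPDK invokes it from its interrupt/alarm thread on device add or remove.
 */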
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 &&
			    !dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (dev != NULL && can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked from within the eal
			 * interrupt callback, and an interrupt callback cannot be
			 * unregistered until it has finished, we cannot detach the
			 * device directly here. Instead, finish this callback quickly
			 * and detach the device via a deferred removal (the alarm
			 * below). This is a workaround; once device detaching is
			 * moved into the eal, the deferred removal can go away.
			 */
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}

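/* Reconcile the global device list: free devices that were flagged as
 * removed and move devices hotplugged on a DPDK thread onto the main list,
 * updating the vtophys tracking in both cases.
 */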
static void
cleanup_pci_devices(void)
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	/* cleanup removed devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		if (!dev->internal.removed) {
			continue;
		}

		vtophys_pci_device_removed(dev->dev_handle);
		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
		free(dev);
	}

	/* add newly-attached devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
		vtophys_pci_device_added(dev->dev_handle);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}

static int scan_pci_bus(bool delay_init);

static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}

int
pci_env_init(void)
{
	struct spdk_pci_driver *driver;
	int rc;

	rc = dpdk_pci_init();
	if (rc) {
		return rc;
	}

	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
	}

	_pci_env_init();
	return 0;
}

void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}

void
pci_env_fini(void)
{
	struct spdk_pci_device *dev;
	char bdf[32];

	cleanup_pci_devices();
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached) {
			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
		}
	}

	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
	}
}

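/* Probe callback passed to dpdk_pci_driver_register(); DPDK calls it for
 * each PCI device matching one of our registered drivers.
 */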
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->socket_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}

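/* Record the tick count at which the given device may be attached. A value
 * of 0 means the device either hasn't been seen by SPDK yet or had its
 * entry cleared on removal.
 */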
static void
set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
{
	struct env_devargs *env_da;

	env_da = find_env_devargs(rte_da->bus, rte_da->name);
	if (env_da == NULL) {
		env_da = calloc(1, sizeof(*env_da));
		if (env_da == NULL) {
			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
			return;
		}
		env_da->bus = rte_da->bus;
		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
	}

	env_da->allowed_at = tsc;
}

static uint64_t
get_allowed_at(struct rte_devargs *rte_da)
{
	struct env_devargs *env_da;

	env_da = find_env_devargs(rte_da->bus, rte_da->name);
	if (env_da) {
		return env_da->allowed_at;
	} else {
		return 0;
	}
}

int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	if (dev == NULL || dev->internal.attached) {
		/* The device might still be referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (dpdk_pci_device_get_devargs(_dev)) {
		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
	}

	/* It is possible that the removed flag was already set when there is a race
	 * between the remove notification for this process and another process
	 * that is also detaching from this same device (for example, when using
	 * the nvme driver in multi-process mode).  So do not assert here.  See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}

void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}

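/* Scan the PCI bus and set the devargs policy for each device. With
 * delay_init set, a device seen for the first time is blocked and only
 * allowed two seconds later (via set_allowed_at()), so a freshly
 * hotplugged device is not probed immediately.
 */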
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}

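/* Attach a single device through DPDK's hotplug API, retrying on IPC
 * failures (-ENOMSG) just like remove_rte_dev() does.
 */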
static int
pci_attach_rte(const struct spdk_pci_addr *addr)
{
	char bdf[32];
	int rc, i = 0;

	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);

	do {
		rc = rte_eal_hotplug_add("pci", bdf, "");
	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);

	if (i > 1 && rc == -EEXIST) {
		/* Even though the previous request timed out, the device
		 * was attached successfully.
		 */
		rc = 0;
	}

	return rc;
}

static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);

int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}

/* Note: You can call spdk_pci_enumerate from more than one thread
 *       simultaneously safely, but you cannot call spdk_pci_enumerate
 *       and rte_eal_pci_probe simultaneously.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}

void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}

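/* Map a device BAR and, if the IOMMU is enabled, also register the mapping
 * so the BAR can be used as a DMA target. The IOVA is chosen to match
 * DPDK's IOVA mode: the virtual address in VA mode, the physical address
 * otherwise.
 */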
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t)(*mapped_addr), *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}

int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}

int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}

int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}

int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}

uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}

uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}

uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}

uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}

uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}

uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}

uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}

uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}

struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}

int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	return dev->socket_id;
}

int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}

int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}

int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}

int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}

int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}

int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}

int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}

int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}

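/* Read the device serial number by walking the PCI Express extended
 * capability list, which starts at config offset 0x100 (PCI_CFG_SIZE),
 * until the Device Serial Number capability (PCI_EXT_CAP_ID_SN) is found.
 * The caller must provide a buffer of at least 17 bytes.
 */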
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	if (len < 17) {
		return -1;
	}

	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	return -1;
}

struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}

bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}

int
spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
{
	if (a1->domain > a2->domain) {
		return 1;
	} else if (a1->domain < a2->domain) {
		return -1;
	} else if (a1->bus > a2->bus) {
		return 1;
	} else if (a1->bus < a2->bus) {
		return -1;
	} else if (a1->dev > a2->dev) {
		return 1;
	} else if (a1->dev < a2->dev) {
		return -1;
	} else if (a1->func > a2->func) {
		return 1;
	} else if (a1->func < a2->func) {
		return -1;
	}

	return 0;
}

#ifdef __linux__
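/* Claim a device for exclusive use by taking a write lock on a file under
 * /var/tmp named after the device's BDF. The PID of the claiming process
 * is stored in the file so a failed claim can report who holds the device.
 */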
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	int dev_fd;
	char dev_name[64];
	int pid;
	void *dev_map;
	struct flock pcidev_lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (dev_fd == -1) {
		SPDK_ERRLOG("could not open %s\n", dev_name);
		return -errno;
	}

	if (ftruncate(dev_fd, sizeof(int)) != 0) {
		SPDK_ERRLOG("could not truncate %s\n", dev_name);
		close(dev_fd);
		return -errno;
	}

	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
		       MAP_SHARED, dev_fd, 0);
	if (dev_map == MAP_FAILED) {
		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
		close(dev_fd);
		return -errno;
	}

	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
		pid = *(int *)dev_map;
		SPDK_ERRLOG("Cannot create lock on device %s, probably"
			    " process %d has claimed it\n", dev_name, pid);
		munmap(dev_map, sizeof(int));
		close(dev_fd);
		/* F_SETLK returns unspecified errnos, normalize them */
		return -EACCES;
	}

	*(int *)dev_map = (int)getpid();
	munmap(dev_map, sizeof(int));
	dev->internal.claim_fd = dev_fd;
	/* Keep dev_fd open to maintain the lock. */
	return 0;
}

void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
#else /* !__linux__ */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}

void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
#endif /* __linux__ */

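/* Parse a BDF string into a struct spdk_pci_addr. Accepted formats include
 * DDDD:BB:DD.F, DDDD.BB.DD.F, DDDD:BB:DD, BB:DD.F, BB.DD.F, BB:DD, and
 * BB.DD; an omitted domain or function defaults to 0.
 */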
int
spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
{
	unsigned domain, bus, dev, func;

	if (addr == NULL || bdf == NULL) {
		return -EINVAL;
	}

	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
		/* Matched a full address - all variables are initialized */
	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
		func = 0;
	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
		domain = 0;
	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
		domain = 0;
		func = 0;
	} else {
		return -EINVAL;
	}

	if (bus > 0xFF || dev > 0x1F || func > 7) {
		return -EINVAL;
	}

	addr->domain = domain;
	addr->bus = bus;
	addr->dev = dev;
	addr->func = func;

	return 0;
}

int
spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
{
	int rc;

	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
		      addr->domain, addr->bus,
		      addr->dev, addr->func);

	if (rc > 0 && (size_t)rc < sz) {
		return 0;
	}

	return -1;
}

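/* Register a PCI device that is not managed by the DPDK PCI bus. The caller
 * must have filled in the BAR-mapping and config-access callbacks before
 * hooking the device.
 */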
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}

void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}

void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}

const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}

int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden.  So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}
1209