xref: /spdk/lib/env_dpdk/pci.c (revision f8abbede89d30584d2a4f8427b13896f8591b873)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2015 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 #include "pci_dpdk.h"
8 
9 #include <rte_alarm.h>
10 #include <rte_devargs.h>
11 #include <rte_pci.h>
12 #include "spdk/env.h"
13 #include "spdk/log.h"
14 #include "spdk/string.h"
15 #include "spdk/memory.h"
16 
17 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
18 
19 #define PCI_CFG_SIZE		256
20 #define PCI_EXT_CAP_ID_SN	0x03
21 
22 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
23  * might cause the internal IPC to misbehave. Just retry in such case.
24  */
25 #define DPDK_HOTPLUG_RETRY_COUNT 4
26 
27 /* DPDK alarm/interrupt thread */
28 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
29 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
30 /* devices hotplugged on a dpdk thread */
31 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
32 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
33 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
34 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
35 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
36 
37 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
38 int pci_device_fini(struct rte_pci_device *device);
39 
40 struct env_devargs {
41 	struct rte_bus	*bus;
42 	char		name[128];
43 	uint64_t	allowed_at;
44 	TAILQ_ENTRY(env_devargs) link;
45 };
46 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
47 
48 static struct env_devargs *
49 find_env_devargs(struct rte_bus *bus, const char *name)
50 {
51 	struct env_devargs *da;
52 
53 	TAILQ_FOREACH(da, &g_env_devargs, link) {
54 		if (bus == da->bus && !strcmp(name, da->name)) {
55 			return da;
56 		}
57 	}
58 
59 	return NULL;
60 }
61 
62 static int
63 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
64 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
65 {
66 	struct rte_mem_resource *res;
67 
68 	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
69 	*mapped_addr = res->addr;
70 	*phys_addr = (uint64_t)res->phys_addr;
71 	*size = (uint64_t)res->len;
72 
73 	return 0;
74 }
75 
/* No-op for rte-backed devices: DPDK owns the BAR mappings and tears them
 * down itself when the device is unplugged, so there is nothing to undo here.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}
81 
/* Read `len` bytes of PCI config space at `offset` via the DPDK backend. */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}
87 
/* Write `len` bytes of PCI config space at `offset` via the DPDK backend. */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}
93 
94 static void
95 remove_rte_dev(struct rte_pci_device *rte_dev)
96 {
97 	char bdf[32];
98 	int i = 0, rc;
99 
100 	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
101 	do {
102 		rc = rte_eal_hotplug_remove("pci", bdf);
103 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
104 }
105 
/* rte_eal_alarm callback trampoline: the alarm passes a void* argument,
 * which is actually the struct rte_pci_device to remove.
 */
static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}
111 
112 /* if it's a physical device we need to deal with DPDK on
113  * a different process and we can't just unset one flag
114  * here. We also want to stop using any device resources
115  * so that the device isn't "in use" by the userspace driver
116  * once we detach it. This would allow attaching the device
117  * to a different process, or to a kernel driver like nvme.
118  */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process performs the actual DPDK hot-remove;
	 * secondaries just flipped the attached flag in the caller.
	 */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Schedule the removal on the DPDK interrupt thread (1 us from now);
	 * it cannot be done inline here.
	 */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
172 
173 void
174 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
175 {
176 	struct spdk_pci_driver *driver;
177 
178 	driver = calloc(1, sizeof(*driver));
179 	if (!driver) {
180 		/* we can't do any better than bailing atm */
181 		return;
182 	}
183 
184 	driver->name = name;
185 	driver->id_table = id_table;
186 	driver->drv_flags = flags;
187 	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
188 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
189 }
190 
/* Convenience lookup for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}
196 
197 struct spdk_pci_driver *
198 spdk_pci_get_driver(const char *name)
199 {
200 	struct spdk_pci_driver *driver;
201 
202 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
203 		if (strcmp(driver->name, name) == 0) {
204 			return driver;
205 		}
206 	}
207 
208 	return NULL;
209 }
210 
/* DPDK device-event callback, invoked on the EAL interrupt thread.
 * On REMOVE, mark the matching SPDK device as pending removal and, if no one
 * has it attached, schedule a deferred hot-remove.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			/* Note: these ERRLOGs are useful for triaging issue #2983. */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			/* Match by DPDK device name; skip devices we are already
			 * in the middle of removing (e.g. via detach_rte()).
			 */
			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 &&
			    !dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		/* dev is non-NULL only if the loop above broke on a match. */
		if (dev != NULL && can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
263 
264 static void
265 cleanup_pci_devices(void)
266 {
267 	struct spdk_pci_device *dev, *tmp;
268 
269 	pthread_mutex_lock(&g_pci_mutex);
270 	/* cleanup removed devices */
271 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
272 		if (!dev->internal.removed) {
273 			continue;
274 		}
275 
276 		vtophys_pci_device_removed(dev->dev_handle);
277 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
278 		free(dev);
279 	}
280 
281 	/* add newly-attached devices */
282 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
283 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
284 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
285 		vtophys_pci_device_added(dev->dev_handle);
286 	}
287 	pthread_mutex_unlock(&g_pci_mutex);
288 }
289 
290 static int scan_pci_bus(bool delay_init);
291 
/* Common tail of pci_env_init()/pci_env_reinit(): scan the bus without
 * blocking devices, and (in the primary process) hook up hot-remove events.
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}
306 
307 int
308 pci_env_init(void)
309 {
310 	struct spdk_pci_driver *driver;
311 	int rc;
312 
313 	rc = dpdk_pci_init();
314 	if (rc) {
315 		return rc;
316 	}
317 
318 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
319 		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
320 	}
321 
322 	_pci_env_init();
323 	return 0;
324 }
325 
/* Re-run bus scan and event registration after an env re-initialization. */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
335 
/* Tear down the PCI environment: flush removed/hotplugged device lists, warn
 * about devices still attached, and unregister the hot-remove callback.
 */
void
pci_env_fini(void)
{
	struct spdk_pci_device *dev;
	char bdf[32];

	cleanup_pci_devices();
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached) {
			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
		}
	}

	/* Mirrors the registration done in _pci_env_init(). */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
	}
}
354 
/* DPDK probe callback: wrap a freshly probed rte_pci_device in an
 * spdk_pci_device and queue it on g_pci_hotplugged_devices (it is moved to
 * the main list by cleanup_pci_devices()).
 *
 * If the owning driver has an enum callback set (see spdk_pci_enumerate /
 * spdk_pci_device_attach), invoke it; a non-zero return aborts the probe.
 *
 * \return 0 on success, -1 on allocation failure, or the callback's error.
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Copy the DPDK address/ID info into SPDK's own representation. */
	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->socket_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";	/* matches g_pci_rte_provider's name */

	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
410 
411 static void
412 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
413 {
414 	struct env_devargs *env_da;
415 
416 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
417 	if (env_da == NULL) {
418 		env_da = calloc(1, sizeof(*env_da));
419 		if (env_da == NULL) {
420 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
421 			return;
422 		}
423 		env_da->bus = rte_da->bus;
424 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
425 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
426 	}
427 
428 	env_da->allowed_at = tsc;
429 }
430 
431 static uint64_t
432 get_allowed_at(struct rte_devargs *rte_da)
433 {
434 	struct env_devargs *env_da;
435 
436 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
437 	if (env_da) {
438 		return env_da->allowed_at;
439 	} else {
440 		return 0;
441 	}
442 }
443 
/* DPDK remove callback: mark the matching SPDK device as removed so that
 * cleanup_pci_devices() can free it later.
 *
 * \return 0 on success, -EBUSY if the device is unknown or still attached
 * (DPDK will then keep the device).
 */
int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	if (dev == NULL || dev->internal.attached) {
		/* The device might be still referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (dpdk_pci_device_get_devargs(_dev)) {
		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
	}

	/* It is possible that removed flag was already set when there is a race
	 * between the remove notification for this process, and another process
	 * that is also detaching from this same device (for example, when using
	 * nvme driver in multi-process mode.  So do not assert here.  See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;

}
478 
/* Detach a previously attached device: release any claim lock, then hand the
 * device to its provider's detach callback (e.g. detach_rte for "pci"-type
 * devices), and finally reap any fully removed devices.
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Find the provider matching dev->type ("pci", etc.). */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	/* Every device was created by some registered provider. */
	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}
502 
/* Scan the PCI bus and manage per-device allow/block policy via devargs.
 *
 * Devices never seen before get a devargs entry created for them. With
 * delay_init == true a newly seen device is temporarily blocked and only
 * allowed ~2 seconds later (to let the kernel/other processes settle);
 * with delay_init == false it is allowed immediately.
 *
 * \return 0 on success, -1 on allocation or devargs-parse failure.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	/* No SPDK drivers registered - nothing will ever probe these devices. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			/* DPDK takes ownership of da after insertion. */
			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				/* Block now, allow again 2 seconds from now. */
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}
563 
564 static int
565 pci_attach_rte(const struct spdk_pci_addr *addr)
566 {
567 	char bdf[32];
568 	int rc, i = 0;
569 
570 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
571 
572 	do {
573 		rc = rte_eal_hotplug_add("pci", bdf, "");
574 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
575 
576 	if (i > 1 && rc == -EEXIST) {
577 		/* Even though the previous request timed out, the device
578 		 * was attached successfully.
579 		 */
580 		rc = 0;
581 	}
582 
583 	return rc;
584 }
585 
/* Default device provider: attaches/detaches "pci"-type devices through
 * DPDK's hotplug machinery. Registered at load time by the macro below.
 */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
593 
/* Attach a single device by address on behalf of `driver`.
 *
 * If the device is already known to SPDK, enum_cb is invoked directly.
 * Otherwise each registered provider is asked to attach it; the provider's
 * probe path (pci_device_init) will invoke enum_cb via driver->cb_fn.
 *
 * \return 0 on success, -1 on failure (including enum_cb rejecting the
 * device or the device pending removal).
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	/* Already-known device owned by this driver: attach it directly. */
	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* The provider's probe path will call back into driver->cb_fn. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
669 
670 /* Note: You can call spdk_pci_enumerate from more than one thread
671  *       simultaneously safely, but you cannot call spdk_pci_enumerate
672  *       and rte_eal_pci_probe simultaneously.
673  */
/* Enumerate all PCI devices matching `driver`, invoking enum_cb for each
 * unattached one. First offers already-known devices, then re-scans the bus
 * (with delayed init for newly appeared devices) and probes via DPDK, which
 * calls enum_cb through driver->cb_fn for any new matches.
 *
 * \return 0 on success, -1 if enum_cb returned an error or scanning/probing
 * failed.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		/* enum_cb: 0 = claim the device, >0 = skip, <0 = abort. */
		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	/* pci_device_init() will call back into driver->cb_fn during probe. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
721 
/* Invoke `fn` for every known PCI device while holding g_pci_mutex.
 * The _SAFE iteration allows fn to remove the current device from the list;
 * fn must not attempt to re-lock g_pci_mutex.
 */
void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
733 
/* Map a device BAR and, when VFIO with the IOMMU is active, also program a
 * DMA mapping for it so that peer-to-peer transfers can target the BAR.
 *
 * \return 0 on success, the backend's error code, or -EFAULT if the IOMMU
 * mapping failed (in which case the BAR is unmapped again).
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		/* In VA mode the "physical" address reported to callers is the iova. */
		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}
771 
/* Undo spdk_pci_device_map_bar(): drop the IOMMU DMA mapping (if any) and
 * then unmap the BAR via the device backend.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}
788 
/* Enable the device's interrupt via the DPDK backend. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}
794 
/* Disable the device's interrupt via the DPDK backend. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}
800 
/* Return the eventfd associated with the device's interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}
806 
/* PCI domain (segment) number of the device's address. */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}
812 
/* PCI bus number of the device's address. */
uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}
818 
/* PCI device (slot) number of the device's address. */
uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}
824 
/* PCI function number of the device's address. */
uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}
830 
/* PCI vendor ID of the device. */
uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}
836 
/* PCI device ID of the device. */
uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}
842 
/* PCI subsystem vendor ID of the device. */
uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}
848 
/* PCI subsystem device ID of the device. */
uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}
854 
/* Full PCI ID tuple (class/vendor/device/subsystem), returned by value. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}
860 
/* NUMA node the device is attached to, as reported by DPDK at probe time. */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	return dev->socket_id;
}
866 
/* Read `len` bytes of PCI config space at `offset` through the device's backend. */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}
872 
/* Write `len` bytes of PCI config space at `offset` through the device's backend. */
int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}
878 
/* 8-bit PCI config space read. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}
884 
/* 8-bit PCI config space write. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}
890 
/* 16-bit PCI config space read. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}
896 
/* 16-bit PCI config space write. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}
902 
/* 32-bit PCI config space read. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}
908 
/* 32-bit PCI config space write. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
914 
/* Read the device's serial number from the PCIe Device Serial Number
 * extended capability and format it as 16 hex characters into `sn`.
 *
 * Walks the extended capability list starting at config offset 0x100
 * (PCI_CFG_SIZE): each 32-bit header holds the capability ID in bits 15:0
 * and the next-capability offset in bits 31:20.
 *
 * \param sn Output buffer; must hold at least 17 bytes (16 hex + NUL).
 * \return 0 on success, -1 on short buffer, config read error, or when the
 * capability is absent.
 */
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	if (len < 17) {
		return -1;
	}

	/* First extended capability header; all-zero means no ext caps. */
	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				/* Serial number is two dwords: low at pos, high at pos+4. */
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	return -1;
}
959 
/* Return the device's PCI address (domain:bus:dev.func) by value. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}
965 
/* True once a hot-remove is in flight for this device. Note this reports
 * pending_removal (removal requested), not the final `removed` state.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
971 
972 int
973 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
974 {
975 	if (a1->domain > a2->domain) {
976 		return 1;
977 	} else if (a1->domain < a2->domain) {
978 		return -1;
979 	} else if (a1->bus > a2->bus) {
980 		return 1;
981 	} else if (a1->bus < a2->bus) {
982 		return -1;
983 	} else if (a1->dev > a2->dev) {
984 		return 1;
985 	} else if (a1->dev < a2->dev) {
986 		return -1;
987 	} else if (a1->func > a2->func) {
988 		return 1;
989 	} else if (a1->func < a2->func) {
990 		return -1;
991 	}
992 
993 	return 0;
994 }
995 
996 #ifdef __linux__
997 int
998 spdk_pci_device_claim(struct spdk_pci_device *dev)
999 {
1000 	int dev_fd;
1001 	char dev_name[64];
1002 	int pid;
1003 	void *dev_map;
1004 	struct flock pcidev_lock = {
1005 		.l_type = F_WRLCK,
1006 		.l_whence = SEEK_SET,
1007 		.l_start = 0,
1008 		.l_len = 0,
1009 	};
1010 
1011 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1012 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1013 
1014 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1015 	if (dev_fd == -1) {
1016 		SPDK_ERRLOG("could not open %s\n", dev_name);
1017 		return -errno;
1018 	}
1019 
1020 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1021 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1022 		close(dev_fd);
1023 		return -errno;
1024 	}
1025 
1026 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1027 		       MAP_SHARED, dev_fd, 0);
1028 	if (dev_map == MAP_FAILED) {
1029 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1030 		close(dev_fd);
1031 		return -errno;
1032 	}
1033 
1034 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1035 		pid = *(int *)dev_map;
1036 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1037 			    " process %d has claimed it\n", dev_name, pid);
1038 		munmap(dev_map, sizeof(int));
1039 		close(dev_fd);
1040 		/* F_SETLK returns unspecified errnos, normalize them */
1041 		return -EACCES;
1042 	}
1043 
1044 	*(int *)dev_map = (int)getpid();
1045 	munmap(dev_map, sizeof(int));
1046 	dev->internal.claim_fd = dev_fd;
1047 	/* Keep dev_fd open to maintain the lock. */
1048 	return 0;
1049 }
1050 
/* Release the claim taken by spdk_pci_device_claim(): closing claim_fd drops
 * the advisory lock, and the lock file is removed.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
1063 #else /* !__linux__ */
/* Non-Linux stub: claiming is not implemented; always reports success. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}
1070 
/* Non-Linux stub: nothing to release. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
1076 #endif /* __linux__ */
1077 
1078 int
1079 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1080 {
1081 	unsigned domain, bus, dev, func;
1082 
1083 	if (addr == NULL || bdf == NULL) {
1084 		return -EINVAL;
1085 	}
1086 
1087 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1088 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1089 		/* Matched a full address - all variables are initialized */
1090 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1091 		func = 0;
1092 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1093 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1094 		domain = 0;
1095 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1096 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1097 		domain = 0;
1098 		func = 0;
1099 	} else {
1100 		return -EINVAL;
1101 	}
1102 
1103 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1104 		return -EINVAL;
1105 	}
1106 
1107 	addr->domain = domain;
1108 	addr->bus = bus;
1109 	addr->dev = dev;
1110 	addr->func = func;
1111 
1112 	return 0;
1113 }
1114 
1115 int
1116 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1117 {
1118 	int rc;
1119 
1120 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1121 		      addr->domain, addr->bus,
1122 		      addr->dev, addr->func);
1123 
1124 	if (rc > 0 && (size_t)rc < sz) {
1125 		return 0;
1126 	}
1127 
1128 	return -1;
1129 }
1130 
/* Register an externally created (non-DPDK) device with this layer. The
 * caller must have populated the BAR/config accessors. If `drv` has an enum
 * callback pending, the device is attached through it immediately.
 *
 * NOTE(review): insertion into g_pci_devices is done without g_pci_mutex
 * here - presumably callers invoke this from the app thread only; verify
 * against callers before relying on concurrent use.
 *
 * \return 0 on success, -ECANCELED if the driver callback rejected the device.
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}
1155 
/* Remove a device previously registered with spdk_pci_hook_device().
 * The device must be detached first; the caller retains ownership of `dev`.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1162 
/* Register a device provider (matched against dev->type at detach/attach). */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1168 
/* Provider type string of the device (e.g. "pci" for DPDK-backed devices). */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1174 
/* Add a DPDK devargs entry marking the given PCI address as allowed, so a
 * subsequent bus probe may attach it.
 *
 * \return 0 on success, -ENOMEM on allocation failure, -EINVAL if devargs
 * parsing or insertion failed.
 */
int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden.  So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}
1208