xref: /spdk/lib/env_dpdk/pci.c (revision dfc989439662457d39bac524be72e8ea1c20e817)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 #include "pci_dpdk.h"
8 
9 #include <rte_alarm.h>
10 #include <rte_devargs.h>
11 #include "spdk/env.h"
12 #include "spdk/log.h"
13 #include "spdk/string.h"
14 
15 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
16 
17 /* Compatibility for versions < 20.11 */
18 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
19 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
20 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
21 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
22 #endif
23 
24 #define PCI_CFG_SIZE		256
25 #define PCI_EXT_CAP_ID_SN	0x03
26 
27 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
28  * might cause the internal IPC to misbehave. Just retry in such case.
29  */
30 #define DPDK_HOTPLUG_RETRY_COUNT 4
31 
32 /* Serializes access to the device lists and the per-device state flags
 * (attached / pending_removal / removed). Besides the public API it is also
 * taken by the callbacks this file hands to DPDK (pci_device_init(),
 * pci_device_fini() and pci_device_rte_dev_event()).
 */
33 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Devices currently tracked; cleanup_pci_devices() reaps entries flagged as
 * removed and appends the entries staged in g_pci_hotplugged_devices.
 */
34 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
35 /* devices hotplugged on a dpdk thread; pci_device_init() stages them here */
36 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
37 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
/* Drivers added through spdk_pci_driver_register(). */
38 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
/* Providers added through spdk_pci_register_device_provider(). */
39 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
40 	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
41 
42 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
43 int pci_device_fini(struct rte_pci_device *device);
44 
/* SPDK-side bookkeeping attached to a DPDK devargs entry, looked up by the
 * (bus, name) pair - see find_env_devargs().
 */
45 struct env_devargs {
	/* bus pointer copied from the rte_devargs this entry was created for */
46 	struct rte_bus	*bus;
	/* device name copied from the same rte_devargs */
47 	char		name[128];
	/* tick value stored by set_allowed_at(); get_allowed_at() reports 0
	 * when no entry exists at all
	 */
48 	uint64_t	allowed_at;
	/* linkage in g_env_devargs */
49 	TAILQ_ENTRY(env_devargs) link;
50 };
/* Every env_devargs entry created so far; set_allowed_at() appends to it. */
51 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
52 
53 static struct env_devargs *
54 find_env_devargs(struct rte_bus *bus, const char *name)
55 {
56 	struct env_devargs *da;
57 
58 	TAILQ_FOREACH(da, &g_env_devargs, link) {
59 		if (bus == da->bus && !strcmp(name, da->name)) {
60 			return da;
61 		}
62 	}
63 
64 	return NULL;
65 }
66 
67 static int
68 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
69 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
70 {
71 	return dpdk_pci_device_map_bar(device->dev_handle, bar, mapped_addr, phys_addr, size);
72 }
73 
/*
 * unmap_bar callback installed by pci_device_init(). Nothing is unmapped
 * here; the call always reports success.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	(void)device;
	(void)bar;
	(void)addr;

	return 0;
}
79 
80 static int
81 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
82 {
83 	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
84 }
85 
86 static int
87 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
88 {
89 	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
90 }
91 
92 static void
93 remove_rte_dev(struct rte_pci_device *rte_dev)
94 {
95 	char bdf[32];
96 	int i = 0, rc;
97 
98 	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
99 	do {
100 		rc = rte_eal_hotplug_remove("pci", bdf);
101 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
102 }
103 
/*
 * Alarm callback used for deferred removal; `_dev` is the rte_pci_device
 * pointer that was passed to rte_eal_alarm_set().
 */
static void
detach_rte_cb(void *_dev)
{
	struct rte_pci_device *rte_dev = _dev;

	remove_rte_dev(rte_dev);
}
109 
110 /* if it's a physical device we need to deal with DPDK on
111  * a different process and we can't just unset one flag
112  * here. We also want to stop using any device resources
113  * so that the device isn't "in use" by the userspace driver
114  * once we detach it. This would allow attaching the device
115  * to a different process, or to a kernel driver like nvme.
116  */
117 static void
118 detach_rte(struct spdk_pci_device *dev)
119 {
120 	struct rte_pci_device *rte_dev = dev->dev_handle;
121 	int i;
122 	bool removed;
123 
124 	if (!spdk_process_is_primary()) {
125 		remove_rte_dev(rte_dev);
126 		return;
127 	}
128 
129 	pthread_mutex_lock(&g_pci_mutex);
130 	dev->internal.attached = false;
131 	/* prevent the hotremove notification from removing this device */
132 	dev->internal.pending_removal = true;
133 	pthread_mutex_unlock(&g_pci_mutex);
134 
135 	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
136 
137 	/* wait up to 2s for the cb to execute */
138 	for (i = 2000; i > 0; i--) {
139 
140 		spdk_delay_us(1000);
141 		pthread_mutex_lock(&g_pci_mutex);
142 		removed = dev->internal.removed;
143 		pthread_mutex_unlock(&g_pci_mutex);
144 
145 		if (removed) {
146 			break;
147 		}
148 	}
149 
150 	/* besides checking the removed flag, we also need to wait
151 	 * for the dpdk detach function to unwind, as it's doing some
152 	 * operations even after calling our detach callback. Simply
153 	 * cancel the alarm - if it started executing already, this
154 	 * call will block and wait for it to finish.
155 	 */
156 	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
157 
158 	/* the device could have been finally removed, so just check
159 	 * it again.
160 	 */
161 	pthread_mutex_lock(&g_pci_mutex);
162 	removed = dev->internal.removed;
163 	pthread_mutex_unlock(&g_pci_mutex);
164 	if (!removed) {
165 		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
166 			    dpdk_pci_device_get_name(rte_dev));
167 		/* If we reach this state, then the device couldn't be removed and most likely
168 		   a subsequent hot add of a device in the same BDF will fail */
169 	}
170 }
171 
172 void
173 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
174 {
175 	struct spdk_pci_driver *driver;
176 
177 	driver = calloc(1, sizeof(*driver));
178 	if (!driver) {
179 		/* we can't do any better than bailing atm */
180 		return;
181 	}
182 
183 	driver->name = name;
184 	driver->id_table = id_table;
185 	driver->drv_flags = flags;
186 	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
187 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
188 }
189 
/* Convenience wrapper: look up the registered driver named "nvme". */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	struct spdk_pci_driver *nvme_driver = spdk_pci_get_driver("nvme");

	return nvme_driver;
}
195 
196 struct spdk_pci_driver *
197 spdk_pci_get_driver(const char *name)
198 {
199 	struct spdk_pci_driver *driver;
200 
201 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
202 		if (strcmp(driver->name, name) == 0) {
203 			return driver;
204 		}
205 	}
206 
207 	return NULL;
208 }
209 
210 static void
211 pci_device_rte_dev_event(const char *device_name,
212 			 enum rte_dev_event_type event,
213 			 void *cb_arg)
214 {
215 	struct spdk_pci_device *dev;
216 	bool can_detach = false;
217 
218 	switch (event) {
219 	default:
220 	case RTE_DEV_EVENT_ADD:
221 		/* Nothing to do here yet. */
222 		break;
223 	case RTE_DEV_EVENT_REMOVE:
224 		pthread_mutex_lock(&g_pci_mutex);
225 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
226 			struct rte_pci_device *rte_dev = dev->dev_handle;
227 
228 			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 &&
229 			    !dev->internal.pending_removal) {
230 				can_detach = !dev->internal.attached;
231 				/* prevent any further attaches */
232 				dev->internal.pending_removal = true;
233 				break;
234 			}
235 		}
236 		pthread_mutex_unlock(&g_pci_mutex);
237 
238 		if (dev != NULL && can_detach) {
239 			/* if device is not attached we can remove it right away.
240 			 * Otherwise it will be removed at detach.
241 			 *
242 			 * Because the user's callback is invoked in eal interrupt
243 			 * callback, the interrupt callback need to be finished before
244 			 * it can be unregistered when detaching device. So finish
245 			 * callback soon and use a deferred removal to detach device
246 			 * is need. It is a workaround, once the device detaching be
247 			 * moved into the eal in the future, the deferred removal could
248 			 * be deleted.
249 			 */
250 			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
251 		}
252 		break;
253 	}
254 }
255 
256 static void
257 cleanup_pci_devices(void)
258 {
259 	struct spdk_pci_device *dev, *tmp;
260 
261 	pthread_mutex_lock(&g_pci_mutex);
262 	/* cleanup removed devices */
263 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
264 		if (!dev->internal.removed) {
265 			continue;
266 		}
267 
268 		vtophys_pci_device_removed(dev->dev_handle);
269 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
270 		free(dev);
271 	}
272 
273 	/* add newly-attached devices */
274 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
275 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
276 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
277 		vtophys_pci_device_added(dev->dev_handle);
278 	}
279 	pthread_mutex_unlock(&g_pci_mutex);
280 }
281 
282 static int scan_pci_bus(bool delay_init);
283 
284 static inline void
285 _pci_env_init(void)
286 {
287 	/* We assume devices were present on the bus for more than 2 seconds
288 	 * before initializing SPDK and there's no need to wait more. We scan
289 	 * the bus, but we don't block any devices.
290 	 */
291 	scan_pci_bus(false);
292 
293 	/* Register a single hotremove callback for all devices. */
294 	if (spdk_process_is_primary()) {
295 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
296 	}
297 }
298 
299 void
300 pci_env_init(void)
301 {
302 	struct spdk_pci_driver *driver;
303 
304 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
305 		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
306 	}
307 
308 	_pci_env_init();
309 }
310 
/*
 * Re-run the common PCI initialization. The drivers are not registered a
 * second time because pci_env_init() already did that.
 */
void
pci_env_reinit(void)
{
	_pci_env_init();
}
320 
321 void
322 pci_env_fini(void)
323 {
324 	struct spdk_pci_device *dev;
325 	char bdf[32];
326 
327 	cleanup_pci_devices();
328 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
329 		if (dev->internal.attached) {
330 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
331 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
332 		}
333 	}
334 
335 	if (spdk_process_is_primary()) {
336 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
337 	}
338 }
339 
340 int
341 pci_device_init(struct rte_pci_driver *_drv,
342 		struct rte_pci_device *_dev)
343 {
344 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
345 	struct spdk_pci_device *dev;
346 	int rc;
347 
348 	dev = calloc(1, sizeof(*dev));
349 	if (dev == NULL) {
350 		return -1;
351 	}
352 
353 	dev->dev_handle = _dev;
354 
355 	dpdk_pci_device_copy_identifiers(_dev, dev);
356 	dev->type = "pci";
357 
358 	dev->map_bar = map_bar_rte;
359 	dev->unmap_bar = unmap_bar_rte;
360 	dev->cfg_read = cfg_read_rte;
361 	dev->cfg_write = cfg_write_rte;
362 
363 	dev->internal.driver = driver;
364 	dev->internal.claim_fd = -1;
365 
366 	if (driver->cb_fn != NULL) {
367 		rc = driver->cb_fn(driver->cb_arg, dev);
368 		if (rc != 0) {
369 			free(dev);
370 			return rc;
371 		}
372 		dev->internal.attached = true;
373 	}
374 
375 	pthread_mutex_lock(&g_pci_mutex);
376 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
377 	pthread_mutex_unlock(&g_pci_mutex);
378 	return 0;
379 }
380 
381 static void
382 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
383 {
384 	struct env_devargs *env_da;
385 
386 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
387 	if (env_da == NULL) {
388 		env_da = calloc(1, sizeof(*env_da));
389 		if (env_da == NULL) {
390 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
391 			return;
392 		}
393 		env_da->bus = rte_da->bus;
394 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
395 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
396 	}
397 
398 	env_da->allowed_at = tsc;
399 }
400 
401 static uint64_t
402 get_allowed_at(struct rte_devargs *rte_da)
403 {
404 	struct env_devargs *env_da;
405 
406 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
407 	if (env_da) {
408 		return env_da->allowed_at;
409 	} else {
410 		return 0;
411 	}
412 }
413 
414 int
415 pci_device_fini(struct rte_pci_device *_dev)
416 {
417 	struct spdk_pci_device *dev;
418 
419 	pthread_mutex_lock(&g_pci_mutex);
420 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
421 		if (dev->dev_handle == _dev) {
422 			break;
423 		}
424 	}
425 
426 	if (dev == NULL || dev->internal.attached) {
427 		/* The device might be still referenced somewhere in SPDK. */
428 		pthread_mutex_unlock(&g_pci_mutex);
429 		return -EBUSY;
430 	}
431 
432 	/* remove our allowed_at option */
433 	if (dpdk_pci_device_get_devargs(_dev)) {
434 		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
435 	}
436 
437 	/* It is possible that removed flag was already set when there is a race
438 	 * between the remove notification for this process, and another process
439 	 * that is also detaching from this same device (for example, when using
440 	 * nvme driver in multi-process mode.  So do not assert here.  See
441 	 * #2456 for additional details.
442 	 */
443 	dev->internal.removed = true;
444 	pthread_mutex_unlock(&g_pci_mutex);
445 	return 0;
446 
447 }
448 
449 void
450 spdk_pci_device_detach(struct spdk_pci_device *dev)
451 {
452 	struct spdk_pci_device_provider *provider;
453 
454 	assert(dev->internal.attached);
455 
456 	if (dev->internal.claim_fd >= 0) {
457 		spdk_pci_device_unclaim(dev);
458 	}
459 
460 	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
461 		if (strcmp(dev->type, provider->name) == 0) {
462 			break;
463 		}
464 	}
465 
466 	assert(provider != NULL);
467 	dev->internal.attached = false;
468 	provider->detach_cb(dev);
469 
470 	cleanup_pci_devices();
471 }
472 
473 static int
474 scan_pci_bus(bool delay_init)
475 {
476 	struct rte_dev_iterator it;
477 	struct rte_device *rte_dev;
478 	uint64_t now;
479 
480 	dpdk_bus_scan();
481 	now = spdk_get_ticks();
482 
483 	if (!TAILQ_FIRST(&g_pci_drivers)) {
484 		return 0;
485 	}
486 
487 	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
488 		struct rte_devargs *da;
489 
490 		da = dpdk_device_get_devargs(rte_dev);
491 		if (!da) {
492 			char devargs_str[128];
493 
494 			/* the device was never blocked or allowed */
495 			da = calloc(1, sizeof(*da));
496 			if (!da) {
497 				return -1;
498 			}
499 
500 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
501 			if (rte_devargs_parse(da, devargs_str) != 0) {
502 				free(da);
503 				return -1;
504 			}
505 
506 			rte_devargs_insert(&da);
507 			dpdk_device_set_devargs(rte_dev, da);
508 		}
509 
510 		if (get_allowed_at(da)) {
511 			uint64_t allowed_at = get_allowed_at(da);
512 
513 			/* this device was seen by spdk before... */
514 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
515 				da->policy = RTE_DEV_ALLOWED;
516 			}
517 		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
518 			   da->policy != RTE_DEV_BLOCKED) {
519 			/* override the policy only if not permanently blocked */
520 
521 			if (delay_init) {
522 				da->policy = RTE_DEV_BLOCKED;
523 				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
524 			} else {
525 				da->policy = RTE_DEV_ALLOWED;
526 				set_allowed_at(da, now);
527 			}
528 		}
529 	}
530 
531 	return 0;
532 }
533 
534 static int
535 pci_attach_rte(const struct spdk_pci_addr *addr)
536 {
537 	char bdf[32];
538 	int rc, i = 0;
539 
540 	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);
541 
542 	do {
543 		rc = rte_eal_hotplug_add("pci", bdf, "");
544 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
545 
546 	if (i > 1 && rc == -EEXIST) {
547 		/* Even though the previous request timed out, the device
548 		 * was attached successfully.
549 		 */
550 		rc = 0;
551 	}
552 
553 	return rc;
554 }
555 
/* Provider for devices of type "pci" (the type pci_device_init() assigns):
 * attaching goes through pci_attach_rte() and detaching through detach_rte().
 */
556 static struct spdk_pci_device_provider g_pci_rte_provider = {
557 	.name = "pci",
558 	.attach_cb = pci_attach_rte,
559 	.detach_cb = detach_rte,
560 };
561 
562 SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
563 
564 int
565 spdk_pci_device_attach(struct spdk_pci_driver *driver,
566 		       spdk_pci_enum_cb enum_cb,
567 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
568 {
569 	struct spdk_pci_device *dev;
570 	struct spdk_pci_device_provider *provider;
571 	struct rte_pci_device *rte_dev;
572 	struct rte_devargs *da;
573 	int rc;
574 
575 	cleanup_pci_devices();
576 
577 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
578 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
579 			break;
580 		}
581 	}
582 
583 	if (dev != NULL && dev->internal.driver == driver) {
584 		pthread_mutex_lock(&g_pci_mutex);
585 		if (dev->internal.attached || dev->internal.pending_removal) {
586 			pthread_mutex_unlock(&g_pci_mutex);
587 			return -1;
588 		}
589 
590 		rc = enum_cb(enum_ctx, dev);
591 		if (rc == 0) {
592 			dev->internal.attached = true;
593 		}
594 		pthread_mutex_unlock(&g_pci_mutex);
595 		return rc;
596 	}
597 
598 	driver->cb_fn = enum_cb;
599 	driver->cb_arg = enum_ctx;
600 
601 	rc = -ENODEV;
602 	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
603 		rc = provider->attach_cb(pci_address);
604 		if (rc == 0) {
605 			break;
606 		}
607 	}
608 
609 	driver->cb_arg = NULL;
610 	driver->cb_fn = NULL;
611 
612 	cleanup_pci_devices();
613 
614 	if (rc != 0) {
615 		return -1;
616 	}
617 
618 	/* explicit attach ignores the allowlist, so if we blocked this
619 	 * device before let's enable it now - just for clarity.
620 	 */
621 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
622 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
623 			break;
624 		}
625 	}
626 	assert(dev != NULL);
627 
628 	rte_dev = dev->dev_handle;
629 	if (rte_dev != NULL) {
630 		da = dpdk_pci_device_get_devargs(rte_dev);
631 		if (da && get_allowed_at(da)) {
632 			set_allowed_at(da, spdk_get_ticks());
633 			da->policy = RTE_DEV_ALLOWED;
634 		}
635 	}
636 
637 	return 0;
638 }
639 
640 /* Note: You can call spdk_pci_enumerate from more than one thread
641  *       simultaneously safely, but you cannot call spdk_pci_enumerate
642  *       and rte_eal_pci_probe simultaneously.
643  */
644 int
645 spdk_pci_enumerate(struct spdk_pci_driver *driver,
646 		   spdk_pci_enum_cb enum_cb,
647 		   void *enum_ctx)
648 {
649 	struct spdk_pci_device *dev;
650 	int rc;
651 
652 	cleanup_pci_devices();
653 
654 	pthread_mutex_lock(&g_pci_mutex);
655 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
656 		if (dev->internal.attached ||
657 		    dev->internal.driver != driver ||
658 		    dev->internal.pending_removal) {
659 			continue;
660 		}
661 
662 		rc = enum_cb(enum_ctx, dev);
663 		if (rc == 0) {
664 			dev->internal.attached = true;
665 		} else if (rc < 0) {
666 			pthread_mutex_unlock(&g_pci_mutex);
667 			return -1;
668 		}
669 	}
670 	pthread_mutex_unlock(&g_pci_mutex);
671 
672 	if (scan_pci_bus(true) != 0) {
673 		return -1;
674 	}
675 
676 	driver->cb_fn = enum_cb;
677 	driver->cb_arg = enum_ctx;
678 
679 	if (dpdk_bus_probe() != 0) {
680 		driver->cb_arg = NULL;
681 		driver->cb_fn = NULL;
682 		return -1;
683 	}
684 
685 	driver->cb_arg = NULL;
686 	driver->cb_fn = NULL;
687 
688 	cleanup_pci_devices();
689 	return 0;
690 }
691 
692 void
693 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
694 {
695 	struct spdk_pci_device *dev, *tmp;
696 
697 	pthread_mutex_lock(&g_pci_mutex);
698 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
699 		fn(ctx, dev);
700 	}
701 	pthread_mutex_unlock(&g_pci_mutex);
702 }
703 
704 int
705 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
706 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
707 {
708 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
709 }
710 
711 int
712 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
713 {
714 	return dev->unmap_bar(dev, bar, addr);
715 }
716 
717 int
718 spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
719 {
720 	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
721 }
722 
723 int
724 spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
725 {
726 	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
727 }
728 
729 int
730 spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
731 {
732 	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
733 }
734 
735 uint32_t
736 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
737 {
738 	return dev->addr.domain;
739 }
740 
741 uint8_t
742 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
743 {
744 	return dev->addr.bus;
745 }
746 
747 uint8_t
748 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
749 {
750 	return dev->addr.dev;
751 }
752 
753 uint8_t
754 spdk_pci_device_get_func(struct spdk_pci_device *dev)
755 {
756 	return dev->addr.func;
757 }
758 
759 uint16_t
760 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
761 {
762 	return dev->id.vendor_id;
763 }
764 
765 uint16_t
766 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
767 {
768 	return dev->id.device_id;
769 }
770 
771 uint16_t
772 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
773 {
774 	return dev->id.subvendor_id;
775 }
776 
777 uint16_t
778 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
779 {
780 	return dev->id.subdevice_id;
781 }
782 
783 struct spdk_pci_id
784 spdk_pci_device_get_id(struct spdk_pci_device *dev)
785 {
786 	return dev->id;
787 }
788 
789 int
790 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
791 {
792 	return dev->socket_id;
793 }
794 
795 int
796 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
797 {
798 	return dev->cfg_read(dev, value, len, offset);
799 }
800 
801 int
802 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
803 {
804 	return dev->cfg_write(dev, value, len, offset);
805 }
806 
/* Read one byte of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
812 
/* Write the one-byte `value` to config space at `offset`. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
818 
/* Read two bytes of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
824 
/* Write the two-byte `value` to config space at `offset`. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
830 
/* Read four bytes of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, sizeof(*value), offset);
}
836 
/* Write the four-byte `value` to config space at `offset`. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, sizeof(value), offset);
}
842 
843 int
844 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
845 {
846 	int err;
847 	uint32_t pos, header = 0;
848 	uint32_t i, buf[2];
849 
850 	if (len < 17) {
851 		return -1;
852 	}
853 
854 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
855 	if (err || !header) {
856 		return -1;
857 	}
858 
859 	pos = PCI_CFG_SIZE;
860 	while (1) {
861 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
862 			if (pos) {
863 				/* skip the header */
864 				pos += 4;
865 				for (i = 0; i < 2; i++) {
866 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
867 					if (err) {
868 						return -1;
869 					}
870 				}
871 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
872 				return 0;
873 			}
874 		}
875 		pos = (header >> 20) & 0xffc;
876 		/* 0 if no other items exist */
877 		if (pos < PCI_CFG_SIZE) {
878 			return -1;
879 		}
880 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
881 		if (err) {
882 			return -1;
883 		}
884 	}
885 	return -1;
886 }
887 
888 struct spdk_pci_addr
889 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
890 {
891 	return dev->addr;
892 }
893 
894 bool
895 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
896 {
897 	return dev->internal.pending_removal;
898 }
899 
900 int
901 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
902 {
903 	if (a1->domain > a2->domain) {
904 		return 1;
905 	} else if (a1->domain < a2->domain) {
906 		return -1;
907 	} else if (a1->bus > a2->bus) {
908 		return 1;
909 	} else if (a1->bus < a2->bus) {
910 		return -1;
911 	} else if (a1->dev > a2->dev) {
912 		return 1;
913 	} else if (a1->dev < a2->dev) {
914 		return -1;
915 	} else if (a1->func > a2->func) {
916 		return 1;
917 	} else if (a1->func < a2->func) {
918 		return -1;
919 	}
920 
921 	return 0;
922 }
923 
924 #ifdef __linux__
925 int
926 spdk_pci_device_claim(struct spdk_pci_device *dev)
927 {
928 	int dev_fd;
929 	char dev_name[64];
930 	int pid;
931 	void *dev_map;
932 	struct flock pcidev_lock = {
933 		.l_type = F_WRLCK,
934 		.l_whence = SEEK_SET,
935 		.l_start = 0,
936 		.l_len = 0,
937 	};
938 
939 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
940 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
941 
942 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
943 	if (dev_fd == -1) {
944 		SPDK_ERRLOG("could not open %s\n", dev_name);
945 		return -errno;
946 	}
947 
948 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
949 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
950 		close(dev_fd);
951 		return -errno;
952 	}
953 
954 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
955 		       MAP_SHARED, dev_fd, 0);
956 	if (dev_map == MAP_FAILED) {
957 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
958 		close(dev_fd);
959 		return -errno;
960 	}
961 
962 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
963 		pid = *(int *)dev_map;
964 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
965 			    " process %d has claimed it\n", dev_name, pid);
966 		munmap(dev_map, sizeof(int));
967 		close(dev_fd);
968 		/* F_SETLK returns unspecified errnos, normalize them */
969 		return -EACCES;
970 	}
971 
972 	*(int *)dev_map = (int)getpid();
973 	munmap(dev_map, sizeof(int));
974 	dev->internal.claim_fd = dev_fd;
975 	/* Keep dev_fd open to maintain the lock. */
976 	return 0;
977 }
978 
979 void
980 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
981 {
982 	char dev_name[64];
983 
984 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
985 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
986 
987 	close(dev->internal.claim_fd);
988 	dev->internal.claim_fd = -1;
989 	unlink(dev_name);
990 }
991 #else /* !__linux__ */
/* Non-Linux placeholder: no claim is taken and success is always reported. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;

	return 0;
}
998 
/* Non-Linux placeholder: there is no claim to release. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;
}
1004 #endif /* __linux__ */
1005 
1006 int
1007 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1008 {
1009 	unsigned domain, bus, dev, func;
1010 
1011 	if (addr == NULL || bdf == NULL) {
1012 		return -EINVAL;
1013 	}
1014 
1015 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1016 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1017 		/* Matched a full address - all variables are initialized */
1018 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1019 		func = 0;
1020 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1021 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1022 		domain = 0;
1023 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1024 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1025 		domain = 0;
1026 		func = 0;
1027 	} else {
1028 		return -EINVAL;
1029 	}
1030 
1031 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1032 		return -EINVAL;
1033 	}
1034 
1035 	addr->domain = domain;
1036 	addr->bus = bus;
1037 	addr->dev = dev;
1038 	addr->func = func;
1039 
1040 	return 0;
1041 }
1042 
1043 int
1044 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1045 {
1046 	int rc;
1047 
1048 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1049 		      addr->domain, addr->bus,
1050 		      addr->dev, addr->func);
1051 
1052 	if (rc > 0 && (size_t)rc < sz) {
1053 		return 0;
1054 	}
1055 
1056 	return -1;
1057 }
1058 
1059 int
1060 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1061 {
1062 	int rc;
1063 
1064 	assert(dev->map_bar != NULL);
1065 	assert(dev->unmap_bar != NULL);
1066 	assert(dev->cfg_read != NULL);
1067 	assert(dev->cfg_write != NULL);
1068 	dev->internal.driver = drv;
1069 
1070 	if (drv->cb_fn != NULL) {
1071 		rc = drv->cb_fn(drv->cb_arg, dev);
1072 		if (rc != 0) {
1073 			return -ECANCELED;
1074 		}
1075 
1076 		dev->internal.attached = true;
1077 	}
1078 
1079 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1080 
1081 	return 0;
1082 }
1083 
1084 void
1085 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1086 {
1087 	assert(!dev->internal.attached);
1088 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1089 }
1090 
1091 void
1092 spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
1093 {
1094 	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
1095 }
1096 
1097 const char *
1098 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1099 {
1100 	return dev->type;
1101 }
1102 
1103 int
1104 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1105 {
1106 	struct rte_devargs *da;
1107 	char devargs_str[128];
1108 
1109 	da = calloc(1, sizeof(*da));
1110 	if (da == NULL) {
1111 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1112 		return -ENOMEM;
1113 	}
1114 
1115 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1116 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1117 	if (rte_devargs_parse(da, devargs_str) != 0) {
1118 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1119 		free(da);
1120 		return -EINVAL;
1121 	}
1122 	da->policy = RTE_DEV_ALLOWED;
1123 	/* Note: if a devargs already exists for this device address, it just gets
1124 	 * overridden.  So we do not need to check if the devargs already exists.
1125 	 * DPDK will take care of memory management for the devargs structure after
1126 	 * it has been inserted, so there's nothing SPDK needs to track.
1127 	 */
1128 	if (rte_devargs_insert(&da) != 0) {
1129 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1130 		free(da);
1131 		return -EINVAL;
1132 	}
1133 
1134 	return 0;
1135 }
1136