xref: /spdk/lib/env_dpdk/pci.c (revision 0098e636761237b77c12c30c2408263a5d2260cc)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "env_internal.h"
7 
8 #include <rte_alarm.h>
9 #include <rte_devargs.h>
10 #include "spdk/env.h"
11 #include "spdk/log.h"
12 #include "spdk/string.h"
13 
14 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
15 
16 /* Compatibility for DPDK versions < 20.11 */
17 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
18 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
19 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
20 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
21 #endif
22 
23 #define PCI_CFG_SIZE		256
24 #define PCI_EXT_CAP_ID_SN	0x03
25 
26 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
27  * might cause the internal IPC to misbehave. Just retry in such a case.
28  */
29 #define DPDK_HOTPLUG_RETRY_COUNT 4
30 
31 /* g_pci_mutex protects the device lists below, which are also accessed from the DPDK alarm/interrupt thread */
32 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
33 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
34 /* devices hotplugged on a dpdk thread */
35 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
36 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
37 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
38 
39 struct env_devargs {
40 	struct rte_bus	*bus;
41 	char		name[128];
42 	uint64_t	allowed_at;
43 	TAILQ_ENTRY(env_devargs) link;
44 };
45 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
46 
47 static struct env_devargs *
48 find_env_devargs(struct rte_bus *bus, const char *name)
49 {
50 	struct env_devargs *da;
51 
52 	TAILQ_FOREACH(da, &g_env_devargs, link) {
53 		if (bus == da->bus && !strcmp(name, da->name)) {
54 			return da;
55 		}
56 	}
57 
58 	return NULL;
59 }
60 
61 static int
62 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
63 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
64 {
65 	struct rte_pci_device *dev = device->dev_handle;
66 
67 	*mapped_addr = dev->mem_resource[bar].addr;
68 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
69 	*size = (uint64_t)dev->mem_resource[bar].len;
70 
71 	return 0;
72 }
73 
74 static int
75 unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
76 {
77 	return 0;
78 }
79 
80 static int
81 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
82 {
83 	int rc;
84 
85 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
86 
87 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
88 }
89 
90 static int
91 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
92 {
93 	int rc;
94 
95 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
96 
97 #ifdef __FreeBSD__
98 	/* DPDK returns 0 on success and -1 on failure */
99 	return rc;
100 #endif
101 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
102 }
103 
104 static void
105 remove_rte_dev(struct rte_pci_device *rte_dev)
106 {
107 	char bdf[32];
108 	int i = 0, rc;
109 
110 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
111 	do {
112 		rc = rte_eal_hotplug_remove("pci", bdf);
113 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
114 }
115 
116 static void
117 detach_rte_cb(void *_dev)
118 {
119 	remove_rte_dev(_dev);
120 }
121 
122 static void
123 detach_rte(struct spdk_pci_device *dev)
124 {
125 	struct rte_pci_device *rte_dev = dev->dev_handle;
126 	int i;
127 	bool removed;
128 
129 	if (!spdk_process_is_primary()) {
130 		remove_rte_dev(rte_dev);
131 		return;
132 	}
133 
134 	pthread_mutex_lock(&g_pci_mutex);
135 	dev->internal.attached = false;
136 	/* prevent the hotremove notification from removing this device */
137 	dev->internal.pending_removal = true;
138 	pthread_mutex_unlock(&g_pci_mutex);
139 
140 	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
141 
142 	/* wait up to 2s for the cb to execute */
143 	for (i = 2000; i > 0; i--) {
144 
145 		spdk_delay_us(1000);
146 		pthread_mutex_lock(&g_pci_mutex);
147 		removed = dev->internal.removed;
148 		pthread_mutex_unlock(&g_pci_mutex);
149 
150 		if (removed) {
151 			break;
152 		}
153 	}
154 
155 	/* besides checking the removed flag, we also need to wait
156 	 * for the dpdk detach function to unwind, as it's doing some
157 	 * operations even after calling our detach callback. Simply
158 	 * cancel the alarm - if it started executing already, this
159 	 * call will block and wait for it to finish.
160 	 */
161 	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
162 
163 	/* the device could have been finally removed, so just check
164 	 * it again.
165 	 */
166 	pthread_mutex_lock(&g_pci_mutex);
167 	removed = dev->internal.removed;
168 	pthread_mutex_unlock(&g_pci_mutex);
169 	if (!removed) {
170 		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
171 			    rte_dev->name);
172 		/* If we reach this state, then the device couldn't be removed and a subsequent
173 		   hot add of a device at the same BDF will most likely fail */
174 	}
175 }
176 
177 void
178 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
179 {
180 	struct spdk_pci_driver *driver;
181 
182 	driver = calloc(1, sizeof(*driver));
183 	if (!driver) {
184 		/* we can't do any better than bailing out at the moment */
185 		return;
186 	}
187 
188 	driver->name = name;
189 	driver->id_table = id_table;
190 	driver->drv_flags = flags;
191 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
192 }
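
/* Illustrative usage sketch: a driver registers itself with a zero-terminated
 * ID table. The driver name and the vendor/device IDs below are placeholders;
 * the terminating entry relies on vendor_id == 0, which is how
 * register_rte_driver() counts table entries.
 *
 *	static struct spdk_pci_id my_ids[] = {
 *		{ .vendor_id = 0x1234, .device_id = 0x5678 },
 *		{ .vendor_id = 0 }	// terminator
 *	};
 *
 *	spdk_pci_driver_register("my_driver", my_ids, SPDK_PCI_DRIVER_NEED_MAPPING);
 */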
193 
194 struct spdk_pci_driver *
195 spdk_pci_nvme_get_driver(void)
196 {
197 	return spdk_pci_get_driver("nvme");
198 }
199 
200 struct spdk_pci_driver *
201 spdk_pci_get_driver(const char *name)
202 {
203 	struct spdk_pci_driver *driver;
204 
205 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
206 		if (strcmp(driver->name, name) == 0) {
207 			return driver;
208 		}
209 	}
210 
211 	return NULL;
212 }
213 
214 static void
215 pci_device_rte_dev_event(const char *device_name,
216 			 enum rte_dev_event_type event,
217 			 void *cb_arg)
218 {
219 	struct spdk_pci_device *dev;
220 	bool can_detach = false;
221 
222 	switch (event) {
223 	default:
224 	case RTE_DEV_EVENT_ADD:
225 		/* Nothing to do here yet. */
226 		break;
227 	case RTE_DEV_EVENT_REMOVE:
228 		pthread_mutex_lock(&g_pci_mutex);
229 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
230 			struct rte_pci_device *rte_dev = dev->dev_handle;
231 
232 			if (strcmp(rte_dev->name, device_name) == 0 &&
233 			    !dev->internal.pending_removal) {
234 				can_detach = !dev->internal.attached;
235 				/* prevent any further attaches */
236 				dev->internal.pending_removal = true;
237 				break;
238 			}
239 		}
240 		pthread_mutex_unlock(&g_pci_mutex);
241 
242 		if (dev != NULL && can_detach) {
243 			/* If the device is not attached, we can remove it right away.
244 			 * Otherwise it will be removed at detach.
245 			 *
246 			 * Because the user's callback is invoked from the EAL interrupt
247 			 * callback, the interrupt callback has to finish before it can
248 			 * be unregistered as part of detaching the device. So we finish
249 			 * the callback quickly and use a deferred removal to detach the
250 			 * device. This is a workaround; once device detaching is moved
251 			 * into the EAL in the future, the deferred removal can be
252 			 * deleted.
253 			 */
254 			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
255 		}
256 		break;
257 	}
258 }
259 
260 static void
261 cleanup_pci_devices(void)
262 {
263 	struct spdk_pci_device *dev, *tmp;
264 
265 	pthread_mutex_lock(&g_pci_mutex);
266 	/* cleanup removed devices */
267 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
268 		if (!dev->internal.removed) {
269 			continue;
270 		}
271 
272 		vtophys_pci_device_removed(dev->dev_handle);
273 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
274 		free(dev);
275 	}
276 
277 	/* add newly-attached devices */
278 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
279 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
280 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
281 		vtophys_pci_device_added(dev->dev_handle);
282 	}
283 	pthread_mutex_unlock(&g_pci_mutex);
284 }
285 
286 static int scan_pci_bus(bool delay_init);
287 
288 /* translate an spdk_pci_driver into an rte_pci_driver and register it with DPDK */
289 static int
290 register_rte_driver(struct spdk_pci_driver *driver)
291 {
292 	unsigned pci_id_count = 0;
293 	struct rte_pci_id *rte_id_table;
294 	char *rte_name;
295 	size_t rte_name_len;
296 	uint32_t rte_flags;
297 
298 	assert(driver->id_table);
299 	while (driver->id_table[pci_id_count].vendor_id) {
300 		pci_id_count++;
301 	}
302 	assert(pci_id_count > 0);
303 
304 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
305 	if (!rte_id_table) {
306 		return -ENOMEM;
307 	}
308 
309 	while (pci_id_count > 0) {
310 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
311 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
312 
313 		rte_id->class_id = spdk_id->class_id;
314 		rte_id->vendor_id = spdk_id->vendor_id;
315 		rte_id->device_id = spdk_id->device_id;
316 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
317 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
318 		pci_id_count--;
319 	}
320 
321 	assert(driver->name);
322 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
323 	rte_name = calloc(rte_name_len, 1);
324 	if (!rte_name) {
325 		free(rte_id_table);
326 		return -ENOMEM;
327 	}
328 
329 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
330 	driver->driver.driver.name = rte_name;
331 	driver->driver.id_table = rte_id_table;
332 
333 	rte_flags = 0;
334 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
335 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
336 	}
337 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
338 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
339 	}
340 	driver->driver.drv_flags = rte_flags;
341 
342 	driver->driver.probe = pci_device_init;
343 	driver->driver.remove = pci_device_fini;
344 
345 	rte_pci_register(&driver->driver);
346 	return 0;
347 }
348 
349 static inline void
350 _pci_env_init(void)
351 {
352 	/* We assume devices were present on the bus for more than 2 seconds
353 	 * before initializing SPDK and there's no need to wait more. We scan
354 	 * the bus, but we don't block any devices.
355 	 */
356 	scan_pci_bus(false);
357 
358 	/* Register a single hotremove callback for all devices. */
359 	if (spdk_process_is_primary()) {
360 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
361 	}
362 }
363 
364 void
365 pci_env_init(void)
366 {
367 	struct spdk_pci_driver *driver;
368 
369 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
370 		register_rte_driver(driver);
371 	}
372 
373 	_pci_env_init();
374 }
375 
376 void
377 pci_env_reinit(void)
378 {
379 	/* There is no need to register pci drivers again, since they were
380 	 * already pre-registered in pci_env_init.
381 	 */
382 
383 	_pci_env_init();
384 }
385 
386 void
387 pci_env_fini(void)
388 {
389 	struct spdk_pci_device *dev;
390 	char bdf[32];
391 
392 	cleanup_pci_devices();
393 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
394 		if (dev->internal.attached) {
395 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
396 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
397 		}
398 	}
399 
400 	if (spdk_process_is_primary()) {
401 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
402 	}
403 }
404 
405 int
406 pci_device_init(struct rte_pci_driver *_drv,
407 		struct rte_pci_device *_dev)
408 {
409 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
410 	struct spdk_pci_device *dev;
411 	int rc;
412 
413 	dev = calloc(1, sizeof(*dev));
414 	if (dev == NULL) {
415 		return -1;
416 	}
417 
418 	dev->dev_handle = _dev;
419 
420 	dev->addr.domain = _dev->addr.domain;
421 	dev->addr.bus = _dev->addr.bus;
422 	dev->addr.dev = _dev->addr.devid;
423 	dev->addr.func = _dev->addr.function;
424 	dev->id.class_id = _dev->id.class_id;
425 	dev->id.vendor_id = _dev->id.vendor_id;
426 	dev->id.device_id = _dev->id.device_id;
427 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
428 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
429 	dev->socket_id = _dev->device.numa_node;
430 	dev->type = "pci";
431 
432 	dev->map_bar = map_bar_rte;
433 	dev->unmap_bar = unmap_bar_rte;
434 	dev->cfg_read = cfg_read_rte;
435 	dev->cfg_write = cfg_write_rte;
436 
437 	dev->internal.driver = driver;
438 	dev->internal.claim_fd = -1;
439 
440 	if (driver->cb_fn != NULL) {
441 		rc = driver->cb_fn(driver->cb_arg, dev);
442 		if (rc != 0) {
443 			free(dev);
444 			return rc;
445 		}
446 		dev->internal.attached = true;
447 	}
448 
449 	pthread_mutex_lock(&g_pci_mutex);
450 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
451 	pthread_mutex_unlock(&g_pci_mutex);
452 	return 0;
453 }
454 
455 static void
456 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
457 {
458 	struct env_devargs *env_da;
459 
460 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
461 	if (env_da == NULL) {
462 		env_da = calloc(1, sizeof(*env_da));
463 		if (env_da == NULL) {
464 			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
465 			return;
466 		}
467 		env_da->bus = rte_da->bus;
468 		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
469 		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
470 	}
471 
472 	env_da->allowed_at = tsc;
473 }
474 
475 static uint64_t
476 get_allowed_at(struct rte_devargs *rte_da)
477 {
478 	struct env_devargs *env_da;
479 
480 	env_da = find_env_devargs(rte_da->bus, rte_da->name);
481 	if (env_da) {
482 		return env_da->allowed_at;
483 	} else {
484 		return 0;
485 	}
486 }
487 
488 int
489 pci_device_fini(struct rte_pci_device *_dev)
490 {
491 	struct spdk_pci_device *dev;
492 
493 	pthread_mutex_lock(&g_pci_mutex);
494 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
495 		if (dev->dev_handle == _dev) {
496 			break;
497 		}
498 	}
499 
500 	if (dev == NULL || dev->internal.attached) {
501 		/* The device might still be referenced somewhere in SPDK. */
502 		pthread_mutex_unlock(&g_pci_mutex);
503 		return -EBUSY;
504 	}
505 
506 	/* remove our allowed_at option */
507 	if (_dev->device.devargs) {
508 		set_allowed_at(_dev->device.devargs, 0);
509 	}
510 
511 	/* It is possible that the removed flag was already set when there is a race
512 	 * between the remove notification for this process and another process
513 	 * that is also detaching from this same device (for example, when using
514 	 * the nvme driver in multi-process mode).  So do not assert here.  See
515 	 * #2456 for additional details.
516 	 */
517 	dev->internal.removed = true;
518 	pthread_mutex_unlock(&g_pci_mutex);
519 	return 0;
520 
521 }
522 
523 void
524 spdk_pci_device_detach(struct spdk_pci_device *dev)
525 {
526 	assert(dev->internal.attached);
527 
528 	if (dev->internal.claim_fd >= 0) {
529 		spdk_pci_device_unclaim(dev);
530 	}
531 
532 	dev->internal.attached = false;
533 	if (strcmp(dev->type, "pci") == 0) {
534 		/* if it's a physical device, we need to deal with DPDK in
535 		 * a different process, and we can't just unset one flag
536 		 * here. We also want to stop using any device resources
537 		 * so that the device isn't "in use" by the userspace driver
538 		 * once we detach it. This would allow attaching the device
539 		 * to a different process, or to a kernel driver like nvme.
540 		 */
541 		detach_rte(dev);
542 	}
543 
544 	cleanup_pci_devices();
545 }
546 
547 static int
548 scan_pci_bus(bool delay_init)
549 {
550 	struct spdk_pci_driver *driver;
551 	struct rte_pci_device *rte_dev;
552 	uint64_t now;
553 
554 	rte_bus_scan();
555 	now = spdk_get_ticks();
556 
557 	driver = TAILQ_FIRST(&g_pci_drivers);
558 	if (!driver) {
559 		return 0;
560 	}
561 
562 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
563 		struct rte_devargs *da;
564 
565 		da = rte_dev->device.devargs;
566 		if (!da) {
567 			char devargs_str[128];
568 
569 			/* the device was never blocked or allowed */
570 			da = calloc(1, sizeof(*da));
571 			if (!da) {
572 				return -1;
573 			}
574 
575 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
576 			if (rte_devargs_parse(da, devargs_str) != 0) {
577 				free(da);
578 				return -1;
579 			}
580 
581 			rte_devargs_insert(&da);
582 			rte_dev->device.devargs = da;
583 		}
584 
585 		if (get_allowed_at(da)) {
586 			uint64_t allowed_at = get_allowed_at(da);
587 
588 			/* this device was seen by spdk before... */
589 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
590 				da->policy = RTE_DEV_ALLOWED;
591 			}
592 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
593 			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
594 			/* override the policy only if not permanently blocked */
595 
596 			if (delay_init) {
597 				da->policy = RTE_DEV_BLOCKED;
598 				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
599 			} else {
600 				da->policy = RTE_DEV_ALLOWED;
601 				set_allowed_at(da, now);
602 			}
603 		}
604 	}
605 
606 	return 0;
607 }
608 
609 int
610 spdk_pci_device_attach(struct spdk_pci_driver *driver,
611 		       spdk_pci_enum_cb enum_cb,
612 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
613 {
614 	struct spdk_pci_device *dev;
615 	struct rte_pci_device *rte_dev;
616 	struct rte_devargs *da;
617 	int rc;
618 	char bdf[32];
619 
620 	spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
621 
622 	cleanup_pci_devices();
623 
624 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
625 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
626 			break;
627 		}
628 	}
629 
630 	if (dev != NULL && dev->internal.driver == driver) {
631 		pthread_mutex_lock(&g_pci_mutex);
632 		if (dev->internal.attached || dev->internal.pending_removal) {
633 			pthread_mutex_unlock(&g_pci_mutex);
634 			return -1;
635 		}
636 
637 		rc = enum_cb(enum_ctx, dev);
638 		if (rc == 0) {
639 			dev->internal.attached = true;
640 		}
641 		pthread_mutex_unlock(&g_pci_mutex);
642 		return rc;
643 	}
644 
645 	driver->cb_fn = enum_cb;
646 	driver->cb_arg = enum_ctx;
647 
648 	int i = 0;
649 
650 	do {
651 		rc = rte_eal_hotplug_add("pci", bdf, "");
652 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
653 
654 	if (i > 1 && rc == -EEXIST) {
655 		/* Even though the previous request timed out, the device
656 		 * was attached successfully.
657 		 */
658 		rc = 0;
659 	}
660 
661 	driver->cb_arg = NULL;
662 	driver->cb_fn = NULL;
663 
664 	cleanup_pci_devices();
665 
666 	if (rc != 0) {
667 		return -1;
668 	}
669 
670 	/* explicit attach ignores the allowlist, so if we blocked this
671 	 * device before, let's enable it now - just for clarity.
672 	 */
673 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
674 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
675 			break;
676 		}
677 	}
678 	assert(dev != NULL);
679 
680 	rte_dev = dev->dev_handle;
681 	da = rte_dev->device.devargs;
682 	if (da && get_allowed_at(da)) {
683 		set_allowed_at(da, spdk_get_ticks());
684 		da->policy = RTE_DEV_ALLOWED;
685 	}
686 
687 	return 0;
688 }
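
/* Illustrative usage sketch: attach to a single device by BDF. The callback
 * name and the address are placeholders; it assumes the "nvme" driver has
 * been registered. Returning 0 from the callback marks the device as
 * attached; it must later be released with spdk_pci_device_detach().
 *
 *	static int attach_cb(void *ctx, struct spdk_pci_device *dev) { return 0; }
 *
 *	struct spdk_pci_addr addr;
 *
 *	if (spdk_pci_addr_parse(&addr, "0000:01:00.0") == 0) {
 *		spdk_pci_device_attach(spdk_pci_nvme_get_driver(), attach_cb, NULL, &addr);
 *	}
 */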
689 
690 /* Note: You can safely call spdk_pci_enumerate from more than one thread
691  *       simultaneously, but you cannot call spdk_pci_enumerate
692  *       and rte_bus_probe simultaneously.
693  */
694 int
695 spdk_pci_enumerate(struct spdk_pci_driver *driver,
696 		   spdk_pci_enum_cb enum_cb,
697 		   void *enum_ctx)
698 {
699 	struct spdk_pci_device *dev;
700 	int rc;
701 
702 	cleanup_pci_devices();
703 
704 	pthread_mutex_lock(&g_pci_mutex);
705 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
706 		if (dev->internal.attached ||
707 		    dev->internal.driver != driver ||
708 		    dev->internal.pending_removal) {
709 			continue;
710 		}
711 
712 		rc = enum_cb(enum_ctx, dev);
713 		if (rc == 0) {
714 			dev->internal.attached = true;
715 		} else if (rc < 0) {
716 			pthread_mutex_unlock(&g_pci_mutex);
717 			return -1;
718 		}
719 	}
720 	pthread_mutex_unlock(&g_pci_mutex);
721 
722 	if (scan_pci_bus(true) != 0) {
723 		return -1;
724 	}
725 
726 	driver->cb_fn = enum_cb;
727 	driver->cb_arg = enum_ctx;
728 
729 	if (rte_bus_probe() != 0) {
730 		driver->cb_arg = NULL;
731 		driver->cb_fn = NULL;
732 		return -1;
733 	}
734 
735 	driver->cb_arg = NULL;
736 	driver->cb_fn = NULL;
737 
738 	cleanup_pci_devices();
739 	return 0;
740 }
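
/* Illustrative usage sketch: enumerate all unattached devices matching a
 * registered driver. The callback name and the vendor filter are placeholders.
 * Per the loop above, returning 0 attaches the device, a positive value skips
 * it, and a negative value aborts the scan of already-known devices.
 *
 *	static int probe_cb(void *ctx, struct spdk_pci_device *dev)
 *	{
 *		if (spdk_pci_device_get_vendor_id(dev) != 0x8086) {
 *			return 1;	// skip this device
 *		}
 *		return 0;		// attach to this device
 *	}
 *
 *	spdk_pci_enumerate(spdk_pci_nvme_get_driver(), probe_cb, NULL);
 */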
741 
742 void
743 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
744 {
745 	struct spdk_pci_device *dev;
746 
747 	pthread_mutex_lock(&g_pci_mutex);
748 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
749 		fn(ctx, dev);
750 	}
751 	pthread_mutex_unlock(&g_pci_mutex);
752 }
753 
754 int
755 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
756 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
757 {
758 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
759 }
760 
761 int
762 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
763 {
764 	return dev->unmap_bar(dev, bar, addr);
765 }
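
/* Illustrative usage sketch: map BAR 0, use the registers, then unmap. The
 * variable names are placeholders. With the rte-backed callbacks above, the
 * map call simply returns the address, physical address and length that DPDK
 * already mapped for the BAR.
 *
 *	void *regs;
 *	uint64_t phys_addr, size;
 *
 *	if (spdk_pci_device_map_bar(dev, 0, &regs, &phys_addr, &size) == 0) {
 *		// ... access device registers through "regs" ...
 *		spdk_pci_device_unmap_bar(dev, 0, regs);
 *	}
 */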
766 
767 int
768 spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
769 {
770 	struct rte_pci_device *rte_dev = dev->dev_handle;
771 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
772 	return rte_intr_enable(&rte_dev->intr_handle);
773 #else
774 	return rte_intr_enable(rte_dev->intr_handle);
775 #endif
776 }
777 
778 int
779 spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
780 {
781 	struct rte_pci_device *rte_dev = dev->dev_handle;
782 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
783 	return rte_intr_disable(&rte_dev->intr_handle);
784 #else
785 	return rte_intr_disable(rte_dev->intr_handle);
786 #endif
787 }
788 
789 int
790 spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
791 {
792 	struct rte_pci_device *rte_dev = dev->dev_handle;
793 #if RTE_VERSION < RTE_VERSION_NUM(21, 11, 0, 0)
794 	return rte_dev->intr_handle.fd;
795 #else
796 	return rte_intr_fd_get(rte_dev->intr_handle);
797 #endif
798 }
799 
800 uint32_t
801 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
802 {
803 	return dev->addr.domain;
804 }
805 
806 uint8_t
807 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
808 {
809 	return dev->addr.bus;
810 }
811 
812 uint8_t
813 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
814 {
815 	return dev->addr.dev;
816 }
817 
818 uint8_t
819 spdk_pci_device_get_func(struct spdk_pci_device *dev)
820 {
821 	return dev->addr.func;
822 }
823 
824 uint16_t
825 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
826 {
827 	return dev->id.vendor_id;
828 }
829 
830 uint16_t
831 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
832 {
833 	return dev->id.device_id;
834 }
835 
836 uint16_t
837 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
838 {
839 	return dev->id.subvendor_id;
840 }
841 
842 uint16_t
843 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
844 {
845 	return dev->id.subdevice_id;
846 }
847 
848 struct spdk_pci_id
849 spdk_pci_device_get_id(struct spdk_pci_device *dev)
850 {
851 	return dev->id;
852 }
853 
854 int
855 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
856 {
857 	return dev->socket_id;
858 }
859 
860 int
861 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
862 {
863 	return dev->cfg_read(dev, value, len, offset);
864 }
865 
866 int
867 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
868 {
869 	return dev->cfg_write(dev, value, len, offset);
870 }
871 
872 int
873 spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
874 {
875 	return spdk_pci_device_cfg_read(dev, value, 1, offset);
876 }
877 
878 int
879 spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
880 {
881 	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
882 }
883 
884 int
885 spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
886 {
887 	return spdk_pci_device_cfg_read(dev, value, 2, offset);
888 }
889 
890 int
891 spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
892 {
893 	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
894 }
895 
896 int
897 spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
898 {
899 	return spdk_pci_device_cfg_read(dev, value, 4, offset);
900 }
901 
902 int
903 spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
904 {
905 	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
906 }
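
/* Illustrative usage sketch: read the 16-bit vendor ID from standard config
 * space offset 0x0 using the fixed-width wrapper above.
 *
 *	uint16_t vendor_id;
 *
 *	if (spdk_pci_device_cfg_read16(dev, &vendor_id, 0x0) == 0) {
 *		// vendor_id now holds the value from PCI config space offset 0x0
 *	}
 */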
907 
908 int
909 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
910 {
911 	int err;
912 	uint32_t pos, header = 0;
913 	uint32_t i, buf[2];
914 
915 	if (len < 17) {
916 		return -1;
917 	}
918 
919 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
920 	if (err || !header) {
921 		return -1;
922 	}
923 
924 	pos = PCI_CFG_SIZE;
925 	while (1) {
926 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
927 			if (pos) {
928 				/* skip the header */
929 				pos += 4;
930 				for (i = 0; i < 2; i++) {
931 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
932 					if (err) {
933 						return -1;
934 					}
935 				}
936 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
937 				return 0;
938 			}
939 		}
940 		pos = (header >> 20) & 0xffc;
941 		/* 0 if no other items exist */
942 		if (pos < PCI_CFG_SIZE) {
943 			return -1;
944 		}
945 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
946 		if (err) {
947 			return -1;
948 		}
949 	}
950 	return -1;
951 }
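
/* Illustrative usage sketch: the caller's buffer must hold 16 hex characters
 * plus the terminating NUL, hence the len < 17 check above.
 *
 *	char sn[17];
 *
 *	if (spdk_pci_device_get_serial_number(dev, sn, sizeof(sn)) == 0) {
 *		// sn now holds the serial number read from the PCIe Device Serial
 *		// Number extended capability, e.g. "0123456789abcdef"
 *	}
 */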
952 
953 struct spdk_pci_addr
954 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
955 {
956 	return dev->addr;
957 }
958 
959 bool
960 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
961 {
962 	return dev->internal.pending_removal;
963 }
964 
965 int
966 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
967 {
968 	if (a1->domain > a2->domain) {
969 		return 1;
970 	} else if (a1->domain < a2->domain) {
971 		return -1;
972 	} else if (a1->bus > a2->bus) {
973 		return 1;
974 	} else if (a1->bus < a2->bus) {
975 		return -1;
976 	} else if (a1->dev > a2->dev) {
977 		return 1;
978 	} else if (a1->dev < a2->dev) {
979 		return -1;
980 	} else if (a1->func > a2->func) {
981 		return 1;
982 	} else if (a1->func < a2->func) {
983 		return -1;
984 	}
985 
986 	return 0;
987 }
988 
989 #ifdef __linux__
990 int
991 spdk_pci_device_claim(struct spdk_pci_device *dev)
992 {
993 	int dev_fd;
994 	char dev_name[64];
995 	int pid;
996 	void *dev_map;
997 	struct flock pcidev_lock = {
998 		.l_type = F_WRLCK,
999 		.l_whence = SEEK_SET,
1000 		.l_start = 0,
1001 		.l_len = 0,
1002 	};
1003 
1004 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1005 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1006 
1007 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1008 	if (dev_fd == -1) {
1009 		SPDK_ERRLOG("could not open %s\n", dev_name);
1010 		return -errno;
1011 	}
1012 
1013 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1014 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1015 		close(dev_fd);
1016 		return -errno;
1017 	}
1018 
1019 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1020 		       MAP_SHARED, dev_fd, 0);
1021 	if (dev_map == MAP_FAILED) {
1022 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1023 		close(dev_fd);
1024 		return -errno;
1025 	}
1026 
1027 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1028 		pid = *(int *)dev_map;
1029 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1030 			    " process %d has claimed it\n", dev_name, pid);
1031 		munmap(dev_map, sizeof(int));
1032 		close(dev_fd);
1033 		/* F_SETLK returns unspecified errnos, normalize them */
1034 		return -EACCES;
1035 	}
1036 
1037 	*(int *)dev_map = (int)getpid();
1038 	munmap(dev_map, sizeof(int));
1039 	dev->internal.claim_fd = dev_fd;
1040 	/* Keep dev_fd open to maintain the lock. */
1041 	return 0;
1042 }
1043 
1044 void
1045 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1046 {
1047 	char dev_name[64];
1048 
1049 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1050 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1051 
1052 	close(dev->internal.claim_fd);
1053 	dev->internal.claim_fd = -1;
1054 	unlink(dev_name);
1055 }
1056 #else /* !__linux__ */
1057 int
1058 spdk_pci_device_claim(struct spdk_pci_device *dev)
1059 {
1060 	/* TODO */
1061 	return 0;
1062 }
1063 
1064 void
1065 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1066 {
1067 	/* TODO */
1068 }
1069 #endif /* __linux__ */
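
/* Illustrative usage sketch: claiming takes an fcntl() write lock on a
 * per-BDF file under /var/tmp (see spdk_pci_device_claim above), which keeps
 * a second process from attaching to the same device. This applies to Linux
 * only; the non-Linux stub above always succeeds.
 *
 *	if (spdk_pci_device_claim(dev) != 0) {
 *		// another process already owns this device
 *	} else {
 *		// ... use the device ...
 *		spdk_pci_device_unclaim(dev);
 *	}
 */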
1070 
1071 int
1072 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1073 {
1074 	unsigned domain, bus, dev, func;
1075 
1076 	if (addr == NULL || bdf == NULL) {
1077 		return -EINVAL;
1078 	}
1079 
1080 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1081 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1082 		/* Matched a full address - all variables are initialized */
1083 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1084 		func = 0;
1085 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1086 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1087 		domain = 0;
1088 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1089 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1090 		domain = 0;
1091 		func = 0;
1092 	} else {
1093 		return -EINVAL;
1094 	}
1095 
1096 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1097 		return -EINVAL;
1098 	}
1099 
1100 	addr->domain = domain;
1101 	addr->bus = bus;
1102 	addr->dev = dev;
1103 	addr->func = func;
1104 
1105 	return 0;
1106 }
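
/* Illustrative examples: all of the following strings are accepted by
 * spdk_pci_addr_parse() and yield domain 0, bus 0x01, device 0x00, function 0.
 * spdk_pci_addr_fmt() below performs the reverse conversion and always emits
 * the fully qualified "0000:01:00.0" form.
 *
 *	struct spdk_pci_addr addr;
 *
 *	spdk_pci_addr_parse(&addr, "0000:01:00.0");
 *	spdk_pci_addr_parse(&addr, "0000.01.00.0");
 *	spdk_pci_addr_parse(&addr, "01:00.0");
 *	spdk_pci_addr_parse(&addr, "01:00");
 */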
1107 
1108 int
1109 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1110 {
1111 	int rc;
1112 
1113 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1114 		      addr->domain, addr->bus,
1115 		      addr->dev, addr->func);
1116 
1117 	if (rc > 0 && (size_t)rc < sz) {
1118 		return 0;
1119 	}
1120 
1121 	return -1;
1122 }
1123 
1124 void
1125 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1126 {
1127 	assert(dev->map_bar != NULL);
1128 	assert(dev->unmap_bar != NULL);
1129 	assert(dev->cfg_read != NULL);
1130 	assert(dev->cfg_write != NULL);
1131 	dev->internal.driver = drv;
1132 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1133 }
1134 
1135 void
1136 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1137 {
1138 	assert(!dev->internal.attached);
1139 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1140 }
1141 
1142 const char *
1143 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1144 {
1145 	return dev->type;
1146 }
1147 
1148 int
1149 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1150 {
1151 	struct rte_devargs *da;
1152 	char devargs_str[128];
1153 
1154 	da = calloc(1, sizeof(*da));
1155 	if (da == NULL) {
1156 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1157 		return -ENOMEM;
1158 	}
1159 
1160 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1161 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1162 	if (rte_devargs_parse(da, devargs_str) != 0) {
1163 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1164 		free(da);
1165 		return -EINVAL;
1166 	}
1167 	da->policy = RTE_DEV_ALLOWED;
1168 	/* Note: if a devargs already exists for this device address, it just gets
1169 	 * overridden.  So we do not need to check if the devargs already exists.
1170 	 * DPDK will take care of memory management for the devargs structure after
1171 	 * it has been inserted, so there's nothing SPDK needs to track.
1172 	 */
1173 	if (rte_devargs_insert(&da) != 0) {
1174 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1175 		free(da);
1176 		return -EINVAL;
1177 	}
1178 
1179 	return 0;
1180 }
1181