xref: /spdk/lib/env_dpdk/pci.c (revision 0e3de45def885b06193996d849acfa24d1135d92)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "env_internal.h"
35 
36 #include <rte_alarm.h>
37 #include <rte_devargs.h>
38 #include "spdk/env.h"
39 #include "spdk/log.h"
40 
41 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
42 
43 /* Compatibility for DPDK versions < 20.11 */
44 #if RTE_VERSION < RTE_VERSION_NUM(20, 11, 0, 0)
45 #define RTE_DEV_ALLOWED RTE_DEV_WHITELISTED
46 #define RTE_DEV_BLOCKED RTE_DEV_BLACKLISTED
47 #define RTE_BUS_SCAN_ALLOWLIST RTE_BUS_SCAN_WHITELIST
48 #endif
49 
50 #define PCI_CFG_SIZE		256	/* size of the legacy PCI config space */
51 #define PCI_EXT_CAP_ID_SN	0x03	/* Device Serial Number extended capability ID */
52 
53 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
54  * might cause the internal IPC to misbehave. Just retry in such case.
55  * might cause the internal IPC to misbehave. Just retry in such a case.
56 #define DPDK_HOTPLUG_RETRY_COUNT 4
57 
58 /* DPDK alarm/interrupt thread */
59 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
60 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
61 /* devices hotplugged on a dpdk thread */
62 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
63 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
64 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
65 
66 static int
67 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
68 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
69 {
70 	struct rte_pci_device *dev = device->dev_handle;
71 
72 	*mapped_addr = dev->mem_resource[bar].addr;
73 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
74 	*size = (uint64_t)dev->mem_resource[bar].len;
75 
76 	return 0;
77 }
78 
79 static int
80 unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
81 {
82 	return 0;
83 }
84 
85 static int
86 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
87 {
88 	int rc;
89 
90 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
91 
92 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
93 }
94 
95 static int
96 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
97 {
98 	int rc;
99 
100 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
101 
102 #ifdef __FreeBSD__
103 	/* DPDK returns 0 on success and -1 on failure */
104 	return rc;
105 #endif
106 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
107 }
108 
109 static void
110 remove_rte_dev(struct rte_pci_device *rte_dev)
111 {
112 	char bdf[32];
113 	int i = 0, rc;
114 
115 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
116 	do {
117 		rc = rte_eal_hotplug_remove("pci", bdf);
118 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
119 }
120 
121 static void
122 detach_rte_cb(void *_dev)
123 {
124 	remove_rte_dev(_dev);
125 }
126 
127 static void
128 detach_rte(struct spdk_pci_device *dev)
129 {
130 	struct rte_pci_device *rte_dev = dev->dev_handle;
131 	int i;
132 	bool removed;
133 
134 	if (!spdk_process_is_primary()) {
135 		remove_rte_dev(rte_dev);
136 		return;
137 	}
138 
139 	pthread_mutex_lock(&g_pci_mutex);
140 	dev->internal.attached = false;
141 	/* prevent the hotremove notification from removing this device */
142 	dev->internal.pending_removal = true;
143 	pthread_mutex_unlock(&g_pci_mutex);
144 
145 	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
146 
147 	/* wait up to 2s for the callback to execute */
148 	for (i = 2000; i > 0; i--) {
149 
150 		spdk_delay_us(1000);
151 		pthread_mutex_lock(&g_pci_mutex);
152 		removed = dev->internal.removed;
153 		pthread_mutex_unlock(&g_pci_mutex);
154 
155 		if (removed) {
156 			break;
157 		}
158 	}
159 
160 	/* Besides checking the removed flag, we also need to wait
161 	 * for the DPDK detach function to unwind, as it performs some
162 	 * operations even after calling our detach callback. Simply
163 	 * cancel the alarm; if it has already started executing, this
164 	 * call will block and wait for it to finish.
165 	 */
166 	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
167 
168 	/* the device could have been removed by now, so just check
169 	 * the flag again.
170 	 */
171 	pthread_mutex_lock(&g_pci_mutex);
172 	removed = dev->internal.removed;
173 	pthread_mutex_unlock(&g_pci_mutex);
174 	if (!removed) {
175 		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
176 			    rte_dev->name);
177 		/* If we reach this state, then the device couldn't be removed and most likely
178 		   a subsequent hot add of a device at the same BDF will fail. */
179 	}
180 }
181 
182 void
183 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
184 {
185 	struct spdk_pci_driver *driver;
186 
187 	driver = calloc(1, sizeof(*driver));
188 	if (!driver) {
189 		/* we can't do any better than bail out at the moment */
190 		return;
191 	}
192 
193 	driver->name = name;
194 	driver->id_table = id_table;
195 	driver->drv_flags = flags;
196 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
197 }
198 
199 struct spdk_pci_driver *
200 spdk_pci_nvme_get_driver(void)
201 {
202 	return spdk_pci_get_driver("nvme");
203 }
204 
205 struct spdk_pci_driver *
206 spdk_pci_get_driver(const char *name)
207 {
208 	struct spdk_pci_driver *driver;
209 
210 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
211 		if (strcmp(driver->name, name) == 0) {
212 			return driver;
213 		}
214 	}
215 
216 	return NULL;
217 }
218 
219 static void
220 pci_device_rte_dev_event(const char *device_name,
221 			 enum rte_dev_event_type event,
222 			 void *cb_arg)
223 {
224 	struct spdk_pci_device *dev;
225 	bool can_detach = false;
226 
227 	switch (event) {
228 	default:
229 	case RTE_DEV_EVENT_ADD:
230 		/* Nothing to do here yet. */
231 		break;
232 	case RTE_DEV_EVENT_REMOVE:
233 		pthread_mutex_lock(&g_pci_mutex);
234 		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
235 			struct rte_pci_device *rte_dev = dev->dev_handle;
236 
237 			if (strcmp(rte_dev->name, device_name) == 0 &&
238 			    !dev->internal.pending_removal) {
239 				can_detach = !dev->internal.attached;
240 				/* prevent any further attaches */
241 				dev->internal.pending_removal = true;
242 				break;
243 			}
244 		}
245 		pthread_mutex_unlock(&g_pci_mutex);
246 
247 		if (dev != NULL && can_detach) {
248 			/* If the device is not attached, we can remove it right away.
249 			 * Otherwise it will be removed at detach.
250 			 *
251 			 * Because the user's callback is invoked from the EAL interrupt
252 			 * callback, that interrupt callback must finish before it can be
253 			 * unregistered when detaching the device. So we finish our
254 			 * callback quickly and use a deferred removal to actually detach
255 			 * the device. This is a workaround; once device detaching is
256 			 * moved into the EAL in the future, the deferred removal can be
257 			 * deleted.
258 			 */
259 			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
260 		}
261 		break;
262 	}
263 }
264 
265 static void
266 cleanup_pci_devices(void)
267 {
268 	struct spdk_pci_device *dev, *tmp;
269 
270 	pthread_mutex_lock(&g_pci_mutex);
271 	/* cleanup removed devices */
272 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
273 		if (!dev->internal.removed) {
274 			continue;
275 		}
276 
277 		vtophys_pci_device_removed(dev->dev_handle);
278 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
279 		free(dev);
280 	}
281 
282 	/* add newly-attached devices */
283 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
284 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
285 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
286 		vtophys_pci_device_added(dev->dev_handle);
287 	}
288 	pthread_mutex_unlock(&g_pci_mutex);
289 }
290 
291 static int scan_pci_bus(bool delay_init);
292 
293 /* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */
294 static int
295 register_rte_driver(struct spdk_pci_driver *driver)
296 {
297 	unsigned pci_id_count = 0;
298 	struct rte_pci_id *rte_id_table;
299 	char *rte_name;
300 	size_t rte_name_len;
301 	uint32_t rte_flags;
302 
303 	assert(driver->id_table);
304 	while (driver->id_table[pci_id_count].vendor_id) {
305 		pci_id_count++;
306 	}
307 	assert(pci_id_count > 0);
308 
309 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
310 	if (!rte_id_table) {
311 		return -ENOMEM;
312 	}
313 
314 	while (pci_id_count > 0) {
315 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
316 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
317 
318 		rte_id->class_id = spdk_id->class_id;
319 		rte_id->vendor_id = spdk_id->vendor_id;
320 		rte_id->device_id = spdk_id->device_id;
321 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
322 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
323 		pci_id_count--;
324 	}
325 
326 	assert(driver->name);
327 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
328 	rte_name = calloc(rte_name_len, 1);
329 	if (!rte_name) {
330 		free(rte_id_table);
331 		return -ENOMEM;
332 	}
333 
334 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
335 	driver->driver.driver.name = rte_name;
336 	driver->driver.id_table = rte_id_table;
337 
338 	rte_flags = 0;
339 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
340 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
341 	}
342 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
343 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
344 	}
345 	driver->driver.drv_flags = rte_flags;
346 
347 	driver->driver.probe = pci_device_init;
348 	driver->driver.remove = pci_device_fini;
349 
350 	rte_pci_register(&driver->driver);
351 	return 0;
352 }
353 
354 static inline void
355 _pci_env_init(void)
356 {
357 	/* We assume devices were present on the bus for more than 2 seconds
358 	 * before initializing SPDK and there's no need to wait more. We scan
359 	 * the bus, but we don't block any devices.
360 	 */
361 	scan_pci_bus(false);
362 
363 	/* Register a single hotremove callback for all devices. */
364 	if (spdk_process_is_primary()) {
365 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
366 	}
367 }
368 
369 void
370 pci_env_init(void)
371 {
372 	struct spdk_pci_driver *driver;
373 
374 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
375 		register_rte_driver(driver);
376 	}
377 
378 	_pci_env_init();
379 }
380 
381 void
382 pci_env_reinit(void)
383 {
384 	/* There is no need to register pci drivers again, since they were
385 	 * already pre-registered in pci_env_init.
386 	 */
387 
388 	_pci_env_init();
389 }
390 
391 void
392 pci_env_fini(void)
393 {
394 	struct spdk_pci_device *dev;
395 	char bdf[32];
396 
397 	cleanup_pci_devices();
398 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
399 		if (dev->internal.attached) {
400 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
401 			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
402 		}
403 	}
404 
405 	if (spdk_process_is_primary()) {
406 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
407 	}
408 }
409 
410 int
411 pci_device_init(struct rte_pci_driver *_drv,
412 		struct rte_pci_device *_dev)
413 {
414 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
415 	struct spdk_pci_device *dev;
416 	int rc;
417 
418 	dev = calloc(1, sizeof(*dev));
419 	if (dev == NULL) {
420 		return -1;
421 	}
422 
423 	dev->dev_handle = _dev;
424 
425 	dev->addr.domain = _dev->addr.domain;
426 	dev->addr.bus = _dev->addr.bus;
427 	dev->addr.dev = _dev->addr.devid;
428 	dev->addr.func = _dev->addr.function;
429 	dev->id.class_id = _dev->id.class_id;
430 	dev->id.vendor_id = _dev->id.vendor_id;
431 	dev->id.device_id = _dev->id.device_id;
432 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
433 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
434 	dev->socket_id = _dev->device.numa_node;
435 	dev->type = "pci";
436 
437 	dev->map_bar = map_bar_rte;
438 	dev->unmap_bar = unmap_bar_rte;
439 	dev->cfg_read = cfg_read_rte;
440 	dev->cfg_write = cfg_write_rte;
441 
442 	dev->internal.driver = driver;
443 	dev->internal.claim_fd = -1;
444 
445 	if (driver->cb_fn != NULL) {
446 		rc = driver->cb_fn(driver->cb_arg, dev);
447 		if (rc != 0) {
448 			free(dev);
449 			return rc;
450 		}
451 		dev->internal.attached = true;
452 	}
453 
454 	pthread_mutex_lock(&g_pci_mutex);
455 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
456 	pthread_mutex_unlock(&g_pci_mutex);
457 	return 0;
458 }
459 
460 int
461 pci_device_fini(struct rte_pci_device *_dev)
462 {
463 	struct spdk_pci_device *dev;
464 
465 	pthread_mutex_lock(&g_pci_mutex);
466 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
467 		if (dev->dev_handle == _dev) {
468 			break;
469 		}
470 	}
471 
472 	if (dev == NULL || dev->internal.attached) {
473 		/* The device might still be referenced somewhere in SPDK. */
474 		pthread_mutex_unlock(&g_pci_mutex);
475 		return -1;
476 	}
477 
478 	/* remove our allowed_at option */
479 	if (_dev->device.devargs) {
480 		_dev->device.devargs->data = NULL;
481 	}
482 
483 	assert(!dev->internal.removed);
484 	dev->internal.removed = true;
485 	pthread_mutex_unlock(&g_pci_mutex);
486 	return 0;
487 
488 }
489 
490 void
491 spdk_pci_device_detach(struct spdk_pci_device *dev)
492 {
493 	assert(dev->internal.attached);
494 
495 	if (dev->internal.claim_fd >= 0) {
496 		spdk_pci_device_unclaim(dev);
497 	}
498 
499 	if (strcmp(dev->type, "pci") == 0) {
500 		/* If it's a physical device we need to coordinate with DPDK,
501 		 * possibly in a different process, and we can't just unset one flag
502 		 * here. We also want to stop using any device resources
503 		 * so that the device isn't "in use" by the userspace driver
504 		 * once we detach it. This would allow attaching the device
505 		 * to a different process, or to a kernel driver like nvme.
506 		 */
507 		detach_rte(dev);
508 	} else {
509 		dev->internal.attached = false;
510 	}
511 
512 	cleanup_pci_devices();
513 }
514 
515 static int
516 scan_pci_bus(bool delay_init)
517 {
518 	struct spdk_pci_driver *driver;
519 	struct rte_pci_device *rte_dev;
520 	uint64_t now;
521 
522 	rte_bus_scan();
523 	now = spdk_get_ticks();
524 
525 	driver = TAILQ_FIRST(&g_pci_drivers);
526 	if (!driver) {
527 		return 0;
528 	}
529 
530 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
531 		struct rte_devargs *da;
532 
533 		da = rte_dev->device.devargs;
534 		if (!da) {
535 			char devargs_str[128];
536 
537 			/* the device was never blocked or allowed */
538 			da = calloc(1, sizeof(*da));
539 			if (!da) {
540 				return -1;
541 			}
542 
543 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
544 			if (rte_devargs_parse(da, devargs_str) != 0) {
545 				free(da);
546 				return -1;
547 			}
548 
549 			rte_devargs_insert(&da);
550 			rte_dev->device.devargs = da;
551 		}
552 
553 		if (da->data) {
554 			uint64_t allowed_at = (uint64_t)(uintptr_t)da->data;
555 
556 			/* this device was seen by spdk before... */
557 			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
558 				da->policy = RTE_DEV_ALLOWED;
559 			}
560 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_ALLOWLIST &&
561 			    da->policy == RTE_DEV_ALLOWED) || da->policy != RTE_DEV_BLOCKED) {
562 			/* override the policy only if not permanently blocked */
563 
564 			if (delay_init) {
565 				da->policy = RTE_DEV_BLOCKED;
566 				da->data = (void *)(uintptr_t)(now + 2 * spdk_get_ticks_hz());
567 			} else {
568 				da->policy = RTE_DEV_ALLOWED;
569 				da->data = (void *)(uintptr_t)now;
570 			}
571 		}
572 	}
573 
574 	return 0;
575 }
576 
577 int
578 spdk_pci_device_attach(struct spdk_pci_driver *driver,
579 		       spdk_pci_enum_cb enum_cb,
580 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
581 {
582 	struct spdk_pci_device *dev;
583 	struct rte_pci_device *rte_dev;
584 	struct rte_devargs *da;
585 	int rc;
586 	char bdf[32];
587 
588 	spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
589 
590 	cleanup_pci_devices();
591 
592 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
593 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
594 			break;
595 		}
596 	}
597 
598 	if (dev != NULL && dev->internal.driver == driver) {
599 		pthread_mutex_lock(&g_pci_mutex);
600 		if (dev->internal.attached || dev->internal.pending_removal) {
601 			pthread_mutex_unlock(&g_pci_mutex);
602 			return -1;
603 		}
604 
605 		rc = enum_cb(enum_ctx, dev);
606 		if (rc == 0) {
607 			dev->internal.attached = true;
608 		}
609 		pthread_mutex_unlock(&g_pci_mutex);
610 		return rc;
611 	}
612 
613 	driver->cb_fn = enum_cb;
614 	driver->cb_arg = enum_ctx;
615 
616 	int i = 0;
617 
618 	do {
619 		rc = rte_eal_hotplug_add("pci", bdf, "");
620 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
621 
622 	if (i > 1 && rc == -EEXIST) {
623 		/* Even though the previous request timed out, the device
624 		 * was attached successfully.
625 		 */
626 		rc = 0;
627 	}
628 
629 	driver->cb_arg = NULL;
630 	driver->cb_fn = NULL;
631 
632 	cleanup_pci_devices();
633 
634 	if (rc != 0) {
635 		return -1;
636 	}
637 
638 	/* Explicit attach ignores the allowlist, so if we previously blocked
639 	 * this device, allow it now - just for clarity.
640 	 */
641 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
642 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
643 			break;
644 		}
645 	}
646 	assert(dev != NULL);
647 
648 	rte_dev = dev->dev_handle;
649 	da = rte_dev->device.devargs;
650 	if (da && da->data) {
651 		da->data = (void *)(uintptr_t)spdk_get_ticks();
652 		da->policy = RTE_DEV_ALLOWED;
653 	}
654 
655 	return 0;
656 }
657 
658 /* Note: You can call spdk_pci_enumerate from more than one thread
659  *       simultaneously safely, but you cannot call spdk_pci_enumerate
660  *       and DPDK's rte_bus_probe() simultaneously.
661  */
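/*
 * A minimal usage sketch (hypothetical application code, not part of this
 * library): the enumeration callback decides per device whether to attach.
 * Returning 0 attaches the device, a positive value leaves it unattached,
 * and a negative value aborts the enumeration.
 *
 *	static int
 *	my_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
 *	{
 *		if (spdk_pci_device_get_vendor_id(pci_dev) != 0x8086) {
 *			return 1;
 *		}
 *		return 0;
 *	}
 *
 *	if (spdk_pci_enumerate(spdk_pci_nvme_get_driver(), my_enum_cb, NULL) != 0) {
 *		SPDK_ERRLOG("PCI enumeration failed\n");
 *	}
 */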
662 int
663 spdk_pci_enumerate(struct spdk_pci_driver *driver,
664 		   spdk_pci_enum_cb enum_cb,
665 		   void *enum_ctx)
666 {
667 	struct spdk_pci_device *dev;
668 	int rc;
669 
670 	cleanup_pci_devices();
671 
672 	pthread_mutex_lock(&g_pci_mutex);
673 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
674 		if (dev->internal.attached ||
675 		    dev->internal.driver != driver ||
676 		    dev->internal.pending_removal) {
677 			continue;
678 		}
679 
680 		rc = enum_cb(enum_ctx, dev);
681 		if (rc == 0) {
682 			dev->internal.attached = true;
683 		} else if (rc < 0) {
684 			pthread_mutex_unlock(&g_pci_mutex);
685 			return -1;
686 		}
687 	}
688 	pthread_mutex_unlock(&g_pci_mutex);
689 
690 	if (scan_pci_bus(true) != 0) {
691 		return -1;
692 	}
693 
694 	driver->cb_fn = enum_cb;
695 	driver->cb_arg = enum_ctx;
696 
697 	if (rte_bus_probe() != 0) {
698 		driver->cb_arg = NULL;
699 		driver->cb_fn = NULL;
700 		return -1;
701 	}
702 
703 	driver->cb_arg = NULL;
704 	driver->cb_fn = NULL;
705 
706 	cleanup_pci_devices();
707 	return 0;
708 }
709 
710 struct spdk_pci_device *
711 spdk_pci_get_first_device(void)
712 {
713 	return TAILQ_FIRST(&g_pci_devices);
714 }
715 
716 struct spdk_pci_device *
717 spdk_pci_get_next_device(struct spdk_pci_device *prev)
718 {
719 	return TAILQ_NEXT(prev, internal.tailq);
720 }
721 
722 int
723 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
724 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
725 {
726 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
727 }
728 
729 int
730 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
731 {
732 	return dev->unmap_bar(dev, bar, addr);
733 }
734 
735 uint32_t
736 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
737 {
738 	return dev->addr.domain;
739 }
740 
741 uint8_t
742 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
743 {
744 	return dev->addr.bus;
745 }
746 
747 uint8_t
748 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
749 {
750 	return dev->addr.dev;
751 }
752 
753 uint8_t
754 spdk_pci_device_get_func(struct spdk_pci_device *dev)
755 {
756 	return dev->addr.func;
757 }
758 
759 uint16_t
760 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
761 {
762 	return dev->id.vendor_id;
763 }
764 
765 uint16_t
766 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
767 {
768 	return dev->id.device_id;
769 }
770 
771 uint16_t
772 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
773 {
774 	return dev->id.subvendor_id;
775 }
776 
777 uint16_t
778 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
779 {
780 	return dev->id.subdevice_id;
781 }
782 
783 struct spdk_pci_id
784 spdk_pci_device_get_id(struct spdk_pci_device *dev)
785 {
786 	return dev->id;
787 }
788 
789 int
790 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
791 {
792 	return dev->socket_id;
793 }
794 
795 int
796 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
797 {
798 	return dev->cfg_read(dev, value, len, offset);
799 }
800 
801 int
802 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
803 {
804 	return dev->cfg_write(dev, value, len, offset);
805 }
806 
807 int
808 spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
809 {
810 	return spdk_pci_device_cfg_read(dev, value, 1, offset);
811 }
812 
813 int
814 spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
815 {
816 	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
817 }
818 
819 int
820 spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
821 {
822 	return spdk_pci_device_cfg_read(dev, value, 2, offset);
823 }
824 
825 int
826 spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
827 {
828 	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
829 }
830 
831 int
832 spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
833 {
834 	return spdk_pci_device_cfg_read(dev, value, 4, offset);
835 }
836 
837 int
838 spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
839 {
840 	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
841 }
842 
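/*
 * Read the PCIe Device Serial Number by walking the extended capability
 * list, which starts right after the 256-byte legacy config space, until
 * the Device Serial Number capability (ID 0x03) is found. The 64-bit
 * serial number is printed as 16 hex characters, so the caller must
 * provide a buffer of at least 17 bytes.
 */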
843 int
844 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
845 {
846 	int err;
847 	uint32_t pos, header = 0;
848 	uint32_t i, buf[2];
849 
850 	if (len < 17) {
851 		return -1;
852 	}
853 
854 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
855 	if (err || !header) {
856 		return -1;
857 	}
858 
859 	pos = PCI_CFG_SIZE;
860 	while (1) {
861 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
862 			if (pos) {
863 				/* skip the header */
864 				pos += 4;
865 				for (i = 0; i < 2; i++) {
866 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
867 					if (err) {
868 						return -1;
869 					}
870 				}
871 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
872 				return 0;
873 			}
874 		}
875 		pos = (header >> 20) & 0xffc;
876 		/* a next offset of 0 (or anything below 0x100) means no more items */
877 		if (pos < PCI_CFG_SIZE) {
878 			return -1;
879 		}
880 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
881 		if (err) {
882 			return -1;
883 		}
884 	}
885 	return -1;
886 }
887 
888 struct spdk_pci_addr
889 spdk_pci_device_get_addr(struct spdk_pci_device *dev)
890 {
891 	return dev->addr;
892 }
893 
894 bool
895 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
896 {
897 	return dev->internal.pending_removal;
898 }
899 
900 int
901 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
902 {
903 	if (a1->domain > a2->domain) {
904 		return 1;
905 	} else if (a1->domain < a2->domain) {
906 		return -1;
907 	} else if (a1->bus > a2->bus) {
908 		return 1;
909 	} else if (a1->bus < a2->bus) {
910 		return -1;
911 	} else if (a1->dev > a2->dev) {
912 		return 1;
913 	} else if (a1->dev < a2->dev) {
914 		return -1;
915 	} else if (a1->func > a2->func) {
916 		return 1;
917 	} else if (a1->func < a2->func) {
918 		return -1;
919 	}
920 
921 	return 0;
922 }
923 
924 #ifdef __linux__
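/*
 * Claim a local PCI device by taking an exclusive advisory write lock
 * (fcntl F_SETLK) on /var/tmp/spdk_pci_lock_<domain>:<bus>:<dev>.<func>
 * and storing the claiming PID in that file, so other SPDK processes on
 * the same host can tell which process owns the device.
 */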
925 int
926 spdk_pci_device_claim(struct spdk_pci_device *dev)
927 {
928 	int dev_fd;
929 	char dev_name[64];
930 	int pid;
931 	void *dev_map;
932 	struct flock pcidev_lock = {
933 		.l_type = F_WRLCK,
934 		.l_whence = SEEK_SET,
935 		.l_start = 0,
936 		.l_len = 0,
937 	};
938 
939 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
940 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
941 
942 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
943 	if (dev_fd == -1) {
944 		SPDK_ERRLOG("could not open %s\n", dev_name);
945 		return -errno;
946 	}
947 
948 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
949 		SPDK_ERRLOG("could not truncate %s\n", dev_name);
950 		close(dev_fd);
951 		return -errno;
952 	}
953 
954 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
955 		       MAP_SHARED, dev_fd, 0);
956 	if (dev_map == MAP_FAILED) {
957 		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
958 		close(dev_fd);
959 		return -errno;
960 	}
961 
962 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
963 		pid = *(int *)dev_map;
964 		SPDK_ERRLOG("Cannot create lock on device %s, probably"
965 			    " process %d has claimed it\n", dev_name, pid);
966 		munmap(dev_map, sizeof(int));
967 		close(dev_fd);
968 		/* F_SETLK returns unspecified errnos, normalize them */
969 		return -EACCES;
970 	}
971 
972 	*(int *)dev_map = (int)getpid();
973 	munmap(dev_map, sizeof(int));
974 	dev->internal.claim_fd = dev_fd;
975 	/* Keep dev_fd open to maintain the lock. */
976 	return 0;
977 }
978 
979 void
980 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
981 {
982 	char dev_name[64];
983 
984 	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
985 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
986 
987 	close(dev->internal.claim_fd);
988 	dev->internal.claim_fd = -1;
989 	unlink(dev_name);
990 }
991 #else /* !__linux__ */
992 int
993 spdk_pci_device_claim(struct spdk_pci_device *dev)
994 {
995 	/* TODO */
996 	return 0;
997 }
998 
999 void
1000 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
1001 {
1002 	/* TODO */
1003 }
1004 #endif /* __linux__ */
1005 
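/*
 * Parse a PCI address string into an spdk_pci_addr. All fields are
 * hexadecimal. Accepted formats:
 *   domain:bus:dev.func  or  domain.bus.dev.func  (e.g. "0000:81:00.0")
 *   domain:bus:dev                                (function defaults to 0)
 *   bus:dev.func  or  bus.dev.func                (domain defaults to 0)
 *   bus:dev  or  bus.dev                          (domain and function default to 0)
 */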
1006 int
1007 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
1008 {
1009 	unsigned domain, bus, dev, func;
1010 
1011 	if (addr == NULL || bdf == NULL) {
1012 		return -EINVAL;
1013 	}
1014 
1015 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1016 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1017 		/* Matched a full address - all variables are initialized */
1018 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1019 		func = 0;
1020 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1021 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1022 		domain = 0;
1023 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1024 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1025 		domain = 0;
1026 		func = 0;
1027 	} else {
1028 		return -EINVAL;
1029 	}
1030 
1031 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1032 		return -EINVAL;
1033 	}
1034 
1035 	addr->domain = domain;
1036 	addr->bus = bus;
1037 	addr->dev = dev;
1038 	addr->func = func;
1039 
1040 	return 0;
1041 }
1042 
1043 int
1044 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1045 {
1046 	int rc;
1047 
1048 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1049 		      addr->domain, addr->bus,
1050 		      addr->dev, addr->func);
1051 
1052 	if (rc > 0 && (size_t)rc < sz) {
1053 		return 0;
1054 	}
1055 
1056 	return -1;
1057 }
1058 
1059 void
1060 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1061 {
1062 	assert(dev->map_bar != NULL);
1063 	assert(dev->unmap_bar != NULL);
1064 	assert(dev->cfg_read != NULL);
1065 	assert(dev->cfg_write != NULL);
1066 	dev->internal.driver = drv;
1067 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1068 }
1069 
1070 void
1071 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1072 {
1073 	assert(!dev->internal.attached);
1074 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1075 }
1076 
1077 const char *
1078 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1079 {
1080 	return dev->type;
1081 }
1082 
1083 int
1084 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
1085 {
1086 	struct rte_devargs *da;
1087 	char devargs_str[128];
1088 
1089 	da = calloc(1, sizeof(*da));
1090 	if (da == NULL) {
1091 		SPDK_ERRLOG("could not allocate rte_devargs\n");
1092 		return -ENOMEM;
1093 	}
1094 
1095 	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
1096 		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
1097 	if (rte_devargs_parse(da, devargs_str) != 0) {
1098 		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
1099 		free(da);
1100 		return -EINVAL;
1101 	}
1102 	da->policy = RTE_DEV_ALLOWED;
1103 	/* Note: if a devargs already exists for this device address, it just gets
1104 	 * overridden.  So we do not need to check if the devargs already exists.
1105 	 * DPDK will take care of memory management for the devargs structure after
1106 	 * it has been inserted, so there's nothing SPDK needs to track.
1107 	 */
1108 	if (rte_devargs_insert(&da) != 0) {
1109 		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
1110 		free(da);
1111 		return -EINVAL;
1112 	}
1113 
1114 	return 0;
1115 }
1116