xref: /spdk/lib/env_dpdk/pci.c (revision da60639f86dd88295eb46c2d76f9c327db92d7b3)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "env_internal.h"
35 
36 #include <rte_alarm.h>
37 #include <rte_devargs.h>
38 #include "spdk/env.h"
39 
40 #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
41 
42 #define PCI_CFG_SIZE		256
43 #define PCI_EXT_CAP_ID_SN	0x03
44 
45 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
46  * might cause the internal IPC to misbehave. Just retry in such case.
47  */
48 #define DPDK_HOTPLUG_RETRY_COUNT 4
49 
/* Protects PCI device state that is also touched from the DPDK alarm/interrupt thread */
51 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
52 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
53 /* devices hotplugged on a dpdk thread */
54 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
55 	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
56 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
57 
58 static int
59 map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
60 	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
61 {
62 	struct rte_pci_device *dev = device->dev_handle;
63 
64 	*mapped_addr = dev->mem_resource[bar].addr;
65 	*phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
66 	*size = (uint64_t)dev->mem_resource[bar].len;
67 
68 	return 0;
69 }
70 
/*
 * unmap_bar callback for DPDK-backed devices. map_bar_rte() does not create
 * a mapping of its own, so there is nothing to undo; always returns 0.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	(void)device;
	(void)bar;
	(void)addr;

	return 0;
}
76 
77 static int
78 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
79 {
80 	int rc;
81 
82 	rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
83 
84 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
85 }
86 
87 static int
88 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
89 {
90 	int rc;
91 
92 	rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
93 
94 #ifdef __FreeBSD__
95 	/* DPDK returns 0 on success and -1 on failure */
96 	return rc;
97 #endif
98 	return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
99 }
100 
101 static void
102 remove_rte_dev(struct rte_pci_device *rte_dev)
103 {
104 	char bdf[32];
105 	int i = 0, rc;
106 
107 	snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
108 	do {
109 		rc = rte_eal_hotplug_remove("pci", bdf);
110 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
111 }
112 
/*
 * Alarm callback scheduled by detach_rte(); the argument is the
 * rte_pci_device to hot-remove.
 */
static void
detach_rte_cb(void *_dev)
{
	struct rte_pci_device *rte_dev = _dev;

	remove_rte_dev(rte_dev);
}
118 
/*
 * Detach a DPDK-backed device.
 *
 * In a non-primary process the device is hot-removed directly. In the primary
 * process the device is first marked as detached and pending removal, then
 * the actual hot-remove is scheduled through a DPDK alarm (detach_rte_cb).
 * This function polls the device's "removed" flag for up to ~2 seconds and
 * finally cancels the alarm. A message is printed to stderr if the flag is
 * still clear afterwards.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	if (!spdk_process_is_primary()) {
		remove_rte_dev(rte_dev);
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* NOTE(review): the return value is not checked; if scheduling fails the
	 * loop below simply runs to its timeout.
	 */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute (2000 iterations of 1000us) */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n",
			rte_dev->name);
		/* If we reach this state, then the device couldn't be removed and most
		 * likely a subsequent hot add of a device in the same BDF will fail.
		 */
	}
}
173 
174 void
175 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
176 {
177 	struct spdk_pci_driver *driver;
178 
179 	driver = calloc(1, sizeof(*driver));
180 	if (!driver) {
181 		/* we can't do any better than bailing atm */
182 		return;
183 	}
184 
185 	driver->name = name;
186 	driver->id_table = id_table;
187 	driver->drv_flags = flags;
188 	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
189 }
190 
/* Convenience lookup of the driver registered under the name "nvme". */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	struct spdk_pci_driver *nvme_drv;

	nvme_drv = spdk_pci_get_driver("nvme");
	return nvme_drv;
}
196 
197 struct spdk_pci_driver *
198 spdk_pci_get_driver(const char *name)
199 {
200 	struct spdk_pci_driver *driver;
201 
202 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
203 		if (strcmp(driver->name, name) == 0) {
204 			return driver;
205 		}
206 	}
207 
208 	return NULL;
209 }
210 
/*
 * Device event callback registered with DPDK in _pci_env_init().
 *
 * Only RTE_DEV_EVENT_REMOVE is acted upon: the matching device (by name) that
 * is not already pending removal is marked pending_removal, and if it is not
 * currently attached it is hot-removed right away. All other events are
 * ignored. cb_arg is unused.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		/* dev is NULL after the loop when no entry matched */
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			if (strcmp(rte_dev->name, device_name) == 0 &&
			    !dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (dev != NULL && can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 */
			remove_rte_dev(dev->dev_handle);
		}
		break;
	}
}
247 
248 static void
249 cleanup_pci_devices(void)
250 {
251 	struct spdk_pci_device *dev, *tmp;
252 
253 	pthread_mutex_lock(&g_pci_mutex);
254 	/* cleanup removed devices */
255 	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
256 		if (!dev->internal.removed) {
257 			continue;
258 		}
259 
260 		vtophys_pci_device_removed(dev->dev_handle);
261 		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
262 		free(dev);
263 	}
264 
265 	/* add newly-attached devices */
266 	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
267 		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
268 		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
269 		vtophys_pci_device_added(dev->dev_handle);
270 	}
271 	pthread_mutex_unlock(&g_pci_mutex);
272 }
273 
274 static int scan_pci_bus(bool delay_init);
275 
276 /* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */
277 static int
278 register_rte_driver(struct spdk_pci_driver *driver)
279 {
280 	unsigned pci_id_count = 0;
281 	struct rte_pci_id *rte_id_table;
282 	char *rte_name;
283 	size_t rte_name_len;
284 	uint32_t rte_flags;
285 
286 	assert(driver->id_table);
287 	while (driver->id_table[pci_id_count].vendor_id) {
288 		pci_id_count++;
289 	}
290 	assert(pci_id_count > 0);
291 
292 	rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
293 	if (!rte_id_table) {
294 		return -ENOMEM;
295 	}
296 
297 	while (pci_id_count > 0) {
298 		struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
299 		const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
300 
301 		rte_id->class_id = spdk_id->class_id;
302 		rte_id->vendor_id = spdk_id->vendor_id;
303 		rte_id->device_id = spdk_id->device_id;
304 		rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
305 		rte_id->subsystem_device_id = spdk_id->subdevice_id;
306 		pci_id_count--;
307 	}
308 
309 	assert(driver->name);
310 	rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
311 	rte_name = calloc(rte_name_len, 1);
312 	if (!rte_name) {
313 		free(rte_id_table);
314 		return -ENOMEM;
315 	}
316 
317 	snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
318 	driver->driver.driver.name = rte_name;
319 	driver->driver.id_table = rte_id_table;
320 
321 	rte_flags = 0;
322 	if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
323 		rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
324 	}
325 	if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
326 		rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
327 	}
328 	driver->driver.drv_flags = rte_flags;
329 
330 	driver->driver.probe = pci_device_init;
331 	driver->driver.remove = pci_device_fini;
332 
333 	rte_pci_register(&driver->driver);
334 	return 0;
335 }
336 
337 static inline void
338 _pci_env_init(void)
339 {
340 	/* We assume devices were present on the bus for more than 2 seconds
341 	 * before initializing SPDK and there's no need to wait more. We scan
342 	 * the bus, but we don't blacklist any devices.
343 	 */
344 	scan_pci_bus(false);
345 
346 	/* Register a single hotremove callback for all devices. */
347 	if (spdk_process_is_primary()) {
348 		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
349 	}
350 }
351 
352 void
353 pci_env_init(void)
354 {
355 	struct spdk_pci_driver *driver;
356 
357 	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
358 		register_rte_driver(driver);
359 	}
360 
361 	_pci_env_init();
362 }
363 
/*
 * Re-run the common PCI environment initialization. Drivers are not
 * registered again here; pci_env_init() has already done that.
 */
void
pci_env_reinit(void)
{
	_pci_env_init();
}
373 
374 void
375 pci_env_fini(void)
376 {
377 	struct spdk_pci_device *dev;
378 	char bdf[32];
379 
380 	cleanup_pci_devices();
381 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
382 		if (dev->internal.attached) {
383 			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
384 			fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf);
385 		}
386 	}
387 
388 	if (spdk_process_is_primary()) {
389 		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
390 	}
391 }
392 
393 int
394 pci_device_init(struct rte_pci_driver *_drv,
395 		struct rte_pci_device *_dev)
396 {
397 	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
398 	struct spdk_pci_device *dev;
399 	int rc;
400 
401 	dev = calloc(1, sizeof(*dev));
402 	if (dev == NULL) {
403 		return -1;
404 	}
405 
406 	dev->dev_handle = _dev;
407 
408 	dev->addr.domain = _dev->addr.domain;
409 	dev->addr.bus = _dev->addr.bus;
410 	dev->addr.dev = _dev->addr.devid;
411 	dev->addr.func = _dev->addr.function;
412 	dev->id.class_id = _dev->id.class_id;
413 	dev->id.vendor_id = _dev->id.vendor_id;
414 	dev->id.device_id = _dev->id.device_id;
415 	dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
416 	dev->id.subdevice_id = _dev->id.subsystem_device_id;
417 	dev->socket_id = _dev->device.numa_node;
418 	dev->type = "pci";
419 
420 	dev->map_bar = map_bar_rte;
421 	dev->unmap_bar = unmap_bar_rte;
422 	dev->cfg_read = cfg_read_rte;
423 	dev->cfg_write = cfg_write_rte;
424 
425 	dev->internal.driver = driver;
426 	dev->internal.claim_fd = -1;
427 
428 	if (driver->cb_fn != NULL) {
429 		rc = driver->cb_fn(driver->cb_arg, dev);
430 		if (rc != 0) {
431 			free(dev);
432 			return rc;
433 		}
434 		dev->internal.attached = true;
435 	}
436 
437 	pthread_mutex_lock(&g_pci_mutex);
438 	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
439 	pthread_mutex_unlock(&g_pci_mutex);
440 	return 0;
441 }
442 
443 int
444 pci_device_fini(struct rte_pci_device *_dev)
445 {
446 	struct spdk_pci_device *dev;
447 
448 	pthread_mutex_lock(&g_pci_mutex);
449 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
450 		if (dev->dev_handle == _dev) {
451 			break;
452 		}
453 	}
454 
455 	if (dev == NULL || dev->internal.attached) {
456 		/* The device might be still referenced somewhere in SPDK. */
457 		pthread_mutex_unlock(&g_pci_mutex);
458 		return -1;
459 	}
460 
461 	/* remove our whitelist_at option */
462 	if (_dev->device.devargs) {
463 		_dev->device.devargs->data = NULL;
464 	}
465 
466 	assert(!dev->internal.removed);
467 	dev->internal.removed = true;
468 	pthread_mutex_unlock(&g_pci_mutex);
469 	return 0;
470 
471 }
472 
473 void
474 spdk_pci_device_detach(struct spdk_pci_device *dev)
475 {
476 	assert(dev->internal.attached);
477 
478 	if (dev->internal.claim_fd >= 0) {
479 		spdk_pci_device_unclaim(dev);
480 	}
481 
482 	if (strcmp(dev->type, "pci") == 0) {
483 		/* if it's a physical device we need to deal with DPDK on
484 		 * a different process and we can't just unset one flag
485 		 * here. We also want to stop using any device resources
486 		 * so that the device isn't "in use" by the userspace driver
487 		 * once we detach it. This would allow attaching the device
488 		 * to a different process, or to a kernel driver like nvme.
489 		 */
490 		detach_rte(dev);
491 	} else {
492 		dev->internal.attached = false;
493 	}
494 
495 	cleanup_pci_devices();
496 }
497 
498 static int
499 scan_pci_bus(bool delay_init)
500 {
501 	struct spdk_pci_driver *driver;
502 	struct rte_pci_device *rte_dev;
503 	uint64_t now;
504 
505 	rte_bus_scan();
506 	now = spdk_get_ticks();
507 
508 	driver = TAILQ_FIRST(&g_pci_drivers);
509 	if (!driver) {
510 		return 0;
511 	}
512 
513 	TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
514 		struct rte_devargs *da;
515 
516 		da = rte_dev->device.devargs;
517 		if (!da) {
518 			char devargs_str[128];
519 
520 			/* the device was never blacklisted or whitelisted */
521 			da = calloc(1, sizeof(*da));
522 			if (!da) {
523 				return -1;
524 			}
525 
526 			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
527 			if (rte_devargs_parse(da, devargs_str) != 0) {
528 				free(da);
529 				return -1;
530 			}
531 
532 			rte_devargs_insert(&da);
533 			rte_dev->device.devargs = da;
534 		}
535 
536 		if (da->data) {
537 			uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data;
538 
539 			/* this device was seen by spdk before... */
540 			if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) {
541 				da->policy = RTE_DEV_WHITELISTED;
542 			}
543 		} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST &&
544 			    da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) {
545 			/* override the policy only if not permanently blacklisted */
546 
547 			if (delay_init) {
548 				da->policy = RTE_DEV_BLACKLISTED;
549 				da->data = (void *)(now + 2 * spdk_get_ticks_hz());
550 			} else {
551 				da->policy = RTE_DEV_WHITELISTED;
552 				da->data = (void *)(uintptr_t)now;
553 			}
554 		}
555 	}
556 
557 	return 0;
558 }
559 
560 int
561 spdk_pci_device_attach(struct spdk_pci_driver *driver,
562 		       spdk_pci_enum_cb enum_cb,
563 		       void *enum_ctx, struct spdk_pci_addr *pci_address)
564 {
565 	struct spdk_pci_device *dev;
566 	struct rte_pci_device *rte_dev;
567 	struct rte_devargs *da;
568 	int rc;
569 	char bdf[32];
570 
571 	spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
572 
573 	cleanup_pci_devices();
574 
575 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
576 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
577 			break;
578 		}
579 	}
580 
581 	if (dev != NULL && dev->internal.driver == driver) {
582 		pthread_mutex_lock(&g_pci_mutex);
583 		if (dev->internal.attached || dev->internal.pending_removal) {
584 			pthread_mutex_unlock(&g_pci_mutex);
585 			return -1;
586 		}
587 
588 		rc = enum_cb(enum_ctx, dev);
589 		if (rc == 0) {
590 			dev->internal.attached = true;
591 		}
592 		pthread_mutex_unlock(&g_pci_mutex);
593 		return rc;
594 	}
595 
596 	driver->cb_fn = enum_cb;
597 	driver->cb_arg = enum_ctx;
598 
599 	int i = 0;
600 
601 	do {
602 		rc = rte_eal_hotplug_add("pci", bdf, "");
603 	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
604 
605 	if (i > 1 && rc == -EEXIST) {
606 		/* Even though the previous request timed out, the device
607 		 * was attached successfully.
608 		 */
609 		rc = 0;
610 	}
611 
612 	driver->cb_arg = NULL;
613 	driver->cb_fn = NULL;
614 
615 	cleanup_pci_devices();
616 
617 	if (rc != 0) {
618 		return -1;
619 	}
620 
621 	/* explicit attach ignores the whitelist, so if we blacklisted this
622 	 * device before let's enable it now - just for clarity.
623 	 */
624 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
625 		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
626 			break;
627 		}
628 	}
629 	assert(dev != NULL);
630 
631 	rte_dev = dev->dev_handle;
632 	da = rte_dev->device.devargs;
633 	if (da && da->data) {
634 		da->data = (void *)(uintptr_t)spdk_get_ticks();
635 		da->policy = RTE_DEV_WHITELISTED;
636 	}
637 
638 	return 0;
639 }
640 
641 /* Note: You can call spdk_pci_enumerate from more than one thread
642  *       simultaneously safely, but you cannot call spdk_pci_enumerate
643  *       and rte_eal_pci_probe simultaneously.
644  */
645 int
646 spdk_pci_enumerate(struct spdk_pci_driver *driver,
647 		   spdk_pci_enum_cb enum_cb,
648 		   void *enum_ctx)
649 {
650 	struct spdk_pci_device *dev;
651 	int rc;
652 
653 	cleanup_pci_devices();
654 
655 	pthread_mutex_lock(&g_pci_mutex);
656 	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
657 		if (dev->internal.attached ||
658 		    dev->internal.driver != driver ||
659 		    dev->internal.pending_removal) {
660 			continue;
661 		}
662 
663 		rc = enum_cb(enum_ctx, dev);
664 		if (rc == 0) {
665 			dev->internal.attached = true;
666 		} else if (rc < 0) {
667 			pthread_mutex_unlock(&g_pci_mutex);
668 			return -1;
669 		}
670 	}
671 	pthread_mutex_unlock(&g_pci_mutex);
672 
673 	if (scan_pci_bus(true) != 0) {
674 		return -1;
675 	}
676 
677 	driver->cb_fn = enum_cb;
678 	driver->cb_arg = enum_ctx;
679 
680 	if (rte_bus_probe() != 0) {
681 		driver->cb_arg = NULL;
682 		driver->cb_fn = NULL;
683 		return -1;
684 	}
685 
686 	driver->cb_arg = NULL;
687 	driver->cb_fn = NULL;
688 
689 	cleanup_pci_devices();
690 	return 0;
691 }
692 
693 struct spdk_pci_device *
694 spdk_pci_get_first_device(void)
695 {
696 	return TAILQ_FIRST(&g_pci_devices);
697 }
698 
699 struct spdk_pci_device *
700 spdk_pci_get_next_device(struct spdk_pci_device *prev)
701 {
702 	return TAILQ_NEXT(prev, internal.tailq);
703 }
704 
705 int
706 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
707 			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
708 {
709 	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
710 }
711 
712 int
713 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
714 {
715 	return dev->unmap_bar(dev, bar, addr);
716 }
717 
718 uint32_t
719 spdk_pci_device_get_domain(struct spdk_pci_device *dev)
720 {
721 	return dev->addr.domain;
722 }
723 
724 uint8_t
725 spdk_pci_device_get_bus(struct spdk_pci_device *dev)
726 {
727 	return dev->addr.bus;
728 }
729 
730 uint8_t
731 spdk_pci_device_get_dev(struct spdk_pci_device *dev)
732 {
733 	return dev->addr.dev;
734 }
735 
736 uint8_t
737 spdk_pci_device_get_func(struct spdk_pci_device *dev)
738 {
739 	return dev->addr.func;
740 }
741 
742 uint16_t
743 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
744 {
745 	return dev->id.vendor_id;
746 }
747 
748 uint16_t
749 spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
750 {
751 	return dev->id.device_id;
752 }
753 
754 uint16_t
755 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
756 {
757 	return dev->id.subvendor_id;
758 }
759 
760 uint16_t
761 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
762 {
763 	return dev->id.subdevice_id;
764 }
765 
/* Return a copy (by value) of the device's PCI id structure. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}
771 
772 int
773 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
774 {
775 	return dev->socket_id;
776 }
777 
778 int
779 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
780 {
781 	return dev->cfg_read(dev, value, len, offset);
782 }
783 
784 int
785 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
786 {
787 	return dev->cfg_write(dev, value, len, offset);
788 }
789 
/* Read one byte of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	const uint32_t width = sizeof(*value);

	return spdk_pci_device_cfg_read(dev, value, width, offset);
}
795 
/* Write one byte (`value`) to config space at `offset`. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	const uint32_t width = sizeof(value);

	return spdk_pci_device_cfg_write(dev, &value, width, offset);
}
801 
/* Read two bytes of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	const uint32_t width = sizeof(*value);

	return spdk_pci_device_cfg_read(dev, value, width, offset);
}
807 
/* Write two bytes (`value`) to config space at `offset`. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	const uint32_t width = sizeof(value);

	return spdk_pci_device_cfg_write(dev, &value, width, offset);
}
813 
/* Read four bytes of config space at `offset` into *value. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	const uint32_t width = sizeof(*value);

	return spdk_pci_device_cfg_read(dev, value, width, offset);
}
819 
/* Write four bytes (`value`) to config space at `offset`. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	const uint32_t width = sizeof(value);

	return spdk_pci_device_cfg_write(dev, &value, width, offset);
}
825 
826 int
827 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
828 {
829 	int err;
830 	uint32_t pos, header = 0;
831 	uint32_t i, buf[2];
832 
833 	if (len < 17) {
834 		return -1;
835 	}
836 
837 	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
838 	if (err || !header) {
839 		return -1;
840 	}
841 
842 	pos = PCI_CFG_SIZE;
843 	while (1) {
844 		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
845 			if (pos) {
846 				/* skip the header */
847 				pos += 4;
848 				for (i = 0; i < 2; i++) {
849 					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
850 					if (err) {
851 						return -1;
852 					}
853 				}
854 				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
855 				return 0;
856 			}
857 		}
858 		pos = (header >> 20) & 0xffc;
859 		/* 0 if no other items exist */
860 		if (pos < PCI_CFG_SIZE) {
861 			return -1;
862 		}
863 		err = spdk_pci_device_cfg_read32(dev, &header, pos);
864 		if (err) {
865 			return -1;
866 		}
867 	}
868 	return -1;
869 }
870 
/* Return a copy (by value) of the device's PCI address. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}
876 
877 bool
878 spdk_pci_device_is_removed(struct spdk_pci_device *dev)
879 {
880 	return dev->internal.pending_removal;
881 }
882 
883 int
884 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
885 {
886 	if (a1->domain > a2->domain) {
887 		return 1;
888 	} else if (a1->domain < a2->domain) {
889 		return -1;
890 	} else if (a1->bus > a2->bus) {
891 		return 1;
892 	} else if (a1->bus < a2->bus) {
893 		return -1;
894 	} else if (a1->dev > a2->dev) {
895 		return 1;
896 	} else if (a1->dev < a2->dev) {
897 		return -1;
898 	} else if (a1->func > a2->func) {
899 		return 1;
900 	} else if (a1->func < a2->func) {
901 		return -1;
902 	}
903 
904 	return 0;
905 }
906 
907 #ifdef __linux__
908 int
909 spdk_pci_device_claim(struct spdk_pci_device *dev)
910 {
911 	int dev_fd;
912 	char dev_name[64];
913 	int pid;
914 	void *dev_map;
915 	struct flock pcidev_lock = {
916 		.l_type = F_WRLCK,
917 		.l_whence = SEEK_SET,
918 		.l_start = 0,
919 		.l_len = 0,
920 	};
921 
922 	snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
923 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
924 
925 	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
926 	if (dev_fd == -1) {
927 		fprintf(stderr, "could not open %s\n", dev_name);
928 		return -errno;
929 	}
930 
931 	if (ftruncate(dev_fd, sizeof(int)) != 0) {
932 		fprintf(stderr, "could not truncate %s\n", dev_name);
933 		close(dev_fd);
934 		return -errno;
935 	}
936 
937 	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
938 		       MAP_SHARED, dev_fd, 0);
939 	if (dev_map == MAP_FAILED) {
940 		fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno);
941 		close(dev_fd);
942 		return -errno;
943 	}
944 
945 	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
946 		pid = *(int *)dev_map;
947 		fprintf(stderr, "Cannot create lock on device %s, probably"
948 			" process %d has claimed it\n", dev_name, pid);
949 		munmap(dev_map, sizeof(int));
950 		close(dev_fd);
951 		/* F_SETLK returns unspecified errnos, normalize them */
952 		return -EACCES;
953 	}
954 
955 	*(int *)dev_map = (int)getpid();
956 	munmap(dev_map, sizeof(int));
957 	dev->internal.claim_fd = dev_fd;
958 	/* Keep dev_fd open to maintain the lock. */
959 	return 0;
960 }
961 
962 void
963 spdk_pci_device_unclaim(struct spdk_pci_device *dev)
964 {
965 	char dev_name[64];
966 
967 	snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
968 		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
969 
970 	close(dev->internal.claim_fd);
971 	dev->internal.claim_fd = -1;
972 	unlink(dev_name);
973 }
974 #endif /* __linux__ */
975 
976 #ifdef __FreeBSD__
/* FreeBSD: claiming is not implemented yet; always reports success. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;

	return 0;
}
983 
/* FreeBSD: unclaiming is not implemented yet; does nothing. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
	(void)dev;
}
989 #endif /* __FreeBSD__ */
990 
991 int
992 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
993 {
994 	unsigned domain, bus, dev, func;
995 
996 	if (addr == NULL || bdf == NULL) {
997 		return -EINVAL;
998 	}
999 
1000 	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
1001 	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
1002 		/* Matched a full address - all variables are initialized */
1003 	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
1004 		func = 0;
1005 	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
1006 		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
1007 		domain = 0;
1008 	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
1009 		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
1010 		domain = 0;
1011 		func = 0;
1012 	} else {
1013 		return -EINVAL;
1014 	}
1015 
1016 	if (bus > 0xFF || dev > 0x1F || func > 7) {
1017 		return -EINVAL;
1018 	}
1019 
1020 	addr->domain = domain;
1021 	addr->bus = bus;
1022 	addr->dev = dev;
1023 	addr->func = func;
1024 
1025 	return 0;
1026 }
1027 
1028 int
1029 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1030 {
1031 	int rc;
1032 
1033 	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1034 		      addr->domain, addr->bus,
1035 		      addr->dev, addr->func);
1036 
1037 	if (rc > 0 && (size_t)rc < sz) {
1038 		return 0;
1039 	}
1040 
1041 	return -1;
1042 }
1043 
1044 void
1045 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
1046 {
1047 	assert(dev->map_bar != NULL);
1048 	assert(dev->unmap_bar != NULL);
1049 	assert(dev->cfg_read != NULL);
1050 	assert(dev->cfg_write != NULL);
1051 	dev->internal.driver = drv;
1052 	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
1053 }
1054 
1055 void
1056 spdk_pci_unhook_device(struct spdk_pci_device *dev)
1057 {
1058 	assert(!dev->internal.attached);
1059 	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
1060 }
1061 
1062 const char *
1063 spdk_pci_device_get_type(const struct spdk_pci_device *dev)
1064 {
1065 	return dev->type;
1066 }
1067