xref: /spdk/lib/env_dpdk/pci.c (revision b02581a89058ebaebe03bd0e16e3b58adfe406c1)
1  /*   SPDX-License-Identifier: BSD-3-Clause
2   *   Copyright (C) 2015 Intel Corporation.
3   *   All rights reserved.
4   */
5  
6  #include "env_internal.h"
7  #include "pci_dpdk.h"
8  
9  #include <rte_alarm.h>
10  #include <rte_devargs.h>
11  #include <rte_pci.h>
12  #include "spdk/env.h"
13  #include "spdk/log.h"
14  #include "spdk/string.h"
15  #include "spdk/memory.h"
16  
17  #define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"
18  
19  #define PCI_CFG_SIZE		256
20  #define PCI_EXT_CAP_ID_SN	0x03
21  
22  /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
23   * might cause the internal IPC to misbehave. Just retry in such case.
24   */
25  #define DPDK_HOTPLUG_RETRY_COUNT 4
26  
27  /* DPDK alarm/interrupt thread */
28  static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
29  static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
30  /* devices hotplugged on a dpdk thread */
31  static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
32  	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
33  static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
34  static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
35  	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);
36  
37  int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
38  int pci_device_fini(struct rte_pci_device *device);
39  
/* Tracks, per device, the tick count at which SPDK is allowed to probe it.
 * Used by scan_pci_bus() to delay-init newly hotplugged devices.
 */
struct env_devargs {
	struct rte_bus	*bus;		/* bus the devargs entry belongs to */
	char		name[128];	/* device name (e.g. PCI BDF string) */
	uint64_t	allowed_at;	/* tick count at which probing is allowed */
	TAILQ_ENTRY(env_devargs) link;
};
static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);
47  
48  static struct env_devargs *
49  find_env_devargs(struct rte_bus *bus, const char *name)
50  {
51  	struct env_devargs *da;
52  
53  	TAILQ_FOREACH(da, &g_env_devargs, link) {
54  		if (bus == da->bus && !strcmp(name, da->name)) {
55  			return da;
56  		}
57  	}
58  
59  	return NULL;
60  }
61  
/* map_bar callback for DPDK-managed devices: the BAR is already mapped by
 * DPDK, so just report its address, physical address and length.
 */
static int
map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	struct rte_mem_resource *res;

	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
	*mapped_addr = res->addr;
	*phys_addr = (uint64_t)res->phys_addr;
	*size = (uint64_t)res->len;

	return 0;
}
75  
/* unmap_bar callback for DPDK-managed devices: DPDK owns the mapping and
 * tears it down itself, so there is nothing to do here.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}
81  
/* cfg_read callback: read `len` bytes of PCI config space at `offset`. */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}
87  
/* cfg_write callback: write `len` bytes of PCI config space at `offset`. */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}
93  
94  static void
95  remove_rte_dev(struct rte_pci_device *rte_dev)
96  {
97  	char bdf[32];
98  	int i = 0, rc;
99  
100  	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
101  	do {
102  		rc = rte_eal_hotplug_remove("pci", bdf);
103  	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
104  }
105  
/* rte_eal_alarm callback wrapper - runs remove_rte_dev() on the DPDK
 * alarm/interrupt thread.
 */
static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}
111  
112  /* if it's a physical device we need to deal with DPDK on
113   * a different process and we can't just unset one flag
114   * here. We also want to stop using any device resources
115   * so that the device isn't "in use" by the userspace driver
116   * once we detach it. This would allow attaching the device
117   * to a different process, or to a kernel driver like nvme.
118   */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process owns the DPDK hotplug state; secondary
	 * processes must not issue the removal.
	 */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Defer the actual rte_eal_hotplug_remove() to the DPDK alarm thread. */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}
172  
173  void
174  spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
175  {
176  	struct spdk_pci_driver *driver;
177  
178  	driver = calloc(1, sizeof(*driver));
179  	if (!driver) {
180  		/* we can't do any better than bailing atm */
181  		return;
182  	}
183  
184  	driver->name = name;
185  	driver->id_table = id_table;
186  	driver->drv_flags = flags;
187  	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
188  	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
189  }
190  
/* Convenience accessor for the built-in "nvme" PCI driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}
196  
197  struct spdk_pci_driver *
198  spdk_pci_get_driver(const char *name)
199  {
200  	struct spdk_pci_driver *driver;
201  
202  	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
203  		if (strcmp(driver->name, name) == 0) {
204  			return driver;
205  		}
206  	}
207  
208  	return NULL;
209  }
210  
/* DPDK device-event callback (runs on the DPDK interrupt thread).
 * On REMOVE, marks the matching SPDK device as pending removal and, if it is
 * not attached to any SPDK driver, schedules its actual hot-remove.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			/* Match devices by their DPDK name (the BDF string). */
			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name)) {
				continue;
			}

			/* Note: these ERRLOGs are useful for triaging issue #2983. */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			if (!dev->internal.pending_removal) {
				/* Only detach immediately if no SPDK driver holds it. */
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			assert(dev != NULL);
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}
267  
/* Reconcile the global device list: free devices that DPDK finished removing
 * and promote devices hotplugged on the DPDK thread into the main list.
 */
static void
cleanup_pci_devices(void)
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	/* cleanup removed devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		if (!dev->internal.removed) {
			continue;
		}

		/* Drop vtophys translations for the device's BARs before freeing. */
		vtophys_pci_device_removed(dev->dev_handle);
		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
		free(dev);
	}

	/* add newly-attached devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
		vtophys_pci_device_added(dev->dev_handle);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
293  
294  static int scan_pci_bus(bool delay_init);
295  
/* Common part of pci_env_init()/pci_env_reinit(): scan the bus and register
 * the hotremove notification callback (primary process only).
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}
310  
311  int
312  pci_env_init(void)
313  {
314  	struct spdk_pci_driver *driver;
315  	int rc;
316  
317  	rc = dpdk_pci_init();
318  	if (rc) {
319  		return rc;
320  	}
321  
322  	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
323  		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
324  	}
325  
326  	_pci_env_init();
327  	return 0;
328  }
329  
/* Re-initialize the PCI subsystem after an env re-init (e.g. reset). */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}
339  
340  void
341  pci_env_fini(void)
342  {
343  	struct spdk_pci_device *dev;
344  	char bdf[32];
345  
346  	cleanup_pci_devices();
347  	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
348  		if (dev->internal.attached) {
349  			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
350  			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
351  		}
352  	}
353  
354  	if (spdk_process_is_primary()) {
355  		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
356  	}
357  }
358  
/* DPDK probe callback: wrap a freshly-probed rte_pci_device in an
 * spdk_pci_device, invoke the enumerate callback (if any), and queue the
 * device on the hotplugged list for cleanup_pci_devices() to pick up.
 * Returns 0 on success, -1 on allocation failure, or the callback's error.
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	/* Copy address and ID out of the DPDK structures. */
	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->socket_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	/* Route BAR mapping and config access through the DPDK backend. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	/* cb_fn is set while spdk_pci_device_attach()/enumerate() is active. */
	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}
414  
415  static void
416  set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
417  {
418  	struct env_devargs *env_da;
419  
420  	env_da = find_env_devargs(rte_da->bus, rte_da->name);
421  	if (env_da == NULL) {
422  		env_da = calloc(1, sizeof(*env_da));
423  		if (env_da == NULL) {
424  			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
425  			return;
426  		}
427  		env_da->bus = rte_da->bus;
428  		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
429  		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
430  	}
431  
432  	env_da->allowed_at = tsc;
433  }
434  
435  static uint64_t
436  get_allowed_at(struct rte_devargs *rte_da)
437  {
438  	struct env_devargs *env_da;
439  
440  	env_da = find_env_devargs(rte_da->bus, rte_da->name);
441  	if (env_da) {
442  		return env_da->allowed_at;
443  	} else {
444  		return 0;
445  	}
446  }
447  
/* DPDK remove callback: mark the matching spdk_pci_device as removed so that
 * cleanup_pci_devices() can free it later. Returns -EBUSY if the device is
 * unknown or still attached to an SPDK driver.
 */
int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	/* dev is NULL here if the loop above found no match. */
	if (dev == NULL || dev->internal.attached) {
		/* The device might be still referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (dpdk_pci_device_get_devargs(_dev)) {
		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
	}

	/* It is possible that removed flag was already set when there is a race
	 * between the remove notification for this process, and another process
	 * that is also detaching from this same device (for example, when using
	 * nvme driver in multi-process mode.  So do not assert here.  See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;

}
482  
/* Detach a previously attached device: release any claim, dispatch to the
 * provider ("pci", "vmd", ...) that owns it, then flush the device lists.
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Find the provider that created this device by its type string. */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}
506  
/* Scan the PCI bus and manage per-device probe policy. Devices that have
 * never been seen get a devargs entry. With delay_init set, newly-seen
 * devices are temporarily BLOCKED and only allowed ~2 seconds later (on a
 * subsequent scan) to avoid probing devices mid-initialization.
 * Returns 0 on success, -1 on allocation/parse failure.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	/* Without any registered drivers there is nothing to police. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			/* DPDK takes ownership of da after insertion. */
			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before... */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				/* Block now, allow ~2 seconds from now. */
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}
567  
/* attach_cb for the "pci" provider: hot-add the device through DPDK,
 * retrying on transient IPC failures (-ENOMSG). Returns 0 on success or a
 * negative DPDK error code.
 */
static int
pci_attach_rte(const struct spdk_pci_addr *addr)
{
	char bdf[32];
	int rc, i = 0;

	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);

	do {
		rc = rte_eal_hotplug_add("pci", bdf, "");
	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);

	/* i > 1 means at least one retry happened. */
	if (i > 1 && rc == -EEXIST) {
		/* Even though the previous request timed out, the device
		 * was attached successfully.
		 */
		rc = 0;
	}

	return rc;
}
589  
/* Default device provider backed by DPDK's PCI bus ("pci" type). Additional
 * providers (e.g. VMD) register themselves the same way.
 */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);
597  
/* Attach a single device at `pci_address` to `driver`, invoking `enum_cb`
 * exactly once for it. If the device is already known, the callback runs
 * directly; otherwise each provider is asked to hot-add it and the callback
 * runs from the provider's probe path. Returns 0 on success, -1 on failure.
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	/* Device already known to this driver - just run the callback. */
	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* cb_fn/cb_arg are picked up by pci_device_init() during probe. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}
673  
674  /* Note: You can call spdk_pci_enumerate from more than one thread
675   *       simultaneously safely, but you cannot call spdk_pci_enumerate
676   *       and rte_eal_pci_probe simultaneously.
677   */
/* Enumerate all unattached devices matching `driver`, invoking `enum_cb` for
 * each. A callback return of 0 attaches the device, >0 skips it, <0 aborts
 * the enumeration. Afterwards the bus is rescanned and probed so newly
 * plugged devices get the callback too. Returns 0 on success, -1 on error.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	/* delay_init=true: newly seen devices get probed on a later scan. */
	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	/* cb_fn/cb_arg are picked up by pci_device_init() during probe. */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}
725  
/* Invoke `fn` on every known PCI device while holding the device-list lock.
 * The SAFE iterator lets `fn` remove the current device from the list.
 */
void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}
737  
/* Map a BAR and, when VFIO with the IOMMU is active, also program a DMA
 * mapping for it so the BAR is reachable via the IOVA scheme DPDK uses
 * (virtual addresses in VA mode, physical addresses otherwise).
 * Returns 0 on success or a negative errno.
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		/* Report the IOVA as the "physical" address. */
		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}
775  
/* Undo spdk_pci_device_map_bar(): drop the IOMMU DMA mapping (if one was
 * created) and then unmap the BAR itself.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}
792  
/* Enable the device's interrupt through the DPDK backend. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}
798  
/* Disable the device's interrupt through the DPDK backend. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}
804  
/* Return the eventfd associated with the device's interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}
810  
/* PCI domain (segment) number of the device. */
uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}
816  
/* PCI bus number of the device. */
uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}
822  
/* PCI device (slot) number of the device. */
uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}
828  
/* PCI function number of the device. */
uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}
834  
/* PCI vendor ID of the device. */
uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}
840  
/* PCI device ID of the device. */
uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}
846  
/* PCI subsystem vendor ID of the device. */
uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}
852  
/* PCI subsystem device ID of the device. */
uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}
858  
/* Full PCI ID structure (class, vendor, device, subsystem IDs) by value. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}
864  
/* NUMA node the device is attached to, as reported at probe time. */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	return dev->socket_id;
}
870  
/* Read `len` bytes from config space at `offset` via the device's backend. */
int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}
876  
/* Write `len` bytes to config space at `offset` via the device's backend. */
int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}
882  
/* Read one byte from PCI config space at `offset`. */
int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}
888  
/* Write one byte to PCI config space at `offset`. */
int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}
894  
/* Read a 16-bit word from PCI config space at `offset`. */
int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}
900  
/* Write a 16-bit word to PCI config space at `offset`. */
int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}
906  
/* Read a 32-bit dword from PCI config space at `offset`. */
int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}
912  
/* Write a 32-bit dword to PCI config space at `offset`. */
int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}
918  
/* Read the device serial number from the PCIe Device Serial Number extended
 * capability and format it as 16 hex characters into `sn`.
 * Returns 0 on success, -1 if `len` is too small, the capability is absent,
 * or a config read fails.
 */
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	/* Need room for 16 hex digits plus the NUL terminator. */
	if (len < 17) {
		return -1;
	}

	/* Extended capabilities start right after legacy config space (0x100). */
	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		/* Low 16 bits of the header hold the capability ID. */
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				/* Serial number is two dwords, most-significant dword second. */
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		/* Top 12 bits of the header hold the next-capability offset. */
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	return -1;
}
963  
/* PCI address (domain:bus:dev.func) of the device, by value. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}
969  
/* True once a hot-remove has been initiated for this device. Note this
 * reports the pending_removal flag, not the final removed flag.
 */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}
975  
976  int
977  spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
978  {
979  	if (a1->domain > a2->domain) {
980  		return 1;
981  	} else if (a1->domain < a2->domain) {
982  		return -1;
983  	} else if (a1->bus > a2->bus) {
984  		return 1;
985  	} else if (a1->bus < a2->bus) {
986  		return -1;
987  	} else if (a1->dev > a2->dev) {
988  		return 1;
989  	} else if (a1->dev < a2->dev) {
990  		return -1;
991  	} else if (a1->func > a2->func) {
992  		return 1;
993  	} else if (a1->func < a2->func) {
994  		return -1;
995  	}
996  
997  	return 0;
998  }
999  
1000  #ifdef __linux__
1001  int
1002  spdk_pci_device_claim(struct spdk_pci_device *dev)
1003  {
1004  	int dev_fd;
1005  	char dev_name[64];
1006  	int pid;
1007  	void *dev_map;
1008  	struct flock pcidev_lock = {
1009  		.l_type = F_WRLCK,
1010  		.l_whence = SEEK_SET,
1011  		.l_start = 0,
1012  		.l_len = 0,
1013  	};
1014  
1015  	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
1016  		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
1017  
1018  	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
1019  	if (dev_fd == -1) {
1020  		SPDK_ERRLOG("could not open %s\n", dev_name);
1021  		return -errno;
1022  	}
1023  
1024  	if (ftruncate(dev_fd, sizeof(int)) != 0) {
1025  		SPDK_ERRLOG("could not truncate %s\n", dev_name);
1026  		close(dev_fd);
1027  		return -errno;
1028  	}
1029  
1030  	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
1031  		       MAP_SHARED, dev_fd, 0);
1032  	if (dev_map == MAP_FAILED) {
1033  		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
1034  		close(dev_fd);
1035  		return -errno;
1036  	}
1037  
1038  	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
1039  		pid = *(int *)dev_map;
1040  		SPDK_ERRLOG("Cannot create lock on device %s, probably"
1041  			    " process %d has claimed it\n", dev_name, pid);
1042  		munmap(dev_map, sizeof(int));
1043  		close(dev_fd);
1044  		/* F_SETLK returns unspecified errnos, normalize them */
1045  		return -EACCES;
1046  	}
1047  
1048  	*(int *)dev_map = (int)getpid();
1049  	munmap(dev_map, sizeof(int));
1050  	dev->internal.claim_fd = dev_fd;
1051  	/* Keep dev_fd open to maintain the lock. */
1052  	return 0;
1053  }
1054  
/* Release a claim taken by spdk_pci_device_claim(): closing the fd drops the
 * advisory lock, and the lock file is unlinked.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
1067  #else /* !__linux__ */
/* Non-Linux stub: claiming is not implemented; always reports success. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}
1074  
/* Non-Linux stub: nothing to release. */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
1080  #endif /* __linux__ */
1081  
/* Parse a BDF string into a PCI address. Accepted forms (most to least
 * specific; missing domain defaults to 0, missing function to 0):
 *   DDDD:BB:DD.F, DDDD.BB.DD.F, DDDD:BB:DD, BB:DD.F, BB.DD.F, BB:DD, BB.DD
 * Note the sscanf cascade order matters: "%x:%x:%x" must be tried before
 * "%x:%x.%x" so "a:b:c" binds the domain, not the function.
 * Returns 0 on success, -EINVAL on bad input or out-of-range fields.
 */
int
spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
{
	unsigned domain, bus, dev, func;

	if (addr == NULL || bdf == NULL) {
		return -EINVAL;
	}

	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
		/* Matched a full address - all variables are initialized */
	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
		func = 0;
	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
		domain = 0;
	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
		domain = 0;
		func = 0;
	} else {
		return -EINVAL;
	}

	/* Enforce PCI field widths: 8-bit bus, 5-bit device, 3-bit function. */
	if (bus > 0xFF || dev > 0x1F || func > 7) {
		return -EINVAL;
	}

	addr->domain = domain;
	addr->bus = bus;
	addr->dev = dev;
	addr->func = func;

	return 0;
}
1118  
1119  int
1120  spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
1121  {
1122  	int rc;
1123  
1124  	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
1125  		      addr->domain, addr->bus,
1126  		      addr->dev, addr->func);
1127  
1128  	if (rc > 0 && (size_t)rc < sz) {
1129  		return 0;
1130  	}
1131  
1132  	return -1;
1133  }
1134  
/* Insert an externally-constructed (non-DPDK) device into the device list.
 * The caller must have populated the BAR-mapping and config-access callbacks.
 * If the driver has an active enumerate callback it is invoked and, on
 * success, the device is marked attached.
 * Returns 0 on success or -ECANCELED if the callback rejected the device.
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}
1159  
/* Remove a device previously added with spdk_pci_hook_device(). The device
 * must already be detached; the caller retains ownership of its memory.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}
1166  
/* Register an additional device provider (e.g. VMD) for attach/detach. */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}
1172  
/* Provider type string of the device, e.g. "pci". */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}
1178  
/* Add (or override) a DPDK devargs entry that marks the given PCI address as
 * allowed for probing. Returns 0 on success, -ENOMEM or -EINVAL on failure.
 */
int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden.  So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}
1212