1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2019 Mellanox Technologies, Ltd
3  */
4 
5 #include <unistd.h>
6 #include <string.h>
7 #include <stdio.h>
8 
9 #include <rte_errno.h>
10 #include <rte_mempool.h>
11 #include <rte_class.h>
12 #include <rte_malloc.h>
13 
14 #include "mlx5_common.h"
15 #include "mlx5_common_os.h"
16 #include "mlx5_common_mp.h"
17 #include "mlx5_common_log.h"
18 #include "mlx5_common_defs.h"
19 #include "mlx5_common_private.h"
20 
21 uint8_t haswell_broadwell_cpu;
22 
23 /* On x86_64 Intel processors, check whether relaxed
24  * ordering should be used.
25  */
26 #ifdef RTE_ARCH_X86_64
27 /**
28  * This function returns processor identification and feature information
29  * into the registers.
30  *
31  * @param level
32  *		The main category of information returned.
33  * @param eax, ebx, ecx, edx
34  *		Pointers to the registers that will hold the CPU information.
35  */
36 static inline void mlx5_cpu_id(unsigned int level,
37 				unsigned int *eax, unsigned int *ebx,
38 				unsigned int *ecx, unsigned int *edx)
39 {
40 	__asm__("cpuid\n\t"
41 		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
42 		: "0" (level));
43 }
44 #endif
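/*
 * Worked example (general CPUID background, not taken from this file):
 * calling mlx5_cpu_id(0, &eax, &ebx, &ecx, &edx) returns the highest
 * supported standard leaf in EAX and the vendor string spread across
 * EBX/EDX/ECX. For "GenuineIntel" EBX holds the bytes 'G','e','n','u',
 * i.e. 0x756e6547 in little-endian order, which is exactly the signature
 * compared by the RTE_INIT_PRIO constructor further below.
 */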
45 
46 RTE_LOG_REGISTER_DEFAULT(mlx5_common_logtype, NOTICE)
47 
48 /* Head of list of drivers. */
49 static TAILQ_HEAD(mlx5_drivers, mlx5_class_driver) drivers_list =
50 				TAILQ_HEAD_INITIALIZER(drivers_list);
51 
52 /* Head of devices. */
53 static TAILQ_HEAD(mlx5_devices, mlx5_common_device) devices_list =
54 				TAILQ_HEAD_INITIALIZER(devices_list);
55 static pthread_mutex_t devices_list_lock;
56 
57 static const struct {
58 	const char *name;
59 	unsigned int drv_class;
60 } mlx5_classes[] = {
61 	{ .name = "vdpa", .drv_class = MLX5_CLASS_VDPA },
62 	{ .name = "eth", .drv_class = MLX5_CLASS_ETH },
63 	/* Keep class "net" for backward compatibility. */
64 	{ .name = "net", .drv_class = MLX5_CLASS_ETH },
65 	{ .name = "regex", .drv_class = MLX5_CLASS_REGEX },
66 	{ .name = "compress", .drv_class = MLX5_CLASS_COMPRESS },
67 	{ .name = "crypto", .drv_class = MLX5_CLASS_CRYPTO },
68 };
69 
70 static int
71 class_name_to_value(const char *class_name)
72 {
73 	unsigned int i;
74 
75 	for (i = 0; i < RTE_DIM(mlx5_classes); i++) {
76 		if (strcmp(class_name, mlx5_classes[i].name) == 0)
77 			return mlx5_classes[i].drv_class;
78 	}
79 	return -EINVAL;
80 }
81 
82 static struct mlx5_class_driver *
83 driver_get(uint32_t class)
84 {
85 	struct mlx5_class_driver *driver;
86 
87 	TAILQ_FOREACH(driver, &drivers_list, next) {
88 		if ((uint32_t)driver->drv_class == class)
89 			return driver;
90 	}
91 	return NULL;
92 }
93 
94 /**
95  * Verify and store value for devargs.
96  *
97  * @param[in] key
98  *   Key argument to verify.
99  * @param[in] val
100  *   Value associated with key.
101  * @param opaque
102  *   User data.
103  *
104  * @return
105  *   0 on success, a negative errno value otherwise and rte_errno is set.
106  */
107 static int
108 mlx5_common_args_check_handler(const char *key, const char *val, void *opaque)
109 {
110 	struct mlx5_common_dev_config *config = opaque;
111 	signed long tmp;
112 
113 	errno = 0;
114 	tmp = strtol(val, NULL, 0);
115 	if (errno) {
116 		rte_errno = errno;
117 		DRV_LOG(WARNING, "%s: \"%s\" is an invalid integer.", key, val);
118 		return -rte_errno;
119 	}
120 	if (strcmp(key, "tx_db_nc") == 0) {
121 		if (tmp != MLX5_TXDB_CACHED &&
122 		    tmp != MLX5_TXDB_NCACHED &&
123 		    tmp != MLX5_TXDB_HEURISTIC) {
124 			DRV_LOG(ERR, "Invalid Tx doorbell mapping parameter.");
125 			rte_errno = EINVAL;
126 			return -rte_errno;
127 		}
128 		config->dbnc = tmp;
129 	} else if (strcmp(key, "mr_ext_memseg_en") == 0) {
130 		config->mr_ext_memseg_en = !!tmp;
131 	} else if (strcmp(key, "mr_mempool_reg_en") == 0) {
132 		config->mr_mempool_reg_en = !!tmp;
133 	} else if (strcmp(key, "sys_mem_en") == 0) {
134 		config->sys_mem_en = !!tmp;
135 	}
136 	return 0;
137 }
138 
139 /**
140  * Parse common device parameters.
141  *
142  * @param devargs
143  *   Device arguments structure.
144  * @param config
145  *   Pointer to device configuration structure.
146  *
147  * @return
148  *   0 on success, a negative errno value otherwise and rte_errno is set.
149  */
150 static int
151 mlx5_common_config_get(struct rte_devargs *devargs,
152 		       struct mlx5_common_dev_config *config)
153 {
154 	struct rte_kvargs *kvlist;
155 	int ret = 0;
156 
157 	/* Set defaults. */
158 	config->mr_ext_memseg_en = 1;
159 	config->mr_mempool_reg_en = 1;
160 	config->sys_mem_en = 0;
161 	config->dbnc = MLX5_ARG_UNSET;
162 	if (devargs == NULL)
163 		return 0;
164 	kvlist = rte_kvargs_parse(devargs->args, NULL);
165 	if (kvlist == NULL) {
166 		rte_errno = EINVAL;
167 		return -rte_errno;
168 	}
169 	ret = rte_kvargs_process(kvlist, NULL, mlx5_common_args_check_handler,
170 				 config);
171 	if (ret)
172 		ret = -rte_errno;
173 	rte_kvargs_free(kvlist);
174 	DRV_LOG(DEBUG, "mr_ext_memseg_en is %u.", config->mr_ext_memseg_en);
175 	DRV_LOG(DEBUG, "mr_mempool_reg_en is %u.", config->mr_mempool_reg_en);
176 	DRV_LOG(DEBUG, "sys_mem_en is %u.", config->sys_mem_en);
177 	DRV_LOG(DEBUG, "Tx doorbell mapping parameter is %d.", config->dbnc);
178 	return ret;
179 }
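/*
 * Usage sketch (illustrative, not part of the original file): the keys
 * handled above arrive as ordinary device arguments on the EAL command
 * line; the PCI address below is a hypothetical placeholder.
 *
 *   dpdk-testpmd -a 0000:03:00.0,mr_ext_memseg_en=1,mr_mempool_reg_en=1,sys_mem_en=0
 *
 * Keys not matched by the handler (e.g. "class") simply fall through it and
 * are interpreted elsewhere, since rte_kvargs_parse() and
 * rte_kvargs_process() are called without a key list.
 */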
180 
181 static int
182 devargs_class_handler(__rte_unused const char *key,
183 		      const char *class_names, void *opaque)
184 {
185 	int *ret = opaque;
186 	int class_val;
187 	char *scratch;
188 	char *found;
189 	char *refstr = NULL;
190 
191 	*ret = 0;
192 	scratch = strdup(class_names);
193 	if (scratch == NULL) {
194 		*ret = -ENOMEM;
195 		return *ret;
196 	}
197 	found = strtok_r(scratch, ":", &refstr);
198 	if (found == NULL)
199 		/* Empty string. */
200 		goto err;
201 	do {
202 		/* Extract each individual class name. Multiple
203 		 * classes can be supplied as class=net:regex:foo:bar.
204 		 */
205 		class_val = class_name_to_value(found);
206 		/* Check if it's a valid class. */
207 		if (class_val < 0) {
208 			*ret = -EINVAL;
209 			goto err;
210 		}
211 		*ret |= class_val;
212 		found = strtok_r(NULL, ":", &refstr);
213 	} while (found != NULL);
214 err:
215 	free(scratch);
216 	if (*ret < 0)
217 		DRV_LOG(ERR, "Invalid mlx5 class options: %s.", class_names);
218 	return *ret;
219 }
220 
221 static int
222 parse_class_options(const struct rte_devargs *devargs)
223 {
224 	struct rte_kvargs *kvlist;
225 	int ret = 0;
226 
227 	if (devargs == NULL)
228 		return 0;
229 	if (devargs->cls != NULL && devargs->cls->name != NULL)
230 		/* Global syntax, only one class type. */
231 		return class_name_to_value(devargs->cls->name);
232 	/* Legacy devargs support multiple classes. */
233 	kvlist = rte_kvargs_parse(devargs->args, NULL);
234 	if (kvlist == NULL)
235 		return 0;
236 	rte_kvargs_process(kvlist, RTE_DEVARGS_KEY_CLASS,
237 			   devargs_class_handler, &ret);
238 	rte_kvargs_free(kvlist);
239 	return ret;
240 }
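/*
 * Example (an assumption based on the comments above; the address is a
 * hypothetical placeholder): with legacy devargs several classes may be
 * combined in a single "class" key, while the global devargs syntax carries
 * one class name per device.
 *
 *   -a 0000:03:00.0,class=eth:regex          (legacy, multiple classes)
 *   -a bus=pci,addr=0000:03:00.0/class=eth   (global syntax, one class)
 */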
241 
242 static const unsigned int mlx5_class_invalid_combinations[] = {
243 	MLX5_CLASS_ETH | MLX5_CLASS_VDPA,
244 	/* New class combination should be added here. */
245 };
246 
247 static int
248 is_valid_class_combination(uint32_t user_classes)
249 {
250 	unsigned int i;
251 
252 	/* Verify whether the user specified an unsupported combination. */
253 	for (i = 0; i < RTE_DIM(mlx5_class_invalid_combinations); i++) {
254 		if ((mlx5_class_invalid_combinations[i] & user_classes) ==
255 		    mlx5_class_invalid_combinations[i])
256 			return -EINVAL;
257 	}
258 	/* No invalid class combination found. */
259 	return 0;
260 }
261 
262 static bool
263 mlx5_bus_match(const struct mlx5_class_driver *drv,
264 	       const struct rte_device *dev)
265 {
266 	if (mlx5_dev_is_pci(dev))
267 		return mlx5_dev_pci_match(drv, dev);
268 	return true;
269 }
270 
271 static struct mlx5_common_device *
272 to_mlx5_device(const struct rte_device *rte_dev)
273 {
274 	struct mlx5_common_device *cdev;
275 
276 	TAILQ_FOREACH(cdev, &devices_list, next) {
277 		if (rte_dev == cdev->dev)
278 			return cdev;
279 	}
280 	return NULL;
281 }
282 
283 int
284 mlx5_dev_to_pci_str(const struct rte_device *dev, char *addr, size_t size)
285 {
286 	struct rte_pci_addr pci_addr = { 0 };
287 	int ret;
288 
289 	if (mlx5_dev_is_pci(dev)) {
290 		/* Input might be <BDF>, format PCI address to <DBDF>. */
291 		ret = rte_pci_addr_parse(dev->name, &pci_addr);
292 		if (ret != 0)
293 			return -ENODEV;
294 		rte_pci_device_name(&pci_addr, addr, size);
295 		return 0;
296 	}
297 #ifdef RTE_EXEC_ENV_LINUX
298 	return mlx5_auxiliary_get_pci_str(RTE_DEV_TO_AUXILIARY_CONST(dev),
299 			addr, size);
300 #else
301 	rte_errno = ENODEV;
302 	return -rte_errno;
303 #endif
304 }
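/*
 * Example (illustrative only): for a PCI device named "03:00.0" (short BDF)
 * the helper above writes the full DBDF form "0000:03:00.0" into the output
 * buffer via rte_pci_device_name(); for auxiliary devices the backing PCI
 * address is resolved by mlx5_auxiliary_get_pci_str() on Linux.
 */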
305 
306 /**
307  * Register the mempool for the protection domain.
308  *
309  * @param cdev
310  *   Pointer to the mlx5 common device.
311  * @param mp
312  *   Mempool being registered.
313  *
314  * @return
315  *   0 on success, (-1) on failure and rte_errno is set.
316  */
317 static int
318 mlx5_dev_mempool_register(struct mlx5_common_device *cdev,
319 			  struct rte_mempool *mp)
320 {
321 	return mlx5_mr_mempool_register(cdev, mp);
322 }
323 
324 /**
325  * Unregister the mempool from the protection domain.
326  *
327  * @param cdev
328  *   Pointer to the mlx5 common device.
329  * @param mp
330  *   Mempool being unregistered.
331  */
332 void
333 mlx5_dev_mempool_unregister(struct mlx5_common_device *cdev,
334 			    struct rte_mempool *mp)
335 {
336 	if (mlx5_mr_mempool_unregister(cdev, mp) < 0)
337 		DRV_LOG(WARNING, "Failed to unregister mempool %s for PD %p: %s",
338 			mp->name, cdev->pd, rte_strerror(rte_errno));
339 }
340 
341 /**
342  * rte_mempool_walk() callback to register mempools for the protection domain.
343  *
344  * @param mp
345  *   The mempool being walked.
346  * @param arg
347  *   Pointer to the device shared context.
348  */
349 static void
350 mlx5_dev_mempool_register_cb(struct rte_mempool *mp, void *arg)
351 {
352 	struct mlx5_common_device *cdev = arg;
353 	int ret;
354 
355 	ret = mlx5_dev_mempool_register(cdev, mp);
356 	if (ret < 0 && rte_errno != EEXIST)
357 		DRV_LOG(ERR,
358 			"Failed to register existing mempool %s for PD %p: %s",
359 			mp->name, cdev->pd, rte_strerror(rte_errno));
360 }
361 
362 /**
363  * rte_mempool_walk() callback to unregister mempools
364  * from the protection domain.
365  *
366  * @param mp
367  *   The mempool being walked.
368  * @param arg
369  *   Pointer to the device shared context.
370  */
371 static void
372 mlx5_dev_mempool_unregister_cb(struct rte_mempool *mp, void *arg)
373 {
374 	mlx5_dev_mempool_unregister((struct mlx5_common_device *)arg, mp);
375 }
376 
377 /**
378  * Mempool life cycle callback for mlx5 common devices.
379  *
380  * @param event
381  *   Mempool life cycle event.
382  * @param mp
383  *   Associated mempool.
384  * @param arg
385  *   Pointer to a device shared context.
386  */
387 static void
388 mlx5_dev_mempool_event_cb(enum rte_mempool_event event, struct rte_mempool *mp,
389 			  void *arg)
390 {
391 	struct mlx5_common_device *cdev = arg;
392 
393 	switch (event) {
394 	case RTE_MEMPOOL_EVENT_READY:
395 		if (mlx5_dev_mempool_register(cdev, mp) < 0)
396 			DRV_LOG(ERR,
397 				"Failed to register new mempool %s for PD %p: %s",
398 				mp->name, cdev->pd, rte_strerror(rte_errno));
399 		break;
400 	case RTE_MEMPOOL_EVENT_DESTROY:
401 		mlx5_dev_mempool_unregister(cdev, mp);
402 		break;
403 	}
404 }
405 
406 int
407 mlx5_dev_mempool_subscribe(struct mlx5_common_device *cdev)
408 {
409 	int ret = 0;
410 
411 	if (!cdev->config.mr_mempool_reg_en)
412 		return 0;
413 	rte_rwlock_write_lock(&cdev->mr_scache.mprwlock);
414 	if (cdev->mr_scache.mp_cb_registered)
415 		goto exit;
416 	/* The callback for this device may already be registered. */
417 	ret = rte_mempool_event_callback_register(mlx5_dev_mempool_event_cb,
418 						  cdev);
419 	if (ret != 0 && rte_errno != EEXIST)
420 		goto exit;
421 	/* Register mempools only once for this device. */
422 	if (ret == 0)
423 		rte_mempool_walk(mlx5_dev_mempool_register_cb, cdev);
424 	ret = 0;
425 	cdev->mr_scache.mp_cb_registered = 1;
426 exit:
427 	rte_rwlock_write_unlock(&cdev->mr_scache.mprwlock);
428 	return ret;
429 }
430 
431 static void
432 mlx5_dev_mempool_unsubscribe(struct mlx5_common_device *cdev)
433 {
434 	int ret;
435 
436 	if (!cdev->mr_scache.mp_cb_registered ||
437 	    !cdev->config.mr_mempool_reg_en)
438 		return;
439 	/* Stop watching for mempool events and unregister all mempools. */
440 	ret = rte_mempool_event_callback_unregister(mlx5_dev_mempool_event_cb,
441 						    cdev);
442 	if (ret == 0)
443 		rte_mempool_walk(mlx5_dev_mempool_unregister_cb, cdev);
444 }
445 
446 /**
447  * Callback for memory event.
448  *
449  * @param event_type
450  *   Memory event type.
451  * @param addr
452  *   Address of memory.
453  * @param len
454  *   Size of memory.
455  */
456 static void
457 mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
458 		     size_t len, void *arg __rte_unused)
459 {
460 	struct mlx5_common_device *cdev;
461 
462 	/* Must be called from the primary process. */
463 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
464 	switch (event_type) {
465 	case RTE_MEM_EVENT_FREE:
466 		pthread_mutex_lock(&devices_list_lock);
467 		/* Iterate all the existing mlx5 devices. */
468 		TAILQ_FOREACH(cdev, &devices_list, next)
469 			mlx5_free_mr_by_addr(&cdev->mr_scache,
470 					     mlx5_os_get_ctx_device_name
471 								    (cdev->ctx),
472 					     addr, len);
473 		pthread_mutex_unlock(&devices_list_lock);
474 		break;
475 	case RTE_MEM_EVENT_ALLOC:
476 	default:
477 		break;
478 	}
479 }
480 
481 /**
482  * Release all HW global resources of the device context.
483  *
484  * @param cdev
485  *   Pointer to mlx5 device structure.
489  */
490 static void
491 mlx5_dev_hw_global_release(struct mlx5_common_device *cdev)
492 {
493 	if (cdev->pd != NULL) {
494 		claim_zero(mlx5_os_dealloc_pd(cdev->pd));
495 		cdev->pd = NULL;
496 	}
497 	if (cdev->ctx != NULL) {
498 		claim_zero(mlx5_glue->close_device(cdev->ctx));
499 		cdev->ctx = NULL;
500 	}
501 }
502 
503 /**
504  * Initialize all HW global resources of the device context.
505  *
506  * @param cdev
507  *   Pointer to mlx5 device structure.
508  * @param classes
509  *   Chosen classes come from user device arguments.
510  *   Chosen classes coming from the user device arguments.
511  * @return
512  *   0 on success, a negative errno value otherwise and rte_errno is set.
513  */
514 static int
515 mlx5_dev_hw_global_prepare(struct mlx5_common_device *cdev, uint32_t classes)
516 {
517 	int ret;
518 
519 	/* Create the device context. */
520 	ret = mlx5_os_open_device(cdev, classes);
521 	if (ret < 0)
522 		return ret;
523 	/* Allocate Protection Domain object and extract its pdn. */
524 	ret = mlx5_os_pd_create(cdev);
525 	if (ret)
526 		goto error;
527 	/* All actions taken below are relevant only when DevX is supported */
528 	if (cdev->config.devx == 0)
529 		return 0;
530 	/* Query HCA attributes. */
531 	ret = mlx5_devx_cmd_query_hca_attr(cdev->ctx, &cdev->config.hca_attr);
532 	if (ret) {
533 		DRV_LOG(ERR, "Unable to read HCA capabilities.");
534 		rte_errno = ENOTSUP;
535 		goto error;
536 	}
537 	return 0;
538 error:
539 	mlx5_dev_hw_global_release(cdev);
540 	return ret;
541 }
542 
543 static void
544 mlx5_common_dev_release(struct mlx5_common_device *cdev)
545 {
546 	pthread_mutex_lock(&devices_list_lock);
547 	TAILQ_REMOVE(&devices_list, cdev, next);
548 	pthread_mutex_unlock(&devices_list_lock);
549 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
550 		if (TAILQ_EMPTY(&devices_list))
551 			rte_mem_event_callback_unregister("MLX5_MEM_EVENT_CB",
552 							  NULL);
553 		mlx5_dev_mempool_unsubscribe(cdev);
554 		mlx5_mr_release_cache(&cdev->mr_scache);
555 		mlx5_dev_hw_global_release(cdev);
556 	}
557 	rte_free(cdev);
558 }
559 
560 static struct mlx5_common_device *
561 mlx5_common_dev_create(struct rte_device *eal_dev, uint32_t classes)
562 {
563 	struct mlx5_common_device *cdev;
564 	int ret;
565 
566 	cdev = rte_zmalloc("mlx5_common_device", sizeof(*cdev), 0);
567 	if (!cdev) {
568 		DRV_LOG(ERR, "Device allocation failure.");
569 		rte_errno = ENOMEM;
570 		return NULL;
571 	}
572 	cdev->dev = eal_dev;
573 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
574 		goto exit;
575 	/* Parse device parameters. */
576 	ret = mlx5_common_config_get(eal_dev->devargs, &cdev->config);
577 	if (ret < 0) {
578 		DRV_LOG(ERR, "Failed to process device arguments: %s",
579 			strerror(rte_errno));
580 		rte_free(cdev);
581 		return NULL;
582 	}
583 	mlx5_malloc_mem_select(cdev->config.sys_mem_en);
584 	/* Initialize all HW global resources of the device context. */
585 	ret = mlx5_dev_hw_global_prepare(cdev, classes);
586 	if (ret) {
587 		DRV_LOG(ERR, "Failed to initialize device context.");
588 		rte_free(cdev);
589 		return NULL;
590 	}
591 	/* Initialize global MR cache resources and update its functions. */
592 	ret = mlx5_mr_create_cache(&cdev->mr_scache, eal_dev->numa_node);
593 	if (ret) {
594 		DRV_LOG(ERR, "Failed to initialize the global shared MR cache.");
595 		mlx5_dev_hw_global_release(cdev);
596 		rte_free(cdev);
597 		return NULL;
598 	}
599 	/* Register callback function for global shared MR cache management. */
600 	if (TAILQ_EMPTY(&devices_list))
601 		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
602 						mlx5_mr_mem_event_cb, NULL);
603 exit:
604 	pthread_mutex_lock(&devices_list_lock);
605 	TAILQ_INSERT_HEAD(&devices_list, cdev, next);
606 	pthread_mutex_unlock(&devices_list_lock);
607 	return cdev;
608 }
609 
610 static int
611 drivers_remove(struct mlx5_common_device *cdev, uint32_t enabled_classes)
612 {
613 	struct mlx5_class_driver *driver;
614 	int local_ret = -ENODEV;
615 	unsigned int i = 0;
616 	int ret = 0;
617 
618 	enabled_classes &= cdev->classes_loaded;
619 	while (enabled_classes) {
620 		driver = driver_get(RTE_BIT64(i));
621 		if (driver != NULL) {
622 			local_ret = driver->remove(cdev);
623 			if (local_ret == 0)
624 				cdev->classes_loaded &= ~RTE_BIT64(i);
625 			else if (ret == 0)
626 				ret = local_ret;
627 		}
628 		enabled_classes &= ~RTE_BIT64(i);
629 		i++;
630 	}
631 	if (local_ret != 0 && ret == 0)
632 		ret = local_ret;
633 	return ret;
634 }
635 
636 static int
637 drivers_probe(struct mlx5_common_device *cdev, uint32_t user_classes)
638 {
639 	struct mlx5_class_driver *driver;
640 	uint32_t enabled_classes = 0;
641 	bool already_loaded;
642 	int ret;
643 
644 	TAILQ_FOREACH(driver, &drivers_list, next) {
645 		if ((driver->drv_class & user_classes) == 0)
646 			continue;
647 		if (!mlx5_bus_match(driver, cdev->dev))
648 			continue;
649 		already_loaded = cdev->classes_loaded & driver->drv_class;
650 		if (already_loaded && driver->probe_again == 0) {
651 			DRV_LOG(ERR, "Device %s is already probed",
652 				cdev->dev->name);
653 			ret = -EEXIST;
654 			goto probe_err;
655 		}
656 		ret = driver->probe(cdev);
657 		if (ret < 0) {
658 			DRV_LOG(ERR, "Failed to load driver %s",
659 				driver->name);
660 			goto probe_err;
661 		}
662 		enabled_classes |= driver->drv_class;
663 	}
664 	cdev->classes_loaded |= enabled_classes;
665 	return 0;
666 probe_err:
667 	/* Only unload drivers which are enabled which were enabled
668 	/* Only unload drivers which were enabled
669 	 * in this probe instance.
670 	 */
671 	return ret;
672 }
673 
674 int
675 mlx5_common_dev_probe(struct rte_device *eal_dev)
676 {
677 	struct mlx5_common_device *cdev;
678 	uint32_t classes = 0;
679 	bool new_device = false;
680 	int ret;
681 
682 	DRV_LOG(INFO, "probe device \"%s\".", eal_dev->name);
683 	ret = parse_class_options(eal_dev->devargs);
684 	if (ret < 0) {
685 		DRV_LOG(ERR, "Unsupported mlx5 class type: %s",
686 			eal_dev->devargs->args);
687 		return ret;
688 	}
689 	classes = ret;
690 	if (classes == 0)
691 		/* Default to net class. */
692 		classes = MLX5_CLASS_ETH;
693 	cdev = to_mlx5_device(eal_dev);
694 	if (!cdev) {
695 		cdev = mlx5_common_dev_create(eal_dev, classes);
696 		if (!cdev)
697 			return -ENOMEM;
698 		new_device = true;
699 	}
700 	/*
701 	 * Validate combination here.
702 	 * For a new device, the classes_loaded field is 0 and it checks only
703 	 * the classes given as user device arguments.
704 	 */
705 	ret = is_valid_class_combination(classes | cdev->classes_loaded);
706 	if (ret != 0) {
707 		DRV_LOG(ERR, "Unsupported mlx5 classes combination.");
708 		goto class_err;
709 	}
710 	ret = drivers_probe(cdev, classes);
711 	if (ret)
712 		goto class_err;
713 	return 0;
714 class_err:
715 	if (new_device)
716 		mlx5_common_dev_release(cdev);
717 	return ret;
718 }
719 
720 int
721 mlx5_common_dev_remove(struct rte_device *eal_dev)
722 {
723 	struct mlx5_common_device *cdev;
724 	int ret;
725 
726 	cdev = to_mlx5_device(eal_dev);
727 	if (!cdev)
728 		return -ENODEV;
729 	/* Matching device found, clean up and unload drivers. */
730 	ret = drivers_remove(cdev, cdev->classes_loaded);
731 	if (ret == 0)
732 		mlx5_common_dev_release(cdev);
733 	return ret;
734 }
735 
736 /**
737  * Callback to DMA map external memory to a device.
738  *
739  * @param rte_dev
740  *   Pointer to the generic device.
741  * @param addr
742  *   Starting virtual address of memory to be mapped.
743  * @param iova
744  *   Starting IOVA address of memory to be mapped.
745  * @param len
746  *   Length of memory segment being mapped.
747  *
748  * @return
749  *   0 on success, negative value on error.
750  */
751 int
752 mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
753 			uint64_t iova __rte_unused, size_t len)
754 {
755 	struct mlx5_common_device *dev;
756 	struct mlx5_mr *mr;
757 
758 	dev = to_mlx5_device(rte_dev);
759 	if (!dev) {
760 		DRV_LOG(WARNING,
761 			"Unable to find a matching mlx5 device for device %s",
762 			rte_dev->name);
763 		rte_errno = ENODEV;
764 		return -1;
765 	}
766 	mr = mlx5_create_mr_ext(dev->pd, (uintptr_t)addr, len,
767 				SOCKET_ID_ANY, dev->mr_scache.reg_mr_cb);
768 	if (!mr) {
769 		DRV_LOG(WARNING, "Device %s unable to DMA map", rte_dev->name);
770 		rte_errno = EINVAL;
771 		return -1;
772 	}
773 	rte_rwlock_write_lock(&dev->mr_scache.rwlock);
774 	LIST_INSERT_HEAD(&dev->mr_scache.mr_list, mr, mr);
775 	/* Insert to the global cache table. */
776 	mlx5_mr_insert_cache(&dev->mr_scache, mr);
777 	rte_rwlock_write_unlock(&dev->mr_scache.rwlock);
778 	return 0;
779 }
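/*
 * Usage sketch (an assumption, not part of this driver): external memory is
 * typically registered with rte_extmem_register() and then mapped through
 * the generic EAL call, which is expected to end up in the callback above.
 * The buffer, length, IOVA and device pointer are hypothetical placeholders.
 *
 *   struct rte_device *edev = dev_info.device; // from rte_eth_dev_info_get()
 *   int ret = rte_dev_dma_map(edev, buf, iova, buf_len);
 *   // ret is 0 on success; on failure rte_errno is set by the callback.
 */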
780 
781 /**
782  * Callback to DMA unmap external memory to a device.
783  *
784  * @param rte_dev
785  *   Pointer to the generic device.
786  * @param addr
787  *   Starting virtual address of memory to be unmapped.
788  * @param iova
789  *   Starting IOVA address of memory to be unmapped.
790  * @param len
791  *   Length of memory segment being unmapped.
792  *
793  * @return
794  *   0 on success, negative value on error.
795  */
796 int
797 mlx5_common_dev_dma_unmap(struct rte_device *rte_dev, void *addr,
798 			  uint64_t iova __rte_unused, size_t len __rte_unused)
799 {
800 	struct mlx5_common_device *dev;
801 	struct mr_cache_entry entry;
802 	struct mlx5_mr *mr;
803 
804 	dev = to_mlx5_device(rte_dev);
805 	if (!dev) {
806 		DRV_LOG(WARNING,
807 			"Unable to find a matching mlx5 device for device %s.",
808 			rte_dev->name);
809 		rte_errno = ENODEV;
810 		return -1;
811 	}
812 	rte_rwlock_read_lock(&dev->mr_scache.rwlock);
813 	mr = mlx5_mr_lookup_list(&dev->mr_scache, &entry, (uintptr_t)addr);
814 	if (!mr) {
815 		rte_rwlock_read_unlock(&dev->mr_scache.rwlock);
816 		DRV_LOG(WARNING,
817 			"Address 0x%" PRIxPTR " wasn't registered to device %s",
818 			(uintptr_t)addr, rte_dev->name);
819 		rte_errno = EINVAL;
820 		return -1;
821 	}
822 	LIST_REMOVE(mr, mr);
823 	DRV_LOG(DEBUG, "MR(%p) is removed from list.", (void *)mr);
824 	mlx5_mr_free(mr, dev->mr_scache.dereg_mr_cb);
825 	mlx5_mr_rebuild_cache(&dev->mr_scache);
826 	/*
827 	 * No explicit wmb is needed after updating dev_gen because the
828 	 * store-release ordering in the unlock provides the implicit
829 	 * barrier at the software visible level.
830 	 */
831 	++dev->mr_scache.dev_gen;
832 	DRV_LOG(DEBUG, "Broadcasting local cache flush, gen=%d.",
833 		dev->mr_scache.dev_gen);
834 	rte_rwlock_read_unlock(&dev->mr_scache.rwlock);
835 	return 0;
836 }
837 
838 void
839 mlx5_class_driver_register(struct mlx5_class_driver *driver)
840 {
841 	mlx5_common_driver_on_register_pci(driver);
842 	TAILQ_INSERT_TAIL(&drivers_list, driver, next);
843 }
844 
845 static void mlx5_common_driver_init(void)
846 {
847 	mlx5_common_pci_init();
848 #ifdef RTE_EXEC_ENV_LINUX
849 	mlx5_common_auxiliary_init();
850 #endif
851 }
852 
853 static bool mlx5_common_initialized;
854 
855 /**
856  * One-time initialization routine for the run-time dependency on the glue
857  * library shared by multiple PMDs. Each mlx5 PMD that depends on the
858  * mlx5_common module must invoke it in its constructor.
859  */
860 void
861 mlx5_common_init(void)
862 {
863 	if (mlx5_common_initialized)
864 		return;
865 
866 	pthread_mutex_init(&devices_list_lock, NULL);
867 	mlx5_glue_constructor();
868 	mlx5_common_driver_init();
869 	mlx5_common_initialized = true;
870 }
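/*
 * Registration sketch (illustrative, using only the fields this file
 * references; the driver name and callbacks are hypothetical): a class PMD
 * calls mlx5_common_init() from its constructor and then registers its
 * driver descriptor.
 *
 *   static struct mlx5_class_driver example_driver = {
 *       .drv_class = MLX5_CLASS_REGEX,
 *       .name = "example_mlx5_regex",
 *       .probe = example_probe,
 *       .remove = example_remove,
 *   };
 *
 *   RTE_INIT(example_mlx5_pmd_init)
 *   {
 *       mlx5_common_init();
 *       mlx5_class_driver_register(&example_driver);
 *   }
 */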
871 
872 /**
873  * This function is responsible for initializing the variable
874  *  haswell_broadwell_cpu by checking whether the CPU is Intel
875  *  and reading the data returned from mlx5_cpu_id().
876  *  Since Haswell and Broadwell CPUs do not show improved performance
877  *  when using relaxed ordering, we want to check the CPU type
878  *  before deciding whether to enable RO or not.
879  *  If the CPU is Haswell or Broadwell the variable will be set to 1,
880  *  otherwise it will be 0.
881  */
882 RTE_INIT_PRIO(mlx5_is_haswell_broadwell_cpu, LOG)
883 {
884 #ifdef RTE_ARCH_X86_64
885 	unsigned int broadwell_models[4] = {0x3d, 0x47, 0x4F, 0x56};
886 	unsigned int haswell_models[4] = {0x3c, 0x3f, 0x45, 0x46};
887 	unsigned int i, model, family, brand_id, vendor;
888 	unsigned int signature_intel_ebx = 0x756e6547;
889 	unsigned int extended_model;
890 	unsigned int eax = 0;
891 	unsigned int ebx = 0;
892 	unsigned int ecx = 0;
893 	unsigned int edx = 0;
894 	int max_level;
895 
896 	mlx5_cpu_id(0, &eax, &ebx, &ecx, &edx);
897 	vendor = ebx;
898 	max_level = eax;
899 	if (max_level < 1) {
900 		haswell_broadwell_cpu = 0;
901 		return;
902 	}
903 	mlx5_cpu_id(1, &eax, &ebx, &ecx, &edx);
904 	model = (eax >> 4) & 0x0f;
905 	family = (eax >> 8) & 0x0f;
906 	brand_id = ebx & 0xff;
907 	extended_model = (eax >> 12) & 0xf0;
908 	/* Check if the processor is Haswell or Broadwell */
909 	if (vendor == signature_intel_ebx) {
910 		if (family == 0x06)
911 			model += extended_model;
912 		if (brand_id == 0 && family == 0x6) {
913 			for (i = 0; i < RTE_DIM(broadwell_models); i++)
914 				if (model == broadwell_models[i]) {
915 					haswell_broadwell_cpu = 1;
916 					return;
917 				}
918 			for (i = 0; i < RTE_DIM(haswell_models); i++)
919 				if (model == haswell_models[i]) {
920 					haswell_broadwell_cpu = 1;
921 					return;
922 				}
923 		}
924 	}
925 #endif
926 	haswell_broadwell_cpu = 0;
927 }
928 
929 /**
930  * Allocate the User Access Region with DevX on specified device.
931  *
932  * @param [in] ctx
933  *   Infiniband device context to perform allocation on.
934  * @param [in] mapping
935  *   MLX5DV_UAR_ALLOC_TYPE_BF - allocate as cached memory with write-combining
936  *				attributes (if supported by the host), the
937  *				writes to the UAR registers must be followed
938  *				by a write memory barrier.
939  *   MLX5DV_UAR_ALLOC_TYPE_NC - allocate as non-cached memory, all writes are
940  *				promoted to the registers immediately, no
941  *				memory barriers needed.
942  *   mapping < 0 - the first attempt is performed with MLX5DV_UAR_ALLOC_TYPE_NC,
943  *		   if this fails the next attempt with MLX5DV_UAR_ALLOC_TYPE_BF
944  *		   is performed. The drivers specifying negative values should
945  *		   always provide the write memory barrier operation after UAR
946  *		   register writes.
947  * If there are no definitions for the MLX5DV_UAR_ALLOC_TYPE_xx (older rdma
948  * library headers), the caller can specify 0.
949  *
950  * @return
951  *   UAR object pointer on success, NULL otherwise and rte_errno is set.
952  */
953 void *
954 mlx5_devx_alloc_uar(void *ctx, int mapping)
955 {
956 	void *uar;
957 	uint32_t retry, uar_mapping;
958 	void *base_addr;
959 
960 	for (retry = 0; retry < MLX5_ALLOC_UAR_RETRY; ++retry) {
961 #ifdef MLX5DV_UAR_ALLOC_TYPE_NC
962 		/* Control the mapping type according to the settings. */
963 		uar_mapping = (mapping < 0) ?
964 			      MLX5DV_UAR_ALLOC_TYPE_NC : mapping;
965 #else
966 		/*
967 		 * It seems we have no way to control the memory mapping type
968 		 * for the UAR; the default "Write-Combining" type is assumed.
969 		 */
970 		uar_mapping = 0;
971 		RTE_SET_USED(mapping);
972 #endif
973 		uar = mlx5_glue->devx_alloc_uar(ctx, uar_mapping);
974 #ifdef MLX5DV_UAR_ALLOC_TYPE_NC
975 		if (!uar &&
976 		    mapping < 0 &&
977 		    uar_mapping == MLX5DV_UAR_ALLOC_TYPE_BF) {
978 			/*
979 			 * In some environments, like a virtual machine, the
980 			 * Write-Combining mapping might not be supported and
981 			 * UAR allocation fails. We try the "Non-Cached" mapping
982 			 * for this case.
983 			 */
984 			DRV_LOG(WARNING, "Failed to allocate DevX UAR (BF)");
985 			uar_mapping = MLX5DV_UAR_ALLOC_TYPE_NC;
986 			uar = mlx5_glue->devx_alloc_uar(ctx, uar_mapping);
987 		} else if (!uar &&
988 			   mapping < 0 &&
989 			   uar_mapping == MLX5DV_UAR_ALLOC_TYPE_NC) {
990 			/*
991 			 * If Verbs/kernel does not support "Non-Cached"
992 			 * try the "Write-Combining".
993 			 */
994 			DRV_LOG(WARNING, "Failed to allocate DevX UAR (NC)");
995 			uar_mapping = MLX5DV_UAR_ALLOC_TYPE_BF;
996 			uar = mlx5_glue->devx_alloc_uar(ctx, uar_mapping);
997 		}
998 #endif
999 		if (!uar) {
1000 			DRV_LOG(ERR, "Failed to allocate DevX UAR (BF/NC)");
1001 			rte_errno = ENOMEM;
1002 			goto exit;
1003 		}
1004 		base_addr = mlx5_os_get_devx_uar_base_addr(uar);
1005 		if (base_addr)
1006 			break;
1007 		/*
1008 		 * The UARs are allocated by rdma_core within the
1009 		 * IB device context, on context closure all UARs
1010 		 * will be freed, so there should be no memory/object leakage.
1011 		 */
1012 		DRV_LOG(WARNING, "Retrying to allocate DevX UAR");
1013 		uar = NULL;
1014 	}
1015 	/* Check whether we finally succeeded with valid UAR allocation. */
1016 	if (!uar) {
1017 		DRV_LOG(ERR, "Failed to allocate DevX UAR (NULL base)");
1018 		rte_errno = ENOMEM;
1019 	}
1020 	/*
1021 	 * Returning void * instead of struct mlx5dv_devx_uar *
1022 	 * is for compatibility with older rdma-core library headers.
1023 	 */
1024 exit:
1025 	return uar;
1026 }
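/*
 * Usage sketch (illustrative only): a negative "mapping" value lets the
 * routine above select the mapping type itself and fall back automatically;
 * the base address is then taken from the returned object.
 *
 *   void *uar = mlx5_devx_alloc_uar(cdev->ctx, -1);
 *   if (uar == NULL)
 *       return -rte_errno;   // rte_errno set to ENOMEM above
 *   base_addr = mlx5_os_get_devx_uar_base_addr(uar);
 */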
1027 
1028 RTE_PMD_EXPORT_NAME(mlx5_common_driver, __COUNTER__);
1029