xref: /dpdk/drivers/gpu/cuda/cuda.c (revision b53d106d34b5c638f5a2cbdfee0da5bd42d4383f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_common.h>
8 #include <rte_log.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
11 #include <rte_pci.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
14 #include <rte_dev.h>
15 
16 #include <gpudev_driver.h>
17 #include <cuda.h>
18 #include <cudaTypedefs.h>
19 
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
22 
23 /* CUDA Driver functions loaded with dlsym() */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 		void **pfn, int cudaVersion, uint64_t flags);
28 
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
50 
51 static void *cudalib;
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
54 
55 /* NVIDIA GPU vendor */
56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
57 
58 /* NVIDIA GPU device IDs */
59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
61 
62 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
63 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
64 
65 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
66 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
67 
68 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
69 
70 #define CUDA_MAX_ALLOCATION_NUM 512
71 
72 #define GPU_PAGE_SHIFT 16
73 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
74 
75 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
76 
77 /* Helper macro for logging */
78 #define rte_cuda_log(level, fmt, ...) \
79 	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
80 
81 #define rte_cuda_debug(fmt, ...) \
82 	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
83 		##__VA_ARGS__)
84 
85 /* NVIDIA GPU address map */
86 static const struct rte_pci_id pci_id_cuda_map[] = {
87 	{
88 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
89 				NVIDIA_GPU_A100_40GB_DEVICE_ID)
90 	},
91 	{
92 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
93 				NVIDIA_GPU_A100_80GB_DEVICE_ID)
94 	},
95 	{
96 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
97 				NVIDIA_GPU_A30_24GB_DEVICE_ID)
98 	},
99 	{
100 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
101 				NVIDIA_GPU_A10_24GB_DEVICE_ID)
102 	},
103 	{
104 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
105 				NVIDIA_GPU_V100_32GB_DEVICE_ID)
106 	},
107 	{
108 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
109 				NVIDIA_GPU_V100_16GB_DEVICE_ID)
110 	},
111 	{
112 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
113 				NVIDIA_GPU_T4_16GB_DEVICE_ID)
114 	},
115 	{
116 		.device_id = 0
117 	}
118 };
119 
120 /* Device private info */
struct cuda_info {
	/* Device name as returned by cuDeviceGetName() */
	char gpu_name[RTE_DEV_NAME_MAX_LEN];
	/* CUDA device handle (from cuDeviceGetByPCIBusId() or cuCtxGetDevice()) */
	CUdevice cu_dev;
	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED for cu_dev */
	int gdr_supported;
	/*
	 * Compared against CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE by
	 * cuda_wmb() to decide whether an explicit flush is needed.
	 */
	int gdr_write_ordering;
	/*
	 * Compared against CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST by
	 * cuda_wmb() to decide whether the host can issue the flush.
	 */
	int gdr_flush_type;
};
128 
129 /* Type of memory allocated by CUDA driver */
enum mem_type {
	/* Device memory obtained through cuda_mem_alloc() */
	GPU_MEM = 0,
	/* Host memory registered through cuda_mem_register() */
	CPU_REGISTERED,
	GPU_REGISTERED /* Not used yet */
};
135 
136 /* key associated to a memory address */
137 typedef uintptr_t cuda_ptr_key;
138 
139 /* Single entry of the memory list */
struct mem_entry {
	/* Device-side address of the area */
	CUdeviceptr ptr_d;
	/* Host-side address; NULL for GPU_MEM entries */
	void *ptr_h;
	/* Size in bytes as requested by the caller */
	size_t size;
	/* gpudev device the area was allocated/registered for */
	struct rte_gpu *dev;
	/* CUDA context taken from dev->mpshared->info.context */
	CUcontext ctx;
	/* Lookup key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
	cuda_ptr_key pkey;
	/* Kind of memory tracked by this entry */
	enum mem_type mtype;
	/* Doubly linked list pointers; NULL at the respective list end */
	struct mem_entry *prev;
	struct mem_entry *next;
};
151 
152 static struct mem_entry *mem_alloc_list_head;
153 static struct mem_entry *mem_alloc_list_tail;
154 static uint32_t mem_alloc_list_last_elem;
155 
156 /* Load the CUDA symbols */
157 
158 static int
159 cuda_loader(void)
160 {
161 	char cuda_path[1024];
162 
163 	if (getenv("CUDA_PATH_L") == NULL)
164 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
165 	else
166 		snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
167 
168 	cudalib = dlopen(cuda_path, RTLD_LAZY);
169 	if (cudalib == NULL) {
170 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
171 				cuda_path, getenv("CUDA_PATH_L"));
172 		return -1;
173 	}
174 
175 	return 0;
176 }
177 
178 static int
179 cuda_sym_func_loader(void)
180 {
181 	if (cudalib == NULL)
182 		return -1;
183 
184 	sym_cuInit = dlsym(cudalib, "cuInit");
185 	if (sym_cuInit == NULL) {
186 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
187 		return -1;
188 	}
189 
190 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
191 	if (sym_cuDriverGetVersion == NULL) {
192 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
193 		return -1;
194 	}
195 
196 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
197 	if (sym_cuGetProcAddress == NULL) {
198 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
199 		return -1;
200 	}
201 
202 	return 0;
203 }
204 
205 static int
206 cuda_pfn_func_loader(void)
207 {
208 	CUresult res;
209 
210 	res = sym_cuGetProcAddress("cuGetErrorString",
211 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
212 	if (res != 0) {
213 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
214 		return -1;
215 	}
216 
217 	res = sym_cuGetProcAddress("cuGetErrorName",
218 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
219 	if (res != 0) {
220 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
221 		return -1;
222 	}
223 
224 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
225 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
226 	if (res != 0) {
227 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
228 		return -1;
229 	}
230 
231 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
232 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
233 	if (res != 0) {
234 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
235 		return -1;
236 	}
237 
238 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
239 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
240 	if (res != 0) {
241 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
242 		return -1;
243 	}
244 
245 	res = sym_cuGetProcAddress("cuDeviceGetName",
246 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
247 	if (res != 0) {
248 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
249 		return -1;
250 	}
251 
252 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
253 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
254 	if (res != 0) {
255 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
256 		return -1;
257 	}
258 
259 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
260 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
261 	if (res != 0) {
262 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
263 		return -1;
264 	}
265 
266 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
267 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
268 	if (res != 0) {
269 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
270 		return -1;
271 	}
272 
273 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
274 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
275 	if (res != 0) {
276 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
277 		return -1;
278 	}
279 
280 	res = sym_cuGetProcAddress("cuCtxGetDevice",
281 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
282 	if (res != 0) {
283 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
284 		return -1;
285 	}
286 
287 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
288 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
289 	if (res != 0) {
290 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
291 		return -1;
292 	}
293 
294 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
295 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
296 	if (res != 0) {
297 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
298 		return -1;
299 	}
300 
301 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
302 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
303 	if (res != 0) {
304 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
305 		return -1;
306 	}
307 
308 	res = sym_cuGetProcAddress("cuMemAlloc",
309 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
310 	if (res != 0) {
311 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
312 		return -1;
313 	}
314 
315 	res = sym_cuGetProcAddress("cuMemFree",
316 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
317 	if (res != 0) {
318 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
319 		return -1;
320 	}
321 
322 	res = sym_cuGetProcAddress("cuMemHostRegister",
323 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
324 	if (res != 0) {
325 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
326 		return -1;
327 	}
328 
329 	res = sym_cuGetProcAddress("cuMemHostUnregister",
330 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
331 	if (res != 0) {
332 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
333 		return -1;
334 	}
335 
336 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
337 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
338 	if (res != 0) {
339 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
340 		return -1;
341 	}
342 
343 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
344 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
345 	if (res != 0) {
346 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
347 		return -1;
348 	}
349 
350 	return 0;
351 }
352 
353 /* Generate a key from a memory pointer */
354 static cuda_ptr_key
355 get_hash_from_ptr(void *ptr)
356 {
357 	return (uintptr_t)ptr;
358 }
359 
360 static uint32_t
361 mem_list_count_item(void)
362 {
363 	return mem_alloc_list_last_elem;
364 }
365 
366 /* Initiate list of memory allocations if not done yet */
367 static struct mem_entry *
368 mem_list_add_item(void)
369 {
370 	/* Initiate list of memory allocations if not done yet */
371 	if (mem_alloc_list_head == NULL) {
372 		mem_alloc_list_head = rte_zmalloc(NULL,
373 				sizeof(struct mem_entry),
374 				RTE_CACHE_LINE_SIZE);
375 		if (mem_alloc_list_head == NULL) {
376 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
377 			return NULL;
378 		}
379 
380 		mem_alloc_list_head->next = NULL;
381 		mem_alloc_list_head->prev = NULL;
382 		mem_alloc_list_tail = mem_alloc_list_head;
383 	} else {
384 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
385 				sizeof(struct mem_entry),
386 				RTE_CACHE_LINE_SIZE);
387 
388 		if (mem_alloc_list_cur == NULL) {
389 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
390 			return NULL;
391 		}
392 
393 		mem_alloc_list_tail->next = mem_alloc_list_cur;
394 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
395 		mem_alloc_list_tail = mem_alloc_list_tail->next;
396 		mem_alloc_list_tail->next = NULL;
397 	}
398 
399 	mem_alloc_list_last_elem++;
400 
401 	return mem_alloc_list_tail;
402 }
403 
404 static struct mem_entry *
405 mem_list_find_item(cuda_ptr_key pk)
406 {
407 	struct mem_entry *mem_alloc_list_cur = NULL;
408 
409 	if (mem_alloc_list_head == NULL) {
410 		rte_cuda_log(ERR, "Memory list doesn't exist");
411 		return NULL;
412 	}
413 
414 	if (mem_list_count_item() == 0) {
415 		rte_cuda_log(ERR, "No items in memory list");
416 		return NULL;
417 	}
418 
419 	mem_alloc_list_cur = mem_alloc_list_head;
420 
421 	while (mem_alloc_list_cur != NULL) {
422 		if (mem_alloc_list_cur->pkey == pk)
423 			return mem_alloc_list_cur;
424 		mem_alloc_list_cur = mem_alloc_list_cur->next;
425 	}
426 
427 	return mem_alloc_list_cur;
428 }
429 
430 static int
431 mem_list_del_item(cuda_ptr_key pk)
432 {
433 	struct mem_entry *mem_alloc_list_cur = NULL;
434 
435 	mem_alloc_list_cur = mem_list_find_item(pk);
436 	if (mem_alloc_list_cur == NULL)
437 		return -EINVAL;
438 
439 	/* if key is in head */
440 	if (mem_alloc_list_cur->prev == NULL)
441 		mem_alloc_list_head = mem_alloc_list_cur->next;
442 	else {
443 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
444 		if (mem_alloc_list_cur->next != NULL)
445 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
446 	}
447 
448 	rte_free(mem_alloc_list_cur);
449 
450 	mem_alloc_list_last_elem--;
451 
452 	return 0;
453 }
454 
455 static int
456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
457 {
458 	int ret = 0;
459 	CUresult res;
460 	struct rte_gpu_info parent_info;
461 	CUexecAffinityParam affinityPrm;
462 	const char *err_string;
463 	struct cuda_info *private;
464 	CUcontext current_ctx;
465 	CUcontext input_ctx;
466 
467 	if (dev == NULL) {
468 		rte_errno = ENODEV;
469 		return -rte_errno;
470 	}
471 
472 	/* Child initialization time probably called by rte_gpu_add_child() */
473 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
474 			dev->mpshared->dev_private == NULL) {
475 		/* Store current ctx */
476 		res = pfn_cuCtxGetCurrent(&current_ctx);
477 		if (res != 0) {
478 			pfn_cuGetErrorString(res, &(err_string));
479 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
480 					err_string);
481 			rte_errno = EPERM;
482 			return -rte_errno;
483 		}
484 
485 		/* Set child ctx as current ctx */
486 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
487 		res = pfn_cuCtxSetCurrent(input_ctx);
488 		if (res != 0) {
489 			pfn_cuGetErrorString(res, &(err_string));
490 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
491 					err_string);
492 			rte_errno = EPERM;
493 			return -rte_errno;
494 		}
495 
496 		/*
497 		 * Ctx capacity info
498 		 */
499 
500 		/* MPS compatible */
501 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
502 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
503 		if (res != 0) {
504 			pfn_cuGetErrorString(res, &(err_string));
505 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
506 					err_string);
507 		}
508 		dev->mpshared->info.processor_count =
509 				(uint32_t)affinityPrm.param.smCount.val;
510 
511 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
512 		if (ret) {
513 			rte_errno = ENODEV;
514 			return -rte_errno;
515 		}
516 		dev->mpshared->info.total_memory = parent_info.total_memory;
517 
518 		/*
519 		 * GPU Device private info
520 		 */
521 		dev->mpshared->dev_private = rte_zmalloc(NULL,
522 				sizeof(struct cuda_info),
523 				RTE_CACHE_LINE_SIZE);
524 		if (dev->mpshared->dev_private == NULL) {
525 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
526 			rte_errno = EPERM;
527 			return -rte_errno;
528 		}
529 
530 		private = (struct cuda_info *)dev->mpshared->dev_private;
531 
532 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
533 		if (res != 0) {
534 			pfn_cuGetErrorString(res, &(err_string));
535 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
536 					err_string);
537 			rte_errno = EPERM;
538 			return -rte_errno;
539 		}
540 
541 		res = pfn_cuDeviceGetName(private->gpu_name,
542 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
543 		if (res != 0) {
544 			pfn_cuGetErrorString(res, &(err_string));
545 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
546 					err_string);
547 			rte_errno = EPERM;
548 			return -rte_errno;
549 		}
550 
551 		/* Restore original ctx as current ctx */
552 		res = pfn_cuCtxSetCurrent(current_ctx);
553 		if (res != 0) {
554 			pfn_cuGetErrorString(res, &(err_string));
555 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
556 					err_string);
557 			rte_errno = EPERM;
558 			return -rte_errno;
559 		}
560 	}
561 
562 	*info = dev->mpshared->info;
563 
564 	return 0;
565 }
566 
567 /*
568  * GPU Memory
569  */
570 
571 static int
572 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
573 {
574 	CUresult res;
575 	const char *err_string;
576 	CUcontext current_ctx;
577 	CUcontext input_ctx;
578 	unsigned int flag = 1;
579 
580 	if (dev == NULL)
581 		return -ENODEV;
582 
583 	/* Store current ctx */
584 	res = pfn_cuCtxGetCurrent(&current_ctx);
585 	if (res != 0) {
586 		pfn_cuGetErrorString(res, &(err_string));
587 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
588 				err_string);
589 		rte_errno = EPERM;
590 		return -rte_errno;
591 	}
592 
593 	/* Set child ctx as current ctx */
594 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
595 	res = pfn_cuCtxSetCurrent(input_ctx);
596 	if (res != 0) {
597 		pfn_cuGetErrorString(res, &(err_string));
598 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
599 				err_string);
600 		rte_errno = EPERM;
601 		return -rte_errno;
602 	}
603 
604 	/* Get next memory list item */
605 	mem_alloc_list_tail = mem_list_add_item();
606 	if (mem_alloc_list_tail == NULL) {
607 		rte_errno = EPERM;
608 		return -rte_errno;
609 	}
610 
611 	/* Allocate memory */
612 	mem_alloc_list_tail->size = size;
613 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
614 			mem_alloc_list_tail->size);
615 	if (res != 0) {
616 		pfn_cuGetErrorString(res, &(err_string));
617 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
618 				err_string);
619 		rte_errno = EPERM;
620 		return -rte_errno;
621 	}
622 
623 	/* GPUDirect RDMA attribute required */
624 	res = pfn_cuPointerSetAttribute(&flag,
625 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
626 			mem_alloc_list_tail->ptr_d);
627 	if (res != 0) {
628 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
629 				"GPU memory at  %"PRIu32", err %d",
630 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
631 		rte_errno = EPERM;
632 		return -rte_errno;
633 	}
634 
635 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
636 	mem_alloc_list_tail->ptr_h = NULL;
637 	mem_alloc_list_tail->size = size;
638 	mem_alloc_list_tail->dev = dev;
639 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
640 	mem_alloc_list_tail->mtype = GPU_MEM;
641 
642 	/* Restore original ctx as current ctx */
643 	res = pfn_cuCtxSetCurrent(current_ctx);
644 	if (res != 0) {
645 		pfn_cuGetErrorString(res, &(err_string));
646 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
647 				err_string);
648 		rte_errno = EPERM;
649 		return -rte_errno;
650 	}
651 
652 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
653 
654 	return 0;
655 }
656 
657 static int
658 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
659 {
660 	CUresult res;
661 	const char *err_string;
662 	CUcontext current_ctx;
663 	CUcontext input_ctx;
664 	unsigned int flag = 1;
665 	int use_ptr_h = 0;
666 
667 	if (dev == NULL)
668 		return -ENODEV;
669 
670 	/* Store current ctx */
671 	res = pfn_cuCtxGetCurrent(&current_ctx);
672 	if (res != 0) {
673 		pfn_cuGetErrorString(res, &(err_string));
674 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
675 				err_string);
676 		rte_errno = EPERM;
677 		return -rte_errno;
678 	}
679 
680 	/* Set child ctx as current ctx */
681 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
682 	res = pfn_cuCtxSetCurrent(input_ctx);
683 	if (res != 0) {
684 		pfn_cuGetErrorString(res, &(err_string));
685 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
686 				err_string);
687 		rte_errno = EPERM;
688 		return -rte_errno;
689 	}
690 
691 	/* Get next memory list item */
692 	mem_alloc_list_tail = mem_list_add_item();
693 	if (mem_alloc_list_tail == NULL) {
694 		rte_errno = EPERM;
695 		return -rte_errno;
696 	}
697 
698 	/* Allocate memory */
699 	mem_alloc_list_tail->size = size;
700 	mem_alloc_list_tail->ptr_h = ptr;
701 
702 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
703 			mem_alloc_list_tail->size,
704 			CU_MEMHOSTREGISTER_PORTABLE |
705 			CU_MEMHOSTREGISTER_DEVICEMAP);
706 	if (res != 0) {
707 		pfn_cuGetErrorString(res, &(err_string));
708 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
709 				err_string,
710 				mem_alloc_list_tail->ptr_h,
711 				mem_alloc_list_tail->size);
712 		rte_errno = EPERM;
713 		return -rte_errno;
714 	}
715 
716 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
717 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
718 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
719 	if (res != 0) {
720 		pfn_cuGetErrorString(res, &(err_string));
721 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
722 				err_string);
723 		rte_errno = EPERM;
724 		return -rte_errno;
725 	}
726 
727 	if (use_ptr_h == 0) {
728 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
729 				mem_alloc_list_tail->ptr_h, 0);
730 		if (res != 0) {
731 			pfn_cuGetErrorString(res, &(err_string));
732 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
733 					err_string);
734 			rte_errno = EPERM;
735 			return -rte_errno;
736 		}
737 
738 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
739 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
740 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
741 			rte_errno = ENOTSUP;
742 			return -rte_errno;
743 		}
744 	} else {
745 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
746 	}
747 
748 	/* GPUDirect RDMA attribute required */
749 	res = pfn_cuPointerSetAttribute(&flag,
750 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
751 			mem_alloc_list_tail->ptr_d);
752 	if (res != 0) {
753 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
754 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
755 		rte_errno = EPERM;
756 		return -rte_errno;
757 	}
758 
759 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
760 	mem_alloc_list_tail->size = size;
761 	mem_alloc_list_tail->dev = dev;
762 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
763 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
764 
765 	/* Restore original ctx as current ctx */
766 	res = pfn_cuCtxSetCurrent(current_ctx);
767 	if (res != 0) {
768 		pfn_cuGetErrorString(res, &(err_string));
769 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
770 				err_string);
771 		rte_errno = EPERM;
772 		return -rte_errno;
773 	}
774 
775 	return 0;
776 }
777 
778 static int
779 cuda_mem_free(struct rte_gpu *dev, void *ptr)
780 {
781 	CUresult res;
782 	struct mem_entry *mem_item;
783 	const char *err_string;
784 	cuda_ptr_key hk;
785 
786 	if (dev == NULL)
787 		return -ENODEV;
788 
789 	hk = get_hash_from_ptr((void *)ptr);
790 
791 	mem_item = mem_list_find_item(hk);
792 	if (mem_item == NULL) {
793 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
794 		rte_errno = EPERM;
795 		return -rte_errno;
796 	}
797 
798 	if (mem_item->mtype == GPU_MEM) {
799 		res = pfn_cuMemFree(mem_item->ptr_d);
800 		if (res != 0) {
801 			pfn_cuGetErrorString(res, &(err_string));
802 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
803 					err_string);
804 			rte_errno = EPERM;
805 			return -rte_errno;
806 		}
807 
808 		return mem_list_del_item(hk);
809 	}
810 
811 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
812 
813 	return -EPERM;
814 }
815 
816 static int
817 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
818 {
819 	CUresult res;
820 	struct mem_entry *mem_item;
821 	const char *err_string;
822 	cuda_ptr_key hk;
823 
824 	if (dev == NULL)
825 		return -ENODEV;
826 
827 	hk = get_hash_from_ptr((void *)ptr);
828 
829 	mem_item = mem_list_find_item(hk);
830 	if (mem_item == NULL) {
831 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
832 		rte_errno = EPERM;
833 		return -rte_errno;
834 	}
835 
836 	if (mem_item->mtype == CPU_REGISTERED) {
837 		res = pfn_cuMemHostUnregister(ptr);
838 		if (res != 0) {
839 			pfn_cuGetErrorString(res, &(err_string));
840 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
841 					err_string);
842 			rte_errno = EPERM;
843 			return -rte_errno;
844 		}
845 
846 		return mem_list_del_item(hk);
847 	}
848 
849 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
850 
851 	rte_errno = EPERM;
852 	return -rte_errno;
853 }
854 
855 static int
856 cuda_dev_close(struct rte_gpu *dev)
857 {
858 	if (dev == NULL)
859 		return -EINVAL;
860 
861 	rte_free(dev->mpshared->dev_private);
862 
863 	return 0;
864 }
865 
866 static int
867 cuda_wmb(struct rte_gpu *dev)
868 {
869 	CUresult res;
870 	const char *err_string;
871 	CUcontext current_ctx;
872 	CUcontext input_ctx;
873 	struct cuda_info *private;
874 
875 	if (dev == NULL) {
876 		rte_errno = ENODEV;
877 		return -rte_errno;
878 	}
879 
880 	private = (struct cuda_info *)dev->mpshared->dev_private;
881 
882 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
883 		/*
884 		 * No need to explicitly force the write ordering because
885 		 * the device natively supports it
886 		 */
887 		return 0;
888 	}
889 
890 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
891 		/*
892 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
893 		 * Application needs to use alternative methods.
894 		 */
895 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
896 				"Application needs to use alternative methods.");
897 
898 		rte_errno = ENOTSUP;
899 		return -rte_errno;
900 	}
901 
902 	/* Store current ctx */
903 	res = pfn_cuCtxGetCurrent(&current_ctx);
904 	if (res != 0) {
905 		pfn_cuGetErrorString(res, &(err_string));
906 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
907 				err_string);
908 		rte_errno = EPERM;
909 		return -rte_errno;
910 	}
911 
912 	/* Set child ctx as current ctx */
913 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
914 	res = pfn_cuCtxSetCurrent(input_ctx);
915 	if (res != 0) {
916 		pfn_cuGetErrorString(res, &(err_string));
917 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
918 				err_string);
919 		rte_errno = EPERM;
920 		return -rte_errno;
921 	}
922 
923 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
924 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
925 	if (res != 0) {
926 		pfn_cuGetErrorString(res, &(err_string));
927 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
928 				err_string);
929 		rte_errno = EPERM;
930 		return -rte_errno;
931 	}
932 
933 	/* Restore original ctx as current ctx */
934 	res = pfn_cuCtxSetCurrent(current_ctx);
935 	if (res != 0) {
936 		pfn_cuGetErrorString(res, &(err_string));
937 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
938 				err_string);
939 		rte_errno = EPERM;
940 		return -rte_errno;
941 	}
942 
943 	return 0;
944 }
945 
946 static int
947 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
948 {
949 	struct rte_gpu *dev = NULL;
950 	CUresult res;
951 	CUdevice cu_dev_id;
952 	CUcontext pctx;
953 	char dev_name[RTE_DEV_NAME_MAX_LEN];
954 	const char *err_string;
955 	int processor_count = 0;
956 	struct cuda_info *private;
957 
958 	if (pci_dev == NULL) {
959 		rte_cuda_log(ERR, "NULL PCI device");
960 		rte_errno = ENODEV;
961 		return -rte_errno;
962 	}
963 
964 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
965 
966 	/* Allocate memory to be used privately by drivers */
967 	dev = rte_gpu_allocate(pci_dev->device.name);
968 	if (dev == NULL) {
969 		rte_errno = ENODEV;
970 		return -rte_errno;
971 	}
972 
973 	/* Initialize values only for the first CUDA driver call */
974 	if (dev->mpshared->info.dev_id == 0) {
975 		mem_alloc_list_head = NULL;
976 		mem_alloc_list_tail = NULL;
977 		mem_alloc_list_last_elem = 0;
978 
979 		/* Load libcuda.so library */
980 		if (cuda_loader()) {
981 			rte_cuda_log(ERR, "CUDA Driver library not found");
982 			rte_errno = ENOTSUP;
983 			return -rte_errno;
984 		}
985 
986 		/* Load initial CUDA functions */
987 		if (cuda_sym_func_loader()) {
988 			rte_cuda_log(ERR, "CUDA functions not found in library");
989 			rte_errno = ENOTSUP;
990 			return -rte_errno;
991 		}
992 
993 		/*
994 		 * Required to initialize the CUDA Driver.
995 		 * Multiple calls of cuInit() will return immediately
996 		 * without making any relevant change
997 		 */
998 		sym_cuInit(0);
999 
1000 		res = sym_cuDriverGetVersion(&cuda_driver_version);
1001 		if (res != 0) {
1002 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1003 			rte_errno = ENOTSUP;
1004 			return -rte_errno;
1005 		}
1006 
1007 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1008 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1009 					"Minimum requirement is %d",
1010 					cuda_driver_version,
1011 					CUDA_DRIVER_MIN_VERSION);
1012 			rte_errno = ENOTSUP;
1013 			return -rte_errno;
1014 		}
1015 
1016 		if (cuda_pfn_func_loader()) {
1017 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1018 			rte_errno = ENOTSUP;
1019 			return -rte_errno;
1020 		}
1021 	}
1022 
1023 	/* Fill HW specific part of device structure */
1024 	dev->device = &pci_dev->device;
1025 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1026 
1027 	/* Get NVIDIA GPU Device descriptor */
1028 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1029 	if (res != 0) {
1030 		pfn_cuGetErrorString(res, &(err_string));
1031 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1032 				dev->device->name, res, err_string);
1033 		rte_errno = EPERM;
1034 		return -rte_errno;
1035 	}
1036 
1037 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1038 	if (res != 0) {
1039 		pfn_cuGetErrorString(res, &(err_string));
1040 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1041 				dev->device->name, res, err_string);
1042 		rte_errno = EPERM;
1043 		return -rte_errno;
1044 	}
1045 
1046 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1047 	if (res != 0) {
1048 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1049 		rte_errno = ENOTSUP;
1050 		return -rte_errno;
1051 	}
1052 
1053 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1054 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1055 				cuda_api_version, CUDA_API_MIN_VERSION);
1056 		rte_errno = ENOTSUP;
1057 		return -rte_errno;
1058 	}
1059 
1060 	dev->mpshared->info.context = (uint64_t)pctx;
1061 
1062 	/*
1063 	 * GPU Device generic info
1064 	 */
1065 
1066 	/* Processor count */
1067 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1068 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1069 			cu_dev_id);
1070 	if (res != 0) {
1071 		pfn_cuGetErrorString(res, &(err_string));
1072 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1073 				err_string);
1074 		rte_errno = EPERM;
1075 		return -rte_errno;
1076 	}
1077 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1078 
1079 	/* Total memory */
1080 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1081 	if (res != 0) {
1082 		pfn_cuGetErrorString(res, &(err_string));
1083 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1084 				err_string);
1085 		rte_errno = EPERM;
1086 		return -rte_errno;
1087 	}
1088 
1089 	/*
1090 	 * GPU Device private info
1091 	 */
1092 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1093 			sizeof(struct cuda_info),
1094 			RTE_CACHE_LINE_SIZE);
1095 	if (dev->mpshared->dev_private == NULL) {
1096 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1097 		rte_errno = EPERM;
1098 		return -rte_errno;
1099 	}
1100 
1101 	private = (struct cuda_info *)dev->mpshared->dev_private;
1102 	private->cu_dev = cu_dev_id;
1103 	res = pfn_cuDeviceGetName(private->gpu_name,
1104 			RTE_DEV_NAME_MAX_LEN,
1105 			cu_dev_id);
1106 	if (res != 0) {
1107 		pfn_cuGetErrorString(res, &(err_string));
1108 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1109 				err_string);
1110 		rte_errno = EPERM;
1111 		return -rte_errno;
1112 	}
1113 
1114 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1115 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1116 			cu_dev_id);
1117 	if (res != 0) {
1118 		pfn_cuGetErrorString(res, &(err_string));
1119 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1120 				err_string);
1121 		rte_errno = EPERM;
1122 		return -rte_errno;
1123 	}
1124 
1125 	if (private->gdr_supported == 0)
1126 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1127 				pci_dev->device.name);
1128 
1129 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1130 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1131 			cu_dev_id);
1132 	if (res != 0) {
1133 		pfn_cuGetErrorString(res, &(err_string));
1134 		rte_cuda_log(ERR,
1135 				"cuDeviceGetAttribute failed with %s",
1136 				err_string);
1137 		rte_errno = EPERM;
1138 		return -rte_errno;
1139 	}
1140 
1141 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1142 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1143 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1144 				cu_dev_id);
1145 		if (res != 0) {
1146 			pfn_cuGetErrorString(res, &(err_string));
1147 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1148 					err_string);
1149 			rte_errno = EPERM;
1150 			return -rte_errno;
1151 		}
1152 
1153 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1154 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1155 	}
1156 
1157 	dev->ops.dev_info_get = cuda_dev_info_get;
1158 	dev->ops.dev_close = cuda_dev_close;
1159 	dev->ops.mem_alloc = cuda_mem_alloc;
1160 	dev->ops.mem_free = cuda_mem_free;
1161 	dev->ops.mem_register = cuda_mem_register;
1162 	dev->ops.mem_unregister = cuda_mem_unregister;
1163 	dev->ops.wmb = cuda_wmb;
1164 
1165 	rte_gpu_complete_new(dev);
1166 
1167 	rte_cuda_debug("dev id = %u name = %s",
1168 			dev->mpshared->info.dev_id, private->gpu_name);
1169 
1170 	return 0;
1171 }
1172 
1173 static int
1174 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1175 {
1176 	struct rte_gpu *dev;
1177 	int ret;
1178 	uint8_t gpu_id;
1179 
1180 	if (pci_dev == NULL) {
1181 		rte_errno = ENODEV;
1182 		return -rte_errno;
1183 	}
1184 
1185 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1186 	if (dev == NULL) {
1187 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1188 				pci_dev->device.name);
1189 		rte_errno = ENODEV;
1190 		return -rte_errno;
1191 	}
1192 	gpu_id = dev->mpshared->info.dev_id;
1193 
1194 	/* release dev from library */
1195 	ret = rte_gpu_release(dev);
1196 	if (ret)
1197 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1198 
1199 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1200 
1201 	return 0;
1202 }
1203 
1204 static struct rte_pci_driver rte_cuda_driver = {
1205 	.id_table = pci_id_cuda_map,
1206 	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1207 	.probe = cuda_gpu_probe,
1208 	.remove = cuda_gpu_remove,
1209 };
1210 
1211 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1212 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1213 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1214