xref: /dpdk/drivers/gpu/cuda/cuda.c (revision 7cc8ef9cf4e9d1f3b1c16daea706f9f433968c61)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_malloc.h>
8 #include <rte_pci.h>
9 #include <rte_bus_pci.h>
10 #include <rte_byteorder.h>
11 #include <rte_dev.h>
12 
13 #include <gpudev_driver.h>
14 
15 #include <cuda.h>
16 #include <cudaTypedefs.h>
17 
18 #include "common.h"
19 
/*
 * Lowest CUDA driver version accepted by the probe routine; it is compared
 * against the value reported by cuDriverGetVersion().
 */
#define CUDA_DRIVER_MIN_VERSION 11040
/* NOTE(review): not referenced in the visible part of this file - confirm it is still needed */
#define CUDA_API_MIN_VERSION 3020
22 
/*
 * CUDA Driver functions loaded with dlsym().
 * These three are resolved by cuda_sym_func_loader() from the library handle
 * stored in cudalib.
 */
static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
		void **pfn, int cudaVersion, uint64_t flags);

/*
 * CUDA Driver functions loaded with cuGetProcAddress for versioning.
 * All of them are filled by cuda_pfn_func_loader(), which passes
 * cuda_driver_version as the requested version.
 */
static PFN_cuGetErrorString pfn_cuGetErrorString;
static PFN_cuGetErrorName pfn_cuGetErrorName;
static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
static PFN_cuDeviceGetName pfn_cuDeviceGetName;
static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
static PFN_cuMemAlloc pfn_cuMemAlloc;
static PFN_cuMemFree pfn_cuMemFree;
static PFN_cuMemHostRegister pfn_cuMemHostRegister;
static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;

/* Handle returned by dlopen() in cuda_loader() */
static void *cudalib;
/* NOTE(review): never written in the visible part of this file - confirm where it is set */
static unsigned int cuda_api_version;
/* Driver version reported by cuDriverGetVersion() during probe */
static int cuda_driver_version;
/* Handle passed to gdrcopy_pin()/gdrcopy_unpin(); reset to NULL during probe */
static gdr_t gdrc_h;
55 
/* NVIDIA GPU vendor (PCI vendor ID used in pci_id_cuda_map) */
#define NVIDIA_GPU_VENDOR_ID (0x10de)

/* NVIDIA GPU device IDs (PCI device IDs listed in pci_id_cuda_map) */
#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
#define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)

#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)

#define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
#define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)

#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)

/* NOTE(review): not enforced anywhere in the visible part of this file - confirm usage */
#define CUDA_MAX_ALLOCATION_NUM 512

/* GPU page size expressed as a shift and as a byte count (1 << 16 = 64 KiB) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)

/* Register the cuda_logtype log type with NOTICE as its default level */
RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
79 
/*
 * NVIDIA GPU address map: table of the PCI vendor/device ID pairs listed
 * for this driver. The final entry with device_id set to 0 terminates
 * the table.
 */
static const struct rte_pci_id pci_id_cuda_map[] = {
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_40GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_80GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A30_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A10_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_16GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_T4_16GB_DEVICE_ID)
	},
	{
		/* Sentinel: end of table */
		.device_id = 0
	}
};
122 
/* Device private info, stored in dev->mpshared->dev_private */
struct cuda_info {
	/* Device name as returned by cuDeviceGetName() */
	char gpu_name[RTE_DEV_NAME_MAX_LEN];
	/* CUDA device descriptor of this GPU */
	CUdevice cu_dev;
	/* NOTE(review): not read in the visible part of this file - presumably a GPUDirect RDMA capability flag */
	int gdr_supported;
	/* Compared with CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE in cuda_wmb() */
	int gdr_write_ordering;
	/* Compared with CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST in cuda_wmb() */
	int gdr_flush_type;
};
131 
/* Type of memory allocated by CUDA driver, recorded in each memory list entry */
enum mem_type {
	GPU_MEM = 0, /* Set by cuda_mem_alloc() for memory obtained with cuMemAlloc */
	CPU_REGISTERED, /* Set by cuda_mem_register() for memory registered with cuMemHostRegister */
	GPU_REGISTERED /* Not used yet */
};
138 
/*
 * Key associated to a memory address; get_hash_from_ptr() produces it by
 * casting the pointer value to an integer.
 */
typedef uintptr_t cuda_ptr_key;
141 
/* Single entry of the memory list (doubly linked through prev/next) */
struct mem_entry {
	/* Device address handed to the application (aligned for GPU_MEM entries) */
	CUdeviceptr ptr_d;
	/* Address returned by cuMemAlloc; the one passed to cuMemFree */
	CUdeviceptr ptr_orig_d;
	/* Host address: the registered pointer, or the mapping filled by gdrcopy_pin() */
	void *ptr_h;
	/* Size requested by the caller */
	size_t size;
	/* Size actually allocated (size + align), set for GPU_MEM entries */
	size_t size_orig;
	/* Device the memory was allocated or registered for */
	struct rte_gpu *dev;
	/* CUDA context taken from dev->mpshared->info.context */
	CUcontext ctx;
	/* Lookup key of the entry, see get_hash_from_ptr() */
	cuda_ptr_key pkey;
	/* Kind of memory tracked by this entry */
	enum mem_type mtype;
	/* Handle filled by gdrcopy_pin() and consumed by gdrcopy_unpin() */
	gdr_mh_t mh;
	struct mem_entry *prev;
	struct mem_entry *next;
};
157 
/* First entry of the memory list, NULL while the list is empty */
static struct mem_entry *mem_alloc_list_head;
/* Last entry of the memory list; new entries are appended after it */
static struct mem_entry *mem_alloc_list_tail;
/* Number of entries currently in the list (despite the name, a counter, not an index) */
static uint32_t mem_alloc_list_last_elem;
161 
162 /* Load the CUDA symbols */
163 
164 static int
165 cuda_loader(void)
166 {
167 	char cuda_path[1024];
168 
169 	if (getenv("CUDA_PATH_L") == NULL)
170 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
171 	else
172 		snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
173 
174 	cudalib = dlopen(cuda_path, RTLD_LAZY);
175 	if (cudalib == NULL) {
176 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
177 				cuda_path, getenv("CUDA_PATH_L"));
178 		return -1;
179 	}
180 
181 	return 0;
182 }
183 
184 static int
185 cuda_sym_func_loader(void)
186 {
187 	if (cudalib == NULL)
188 		return -1;
189 
190 	sym_cuInit = dlsym(cudalib, "cuInit");
191 	if (sym_cuInit == NULL) {
192 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
193 		return -1;
194 	}
195 
196 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
197 	if (sym_cuDriverGetVersion == NULL) {
198 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
199 		return -1;
200 	}
201 
202 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
203 	if (sym_cuGetProcAddress == NULL) {
204 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
205 		return -1;
206 	}
207 
208 	return 0;
209 }
210 
211 static int
212 cuda_pfn_func_loader(void)
213 {
214 	CUresult res;
215 
216 	res = sym_cuGetProcAddress("cuGetErrorString",
217 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
218 	if (res != 0) {
219 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
220 		return -1;
221 	}
222 
223 	res = sym_cuGetProcAddress("cuGetErrorName",
224 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
225 	if (res != 0) {
226 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
227 		return -1;
228 	}
229 
230 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
231 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
232 	if (res != 0) {
233 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
234 		return -1;
235 	}
236 
237 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
238 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
239 	if (res != 0) {
240 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
241 		return -1;
242 	}
243 
244 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
245 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
246 	if (res != 0) {
247 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
248 		return -1;
249 	}
250 
251 	res = sym_cuGetProcAddress("cuDeviceGetName",
252 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
253 	if (res != 0) {
254 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
255 		return -1;
256 	}
257 
258 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
259 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
260 	if (res != 0) {
261 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
262 		return -1;
263 	}
264 
265 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
266 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
267 	if (res != 0) {
268 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
269 		return -1;
270 	}
271 
272 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
273 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
274 	if (res != 0) {
275 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
276 		return -1;
277 	}
278 
279 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
280 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
281 	if (res != 0) {
282 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
283 		return -1;
284 	}
285 
286 	res = sym_cuGetProcAddress("cuCtxGetDevice",
287 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
288 	if (res != 0) {
289 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
290 		return -1;
291 	}
292 
293 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
294 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
295 	if (res != 0) {
296 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
297 		return -1;
298 	}
299 
300 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
301 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
302 	if (res != 0) {
303 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
304 		return -1;
305 	}
306 
307 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
308 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
309 	if (res != 0) {
310 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
311 		return -1;
312 	}
313 
314 	res = sym_cuGetProcAddress("cuMemAlloc",
315 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
316 	if (res != 0) {
317 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
318 		return -1;
319 	}
320 
321 	res = sym_cuGetProcAddress("cuMemFree",
322 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
323 	if (res != 0) {
324 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
325 		return -1;
326 	}
327 
328 	res = sym_cuGetProcAddress("cuMemHostRegister",
329 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
330 	if (res != 0) {
331 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
332 		return -1;
333 	}
334 
335 	res = sym_cuGetProcAddress("cuMemHostUnregister",
336 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
337 	if (res != 0) {
338 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
339 		return -1;
340 	}
341 
342 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
343 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
344 	if (res != 0) {
345 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
346 		return -1;
347 	}
348 
349 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
350 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
351 	if (res != 0) {
352 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
353 		return -1;
354 	}
355 
356 	return 0;
357 }
358 
359 /* Generate a key from a memory pointer */
360 static cuda_ptr_key
361 get_hash_from_ptr(void *ptr)
362 {
363 	return (uintptr_t)ptr;
364 }
365 
366 static uint32_t
367 mem_list_count_item(void)
368 {
369 	return mem_alloc_list_last_elem;
370 }
371 
372 /* Initiate list of memory allocations if not done yet */
373 static struct mem_entry *
374 mem_list_add_item(void)
375 {
376 	/* Initiate list of memory allocations if not done yet */
377 	if (mem_alloc_list_head == NULL) {
378 		mem_alloc_list_head = rte_zmalloc(NULL,
379 				sizeof(struct mem_entry),
380 				RTE_CACHE_LINE_SIZE);
381 		if (mem_alloc_list_head == NULL) {
382 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
383 			return NULL;
384 		}
385 
386 		mem_alloc_list_head->next = NULL;
387 		mem_alloc_list_head->prev = NULL;
388 		mem_alloc_list_tail = mem_alloc_list_head;
389 	} else {
390 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
391 				sizeof(struct mem_entry),
392 				RTE_CACHE_LINE_SIZE);
393 
394 		if (mem_alloc_list_cur == NULL) {
395 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
396 			return NULL;
397 		}
398 
399 		mem_alloc_list_tail->next = mem_alloc_list_cur;
400 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
401 		mem_alloc_list_tail = mem_alloc_list_tail->next;
402 		mem_alloc_list_tail->next = NULL;
403 	}
404 
405 	mem_alloc_list_last_elem++;
406 
407 	return mem_alloc_list_tail;
408 }
409 
410 static struct mem_entry *
411 mem_list_find_item(cuda_ptr_key pk)
412 {
413 	struct mem_entry *mem_alloc_list_cur = NULL;
414 
415 	if (mem_alloc_list_head == NULL) {
416 		rte_cuda_log(ERR, "Memory list doesn't exist");
417 		return NULL;
418 	}
419 
420 	if (mem_list_count_item() == 0) {
421 		rte_cuda_log(ERR, "No items in memory list");
422 		return NULL;
423 	}
424 
425 	mem_alloc_list_cur = mem_alloc_list_head;
426 
427 	while (mem_alloc_list_cur != NULL) {
428 		if (mem_alloc_list_cur->pkey == pk)
429 			return mem_alloc_list_cur;
430 		mem_alloc_list_cur = mem_alloc_list_cur->next;
431 	}
432 
433 	return mem_alloc_list_cur;
434 }
435 
436 static int
437 mem_list_del_item(cuda_ptr_key pk)
438 {
439 	struct mem_entry *mem_alloc_list_cur = NULL;
440 
441 	mem_alloc_list_cur = mem_list_find_item(pk);
442 	if (mem_alloc_list_cur == NULL)
443 		return -EINVAL;
444 
445 	/* if key is in head */
446 	if (mem_alloc_list_cur->prev == NULL) {
447 		mem_alloc_list_head = mem_alloc_list_cur->next;
448 		if (mem_alloc_list_head != NULL)
449 			mem_alloc_list_head->prev = NULL;
450 	} else {
451 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
452 		if (mem_alloc_list_cur->next != NULL)
453 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
454 	}
455 
456 	rte_free(mem_alloc_list_cur);
457 
458 	mem_alloc_list_last_elem--;
459 
460 	return 0;
461 }
462 
463 static int
464 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
465 {
466 	int ret = 0;
467 	CUresult res;
468 	struct rte_gpu_info parent_info;
469 	CUexecAffinityParam affinityPrm;
470 	const char *err_string;
471 	struct cuda_info *private;
472 	CUcontext current_ctx;
473 	CUcontext input_ctx;
474 
475 	if (dev == NULL) {
476 		rte_errno = ENODEV;
477 		return -rte_errno;
478 	}
479 
480 	/* Child initialization time probably called by rte_gpu_add_child() */
481 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
482 			dev->mpshared->dev_private == NULL) {
483 		/* Store current ctx */
484 		res = pfn_cuCtxGetCurrent(&current_ctx);
485 		if (res != 0) {
486 			pfn_cuGetErrorString(res, &(err_string));
487 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
488 					err_string);
489 			rte_errno = EPERM;
490 			return -rte_errno;
491 		}
492 
493 		/* Set child ctx as current ctx */
494 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
495 		res = pfn_cuCtxSetCurrent(input_ctx);
496 		if (res != 0) {
497 			pfn_cuGetErrorString(res, &(err_string));
498 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
499 					err_string);
500 			rte_errno = EPERM;
501 			return -rte_errno;
502 		}
503 
504 		/*
505 		 * Ctx capacity info
506 		 */
507 
508 		/* MPS compatible */
509 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
510 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
511 		if (res != 0) {
512 			pfn_cuGetErrorString(res, &(err_string));
513 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
514 					err_string);
515 		}
516 		dev->mpshared->info.processor_count =
517 				(uint32_t)affinityPrm.param.smCount.val;
518 
519 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
520 		if (ret) {
521 			rte_errno = ENODEV;
522 			return -rte_errno;
523 		}
524 		dev->mpshared->info.total_memory = parent_info.total_memory;
525 
526 		/*
527 		 * GPU Device private info
528 		 */
529 		dev->mpshared->dev_private = rte_zmalloc(NULL,
530 				sizeof(struct cuda_info),
531 				RTE_CACHE_LINE_SIZE);
532 		if (dev->mpshared->dev_private == NULL) {
533 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
534 			rte_errno = EPERM;
535 			return -rte_errno;
536 		}
537 
538 		private = (struct cuda_info *)dev->mpshared->dev_private;
539 
540 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
541 		if (res != 0) {
542 			pfn_cuGetErrorString(res, &(err_string));
543 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
544 					err_string);
545 			rte_errno = EPERM;
546 			return -rte_errno;
547 		}
548 
549 		res = pfn_cuDeviceGetName(private->gpu_name,
550 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
551 		if (res != 0) {
552 			pfn_cuGetErrorString(res, &(err_string));
553 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
554 					err_string);
555 			rte_errno = EPERM;
556 			return -rte_errno;
557 		}
558 
559 		/* Restore original ctx as current ctx */
560 		res = pfn_cuCtxSetCurrent(current_ctx);
561 		if (res != 0) {
562 			pfn_cuGetErrorString(res, &(err_string));
563 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
564 					err_string);
565 			rte_errno = EPERM;
566 			return -rte_errno;
567 		}
568 	}
569 
570 	*info = dev->mpshared->info;
571 
572 	return 0;
573 }
574 
575 /*
576  * GPU Memory
577  */
578 
579 static int
580 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
581 {
582 	CUresult res;
583 	const char *err_string;
584 	CUcontext current_ctx;
585 	CUcontext input_ctx;
586 	unsigned int flag = 1;
587 
588 	if (dev == NULL)
589 		return -ENODEV;
590 
591 	/* Store current ctx */
592 	res = pfn_cuCtxGetCurrent(&current_ctx);
593 	if (res != 0) {
594 		pfn_cuGetErrorString(res, &(err_string));
595 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
596 				err_string);
597 		rte_errno = EPERM;
598 		return -rte_errno;
599 	}
600 
601 	/* Set child ctx as current ctx */
602 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
603 	res = pfn_cuCtxSetCurrent(input_ctx);
604 	if (res != 0) {
605 		pfn_cuGetErrorString(res, &(err_string));
606 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
607 				err_string);
608 		rte_errno = EPERM;
609 		return -rte_errno;
610 	}
611 
612 	/* Get next memory list item */
613 	mem_alloc_list_tail = mem_list_add_item();
614 	if (mem_alloc_list_tail == NULL) {
615 		rte_errno = EPERM;
616 		return -rte_errno;
617 	}
618 
619 	/* Allocate memory */
620 	mem_alloc_list_tail->size = size;
621 	mem_alloc_list_tail->size_orig = size + align;
622 
623 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
624 			mem_alloc_list_tail->size_orig);
625 	if (res != 0) {
626 		pfn_cuGetErrorString(res, &(err_string));
627 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
628 				err_string);
629 		rte_errno = EPERM;
630 		return -rte_errno;
631 	}
632 
633 	/* Align memory address */
634 	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
635 	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
636 		mem_alloc_list_tail->ptr_d += (align -
637 				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
638 
639 	/* GPUDirect RDMA attribute required */
640 	res = pfn_cuPointerSetAttribute(&flag,
641 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
642 			mem_alloc_list_tail->ptr_d);
643 	if (res != 0) {
644 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
645 				"GPU memory at  %"PRIu32", err %d",
646 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
647 		rte_errno = EPERM;
648 		return -rte_errno;
649 	}
650 
651 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
652 	mem_alloc_list_tail->ptr_h = NULL;
653 	mem_alloc_list_tail->dev = dev;
654 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
655 	mem_alloc_list_tail->mtype = GPU_MEM;
656 
657 	/* Restore original ctx as current ctx */
658 	res = pfn_cuCtxSetCurrent(current_ctx);
659 	if (res != 0) {
660 		pfn_cuGetErrorString(res, &(err_string));
661 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
662 				err_string);
663 		rte_errno = EPERM;
664 		return -rte_errno;
665 	}
666 
667 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
668 
669 	return 0;
670 }
671 
672 static int
673 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
674 {
675 	CUresult res;
676 	const char *err_string;
677 	CUcontext current_ctx;
678 	CUcontext input_ctx;
679 	unsigned int flag = 1;
680 	int use_ptr_h = 0;
681 
682 	if (dev == NULL)
683 		return -ENODEV;
684 
685 	/* Store current ctx */
686 	res = pfn_cuCtxGetCurrent(&current_ctx);
687 	if (res != 0) {
688 		pfn_cuGetErrorString(res, &(err_string));
689 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
690 				err_string);
691 		rte_errno = EPERM;
692 		return -rte_errno;
693 	}
694 
695 	/* Set child ctx as current ctx */
696 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
697 	res = pfn_cuCtxSetCurrent(input_ctx);
698 	if (res != 0) {
699 		pfn_cuGetErrorString(res, &(err_string));
700 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
701 				err_string);
702 		rte_errno = EPERM;
703 		return -rte_errno;
704 	}
705 
706 	/* Get next memory list item */
707 	mem_alloc_list_tail = mem_list_add_item();
708 	if (mem_alloc_list_tail == NULL) {
709 		rte_errno = EPERM;
710 		return -rte_errno;
711 	}
712 
713 	/* Allocate memory */
714 	mem_alloc_list_tail->size = size;
715 	mem_alloc_list_tail->ptr_h = ptr;
716 
717 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
718 			mem_alloc_list_tail->size,
719 			CU_MEMHOSTREGISTER_PORTABLE |
720 			CU_MEMHOSTREGISTER_DEVICEMAP);
721 	if (res != 0) {
722 		pfn_cuGetErrorString(res, &(err_string));
723 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
724 				err_string,
725 				mem_alloc_list_tail->ptr_h,
726 				mem_alloc_list_tail->size);
727 		rte_errno = EPERM;
728 		return -rte_errno;
729 	}
730 
731 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
732 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
733 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
734 	if (res != 0) {
735 		pfn_cuGetErrorString(res, &(err_string));
736 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
737 				err_string);
738 		rte_errno = EPERM;
739 		return -rte_errno;
740 	}
741 
742 	if (use_ptr_h == 0) {
743 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
744 				mem_alloc_list_tail->ptr_h, 0);
745 		if (res != 0) {
746 			pfn_cuGetErrorString(res, &(err_string));
747 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
748 					err_string);
749 			rte_errno = EPERM;
750 			return -rte_errno;
751 		}
752 
753 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
754 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
755 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
756 			rte_errno = ENOTSUP;
757 			return -rte_errno;
758 		}
759 	} else {
760 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
761 	}
762 
763 	/* GPUDirect RDMA attribute required */
764 	res = pfn_cuPointerSetAttribute(&flag,
765 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
766 			mem_alloc_list_tail->ptr_d);
767 	if (res != 0) {
768 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
769 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
770 		rte_errno = EPERM;
771 		return -rte_errno;
772 	}
773 
774 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
775 	mem_alloc_list_tail->size = size;
776 	mem_alloc_list_tail->dev = dev;
777 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
778 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
779 	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
780 
781 	/* Restore original ctx as current ctx */
782 	res = pfn_cuCtxSetCurrent(current_ctx);
783 	if (res != 0) {
784 		pfn_cuGetErrorString(res, &(err_string));
785 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
786 				err_string);
787 		rte_errno = EPERM;
788 		return -rte_errno;
789 	}
790 
791 	return 0;
792 }
793 
794 static int
795 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
796 {
797 	struct mem_entry *mem_item;
798 	cuda_ptr_key hk;
799 
800 	if (dev == NULL)
801 		return -ENODEV;
802 
803 	hk = get_hash_from_ptr((void *)ptr_in);
804 
805 	mem_item = mem_list_find_item(hk);
806 	if (mem_item == NULL) {
807 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
808 		rte_errno = EPERM;
809 		return -rte_errno;
810 	}
811 
812 	if (mem_item->mtype != GPU_MEM) {
813 		rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
814 		rte_errno = EPERM;
815 		return -rte_errno;
816 	}
817 
818 	if (mem_item->size != size)
819 		rte_cuda_log(WARNING,
820 				"Can't expose memory area with size (%zd) different from original size (%zd).",
821 				size, mem_item->size);
822 
823 	if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
824 					mem_item->size, &(mem_item->ptr_h))) {
825 		rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
826 		rte_errno = EPERM;
827 		return -rte_errno;
828 	}
829 
830 	*ptr_out = mem_item->ptr_h;
831 
832 	return 0;
833 }
834 
835 static int
836 cuda_mem_free(struct rte_gpu *dev, void *ptr)
837 {
838 	CUresult res;
839 	struct mem_entry *mem_item;
840 	const char *err_string;
841 	cuda_ptr_key hk;
842 
843 	if (dev == NULL)
844 		return -ENODEV;
845 
846 	hk = get_hash_from_ptr((void *)ptr);
847 
848 	mem_item = mem_list_find_item(hk);
849 	if (mem_item == NULL) {
850 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
851 		rte_errno = EPERM;
852 		return -rte_errno;
853 	}
854 
855 	if (mem_item->mtype == GPU_MEM) {
856 		res = pfn_cuMemFree(mem_item->ptr_orig_d);
857 		if (res != 0) {
858 			pfn_cuGetErrorString(res, &(err_string));
859 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
860 					err_string);
861 			rte_errno = EPERM;
862 			return -rte_errno;
863 		}
864 
865 		return mem_list_del_item(hk);
866 	}
867 
868 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
869 
870 	return -EPERM;
871 }
872 
873 static int
874 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
875 {
876 	CUresult res;
877 	struct mem_entry *mem_item;
878 	const char *err_string;
879 	cuda_ptr_key hk;
880 
881 	if (dev == NULL)
882 		return -ENODEV;
883 
884 	hk = get_hash_from_ptr((void *)ptr);
885 
886 	mem_item = mem_list_find_item(hk);
887 	if (mem_item == NULL) {
888 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
889 		rte_errno = EPERM;
890 		return -rte_errno;
891 	}
892 
893 	if (mem_item->mtype == CPU_REGISTERED) {
894 		res = pfn_cuMemHostUnregister(ptr);
895 		if (res != 0) {
896 			pfn_cuGetErrorString(res, &(err_string));
897 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
898 					err_string);
899 			rte_errno = EPERM;
900 			return -rte_errno;
901 		}
902 
903 		return mem_list_del_item(hk);
904 	}
905 
906 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
907 
908 	rte_errno = EPERM;
909 	return -rte_errno;
910 }
911 
912 static int
913 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
914 {
915 	struct mem_entry *mem_item;
916 	cuda_ptr_key hk;
917 
918 	if (dev == NULL)
919 		return -ENODEV;
920 
921 	hk = get_hash_from_ptr((void *)ptr_in);
922 
923 	mem_item = mem_list_find_item(hk);
924 	if (mem_item == NULL) {
925 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
926 		rte_errno = EPERM;
927 		return -rte_errno;
928 	}
929 
930 	if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
931 			mem_item->size)) {
932 		rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
933 		rte_errno = EPERM;
934 		return -rte_errno;
935 	}
936 
937 	return 0;
938 }
939 
940 static int
941 cuda_dev_close(struct rte_gpu *dev)
942 {
943 	if (dev == NULL)
944 		return -EINVAL;
945 
946 	rte_free(dev->mpshared->dev_private);
947 
948 	return 0;
949 }
950 
951 static int
952 cuda_wmb(struct rte_gpu *dev)
953 {
954 	CUresult res;
955 	const char *err_string;
956 	CUcontext current_ctx;
957 	CUcontext input_ctx;
958 	struct cuda_info *private;
959 
960 	if (dev == NULL) {
961 		rte_errno = ENODEV;
962 		return -rte_errno;
963 	}
964 
965 	private = (struct cuda_info *)dev->mpshared->dev_private;
966 
967 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
968 		/*
969 		 * No need to explicitly force the write ordering because
970 		 * the device natively supports it
971 		 */
972 		return 0;
973 	}
974 
975 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
976 		/*
977 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
978 		 * Application needs to use alternative methods.
979 		 */
980 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
981 				"Application needs to use alternative methods.");
982 
983 		rte_errno = ENOTSUP;
984 		return -rte_errno;
985 	}
986 
987 	/* Store current ctx */
988 	res = pfn_cuCtxGetCurrent(&current_ctx);
989 	if (res != 0) {
990 		pfn_cuGetErrorString(res, &(err_string));
991 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
992 				err_string);
993 		rte_errno = EPERM;
994 		return -rte_errno;
995 	}
996 
997 	/* Set child ctx as current ctx */
998 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
999 	res = pfn_cuCtxSetCurrent(input_ctx);
1000 	if (res != 0) {
1001 		pfn_cuGetErrorString(res, &(err_string));
1002 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1003 				err_string);
1004 		rte_errno = EPERM;
1005 		return -rte_errno;
1006 	}
1007 
1008 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1009 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1010 	if (res != 0) {
1011 		pfn_cuGetErrorString(res, &(err_string));
1012 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1013 				err_string);
1014 		rte_errno = EPERM;
1015 		return -rte_errno;
1016 	}
1017 
1018 	/* Restore original ctx as current ctx */
1019 	res = pfn_cuCtxSetCurrent(current_ctx);
1020 	if (res != 0) {
1021 		pfn_cuGetErrorString(res, &(err_string));
1022 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1023 				err_string);
1024 		rte_errno = EPERM;
1025 		return -rte_errno;
1026 	}
1027 
1028 	return 0;
1029 }
1030 
1031 static int
1032 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1033 {
1034 	struct rte_gpu *dev = NULL;
1035 	CUresult res;
1036 	CUdevice cu_dev_id;
1037 	CUcontext pctx;
1038 	char dev_name[RTE_DEV_NAME_MAX_LEN];
1039 	const char *err_string;
1040 	int processor_count = 0;
1041 	struct cuda_info *private;
1042 
1043 	if (pci_dev == NULL) {
1044 		rte_cuda_log(ERR, "NULL PCI device");
1045 		rte_errno = ENODEV;
1046 		return -rte_errno;
1047 	}
1048 
1049 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1050 
1051 	/* Allocate memory to be used privately by drivers */
1052 	dev = rte_gpu_allocate(pci_dev->device.name);
1053 	if (dev == NULL) {
1054 		rte_errno = ENODEV;
1055 		return -rte_errno;
1056 	}
1057 
1058 	/* Initialize values only for the first CUDA driver call */
1059 	if (dev->mpshared->info.dev_id == 0) {
1060 		mem_alloc_list_head = NULL;
1061 		mem_alloc_list_tail = NULL;
1062 		mem_alloc_list_last_elem = 0;
1063 
1064 		/* Load libcuda.so library */
1065 		if (cuda_loader()) {
1066 			rte_cuda_log(ERR, "CUDA Driver library not found");
1067 			rte_errno = ENOTSUP;
1068 			return -rte_errno;
1069 		}
1070 
1071 		/* Load initial CUDA functions */
1072 		if (cuda_sym_func_loader()) {
1073 			rte_cuda_log(ERR, "CUDA functions not found in library");
1074 			rte_errno = ENOTSUP;
1075 			return -rte_errno;
1076 		}
1077 
1078 		/*
1079 		 * Required to initialize the CUDA Driver.
1080 		 * Multiple calls of cuInit() will return immediately
1081 		 * without making any relevant change
1082 		 */
1083 		sym_cuInit(0);
1084 
1085 		res = sym_cuDriverGetVersion(&cuda_driver_version);
1086 		if (res != 0) {
1087 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1088 			rte_errno = ENOTSUP;
1089 			return -rte_errno;
1090 		}
1091 
1092 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1093 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1094 					"Minimum requirement is %d",
1095 					cuda_driver_version,
1096 					CUDA_DRIVER_MIN_VERSION);
1097 			rte_errno = ENOTSUP;
1098 			return -rte_errno;
1099 		}
1100 
1101 		if (cuda_pfn_func_loader()) {
1102 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1103 			rte_errno = ENOTSUP;
1104 			return -rte_errno;
1105 		}
1106 
1107 		gdrc_h = NULL;
1108 	}
1109 
1110 	/* Fill HW specific part of device structure */
1111 	dev->device = &pci_dev->device;
1112 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1113 
1114 	/* Get NVIDIA GPU Device descriptor */
1115 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1116 	if (res != 0) {
1117 		pfn_cuGetErrorString(res, &(err_string));
1118 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1119 				dev->device->name, res, err_string);
1120 		rte_errno = EPERM;
1121 		return -rte_errno;
1122 	}
1123 
1124 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1125 	if (res != 0) {
1126 		pfn_cuGetErrorString(res, &(err_string));
1127 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1128 				dev->device->name, res, err_string);
1129 		rte_errno = EPERM;
1130 		return -rte_errno;
1131 	}
1132 
1133 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1134 	if (res != 0) {
1135 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1136 		rte_errno = ENOTSUP;
1137 		return -rte_errno;
1138 	}
1139 
1140 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1141 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1142 				cuda_api_version, CUDA_API_MIN_VERSION);
1143 		rte_errno = ENOTSUP;
1144 		return -rte_errno;
1145 	}
1146 
1147 	dev->mpshared->info.context = (uint64_t)pctx;
1148 
1149 	/*
1150 	 * GPU Device generic info
1151 	 */
1152 
1153 	/* Processor count */
1154 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1155 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1156 			cu_dev_id);
1157 	if (res != 0) {
1158 		pfn_cuGetErrorString(res, &(err_string));
1159 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1160 				err_string);
1161 		rte_errno = EPERM;
1162 		return -rte_errno;
1163 	}
1164 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1165 
1166 	/* Total memory */
1167 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1168 	if (res != 0) {
1169 		pfn_cuGetErrorString(res, &(err_string));
1170 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1171 				err_string);
1172 		rte_errno = EPERM;
1173 		return -rte_errno;
1174 	}
1175 
1176 	/*
1177 	 * GPU Device private info
1178 	 */
1179 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1180 			sizeof(struct cuda_info),
1181 			RTE_CACHE_LINE_SIZE);
1182 	if (dev->mpshared->dev_private == NULL) {
1183 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1184 		rte_errno = EPERM;
1185 		return -rte_errno;
1186 	}
1187 
1188 	private = (struct cuda_info *)dev->mpshared->dev_private;
1189 	private->cu_dev = cu_dev_id;
1190 	res = pfn_cuDeviceGetName(private->gpu_name,
1191 			RTE_DEV_NAME_MAX_LEN,
1192 			cu_dev_id);
1193 	if (res != 0) {
1194 		pfn_cuGetErrorString(res, &(err_string));
1195 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1196 				err_string);
1197 		rte_errno = EPERM;
1198 		return -rte_errno;
1199 	}
1200 
1201 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1202 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1203 			cu_dev_id);
1204 	if (res != 0) {
1205 		pfn_cuGetErrorString(res, &(err_string));
1206 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1207 				err_string);
1208 		rte_errno = EPERM;
1209 		return -rte_errno;
1210 	}
1211 
1212 	if (private->gdr_supported == 0)
1213 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1214 				pci_dev->device.name);
1215 
1216 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1217 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1218 			cu_dev_id);
1219 	if (res != 0) {
1220 		pfn_cuGetErrorString(res, &(err_string));
1221 		rte_cuda_log(ERR,
1222 				"cuDeviceGetAttribute failed with %s",
1223 				err_string);
1224 		rte_errno = EPERM;
1225 		return -rte_errno;
1226 	}
1227 
1228 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1229 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1230 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1231 				cu_dev_id);
1232 		if (res != 0) {
1233 			pfn_cuGetErrorString(res, &(err_string));
1234 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1235 					err_string);
1236 			rte_errno = EPERM;
1237 			return -rte_errno;
1238 		}
1239 
1240 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1241 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1242 	}
1243 
1244 	dev->ops.dev_info_get = cuda_dev_info_get;
1245 	dev->ops.dev_close = cuda_dev_close;
1246 	dev->ops.mem_alloc = cuda_mem_alloc;
1247 	dev->ops.mem_free = cuda_mem_free;
1248 	dev->ops.mem_register = cuda_mem_register;
1249 	dev->ops.mem_unregister = cuda_mem_unregister;
1250 	dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1251 	dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1252 	dev->ops.wmb = cuda_wmb;
1253 
1254 	rte_gpu_complete_new(dev);
1255 
1256 	rte_cuda_debug("dev id = %u name = %s",
1257 			dev->mpshared->info.dev_id, private->gpu_name);
1258 
1259 	return 0;
1260 }
1261 
1262 static int
1263 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1264 {
1265 	struct rte_gpu *dev;
1266 	int ret;
1267 	uint8_t gpu_id;
1268 
1269 	if (pci_dev == NULL) {
1270 		rte_errno = ENODEV;
1271 		return -rte_errno;
1272 	}
1273 
1274 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1275 	if (dev == NULL) {
1276 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1277 				pci_dev->device.name);
1278 		rte_errno = ENODEV;
1279 		return -rte_errno;
1280 	}
1281 	gpu_id = dev->mpshared->info.dev_id;
1282 
1283 	/* release dev from library */
1284 	ret = rte_gpu_release(dev);
1285 	if (ret)
1286 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1287 
1288 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1289 
1290 	return 0;
1291 }
1292 
1293 static struct rte_pci_driver rte_cuda_driver = {
1294 	.id_table = pci_id_cuda_map,
1295 	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1296 	.probe = cuda_gpu_probe,
1297 	.remove = cuda_gpu_remove,
1298 };
1299 
1300 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1301 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1302 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1303