xref: /dpdk/drivers/gpu/cuda/cuda.c (revision b98d1dc0fb0e49b94858501d88512e6354b0e9de)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_malloc.h>
8 #include <rte_pci.h>
9 #include <rte_bus_pci.h>
10 #include <rte_byteorder.h>
11 #include <rte_dev.h>
12 
13 #include <gpudev_driver.h>
14 
15 #include <cuda.h>
16 #include <cudaTypedefs.h>
17 
18 #include "common.h"
19 
/*
 * Minimum CUDA driver version accepted at probe time; it is compared against
 * the value returned by cuDriverGetVersion(). CUDA encodes versions as
 * 1000 * major + 10 * minor, so 11040 stands for CUDA 11.4.
 */
#define CUDA_DRIVER_MIN_VERSION 11040
/* NOTE(review): not referenced in the visible part of this file - confirm usage */
#define CUDA_API_MIN_VERSION 3020
22 
/*
 * CUDA Driver functions loaded with dlsym().
 *
 * These three entry points are resolved by cuda_sym_func_loader() from the
 * library handle stored in cudalib. sym_cuGetProcAddress is then used by
 * cuda_pfn_func_loader() to resolve every other driver function.
 */
static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
		void **pfn, int cudaVersion, uint64_t flags);
28 
/*
 * CUDA Driver functions loaded with cuGetProcAddress for versioning.
 *
 * Each pointer is filled by cuda_pfn_func_loader(), which requests the symbol
 * for the driver version stored in cuda_driver_version. They must not be
 * called before that loader has succeeded.
 */
static PFN_cuGetErrorString pfn_cuGetErrorString;
static PFN_cuGetErrorName pfn_cuGetErrorName;
static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
static PFN_cuDeviceGetName pfn_cuDeviceGetName;
static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
static PFN_cuMemAlloc pfn_cuMemAlloc;
static PFN_cuMemFree pfn_cuMemFree;
static PFN_cuMemHostRegister pfn_cuMemHostRegister;
static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
50 
/* Handle returned by dlopen() for the CUDA driver library, set by cuda_loader() */
static void *cudalib;
/* NOTE(review): never written or read in the visible part of this file - confirm usage */
static unsigned int cuda_api_version;
/* Driver version reported by cuDriverGetVersion(); passed to cuGetProcAddress */
static int cuda_driver_version;
/* gdrcopy handle used by cuda_mem_cpu_map() and cuda_mem_cpu_unmap() */
static gdr_t gdrc_h;
55 
/* NVIDIA GPU vendor (PCI vendor ID) */
#define NVIDIA_GPU_VENDOR_ID (0x10de)

/* NVIDIA GPU device IDs (PCI device IDs listed in pci_id_cuda_map) */
#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
#define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)

#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)

#define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
#define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)

#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * an upper bound on the number of tracked allocations - confirm.
 */
#define CUDA_MAX_ALLOCATION_NUM 512

/* GPU page size expressed as a shift and in bytes: 1 << 16 = 64 KiB */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
77 
/* Register the driver log type cuda_logtype with a default level of NOTICE */
RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
79 
/*
 * NVIDIA GPU address map.
 *
 * PCI vendor/device ID pairs of the GPUs listed by this driver, one entry per
 * NVIDIA_GPU_*_DEVICE_ID macro.
 */
static const struct rte_pci_id pci_id_cuda_map[] = {
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_40GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_80GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A30_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A10_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_16GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_T4_16GB_DEVICE_ID)
	},
	/*
	 * All-zero last entry (only device_id is spelled out, the other fields
	 * are implicitly zero). Presumably the end-of-table marker expected by
	 * the PCI bus code - confirm.
	 */
	{
		.device_id = 0
	}
};
122 
/*
 * Device private info.
 *
 * One instance is stored in dev->mpshared->dev_private for each GPU device.
 */
struct cuda_info {
	/* Device name as returned by cuDeviceGetName() */
	char gpu_name[RTE_DEV_NAME_MAX_LEN];
	/* CUDA device descriptor */
	CUdevice cu_dev;
	/* NOTE(review): not read in the visible part of this file - presumably a GPUDirect RDMA capability flag */
	int gdr_supported;
	/* Compared with CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE in cuda_wmb() */
	int gdr_write_ordering;
	/* Compared with CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST in cuda_wmb() */
	int gdr_flush_type;
};
131 
/* Type of memory tracked by an entry of the driver memory list */
enum mem_type {
	/* Device memory obtained with cuMemAlloc() in cuda_mem_alloc() */
	GPU_MEM = 0,
	/* Host memory registered with cuMemHostRegister() in cuda_mem_register() */
	CPU_REGISTERED,
	GPU_REGISTERED /* Not used yet */
};
138 
/*
 * Key associated to a memory address.
 * It is the address itself converted to an integer (see get_hash_from_ptr())
 * and is used to look entries up in the memory list.
 */
typedef uintptr_t cuda_ptr_key;
141 
/* Single entry of the memory list (doubly linked through prev/next) */
struct mem_entry {
	/* Device address handed to the application (aligned for GPU_MEM) */
	CUdeviceptr ptr_d;
	/* Device address as returned by cuMemAlloc(); the one given to cuMemFree() */
	CUdeviceptr ptr_orig_d;
	/* Host address: registered buffer, or CPU mapping set by cuda_mem_cpu_map() */
	void *ptr_h;
	/* Size requested by the caller */
	size_t size;
	/* Size actually allocated (size + align) for GPU_MEM entries */
	size_t size_orig;
	/* Device owning the memory area */
	struct rte_gpu *dev;
	/* CUDA context of the device at allocation/registration time */
	CUcontext ctx;
	/* Lookup key, see get_hash_from_ptr() */
	cuda_ptr_key pkey;
	/* Kind of memory tracked by this entry */
	enum mem_type mtype;
	/* gdrcopy handle used by cuda_mem_cpu_map()/cuda_mem_cpu_unmap() */
	gdr_mh_t mh;
	struct mem_entry *prev;
	struct mem_entry *next;
};
157 
/* First entry of the memory list, NULL when the list is empty */
static struct mem_entry *mem_alloc_list_head;
/* Last entry of the memory list; new entries are appended after it */
static struct mem_entry *mem_alloc_list_tail;
/* Number of entries in the list (incremented on add, decremented on delete) */
static uint32_t mem_alloc_list_last_elem;
161 
162 /* Load the CUDA symbols */
163 
164 static int
165 cuda_loader(void)
166 {
167 	char cuda_path[1024];
168 
169 	if (getenv("CUDA_PATH_L") == NULL)
170 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
171 	else
172 		snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
173 
174 	cudalib = dlopen(cuda_path, RTLD_LAZY);
175 	if (cudalib == NULL) {
176 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
177 				cuda_path, getenv("CUDA_PATH_L"));
178 		return -1;
179 	}
180 
181 	return 0;
182 }
183 
184 static int
185 cuda_sym_func_loader(void)
186 {
187 	if (cudalib == NULL)
188 		return -1;
189 
190 	sym_cuInit = dlsym(cudalib, "cuInit");
191 	if (sym_cuInit == NULL) {
192 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
193 		return -1;
194 	}
195 
196 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
197 	if (sym_cuDriverGetVersion == NULL) {
198 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
199 		return -1;
200 	}
201 
202 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
203 	if (sym_cuGetProcAddress == NULL) {
204 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
205 		return -1;
206 	}
207 
208 	return 0;
209 }
210 
211 static int
212 cuda_pfn_func_loader(void)
213 {
214 	CUresult res;
215 
216 	res = sym_cuGetProcAddress("cuGetErrorString",
217 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
218 	if (res != 0) {
219 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
220 		return -1;
221 	}
222 
223 	res = sym_cuGetProcAddress("cuGetErrorName",
224 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
225 	if (res != 0) {
226 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
227 		return -1;
228 	}
229 
230 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
231 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
232 	if (res != 0) {
233 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
234 		return -1;
235 	}
236 
237 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
238 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
239 	if (res != 0) {
240 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
241 		return -1;
242 	}
243 
244 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
245 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
246 	if (res != 0) {
247 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
248 		return -1;
249 	}
250 
251 	res = sym_cuGetProcAddress("cuDeviceGetName",
252 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
253 	if (res != 0) {
254 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
255 		return -1;
256 	}
257 
258 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
259 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
260 	if (res != 0) {
261 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
262 		return -1;
263 	}
264 
265 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
266 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
267 	if (res != 0) {
268 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
269 		return -1;
270 	}
271 
272 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
273 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
274 	if (res != 0) {
275 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
276 		return -1;
277 	}
278 
279 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
280 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
281 	if (res != 0) {
282 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
283 		return -1;
284 	}
285 
286 	res = sym_cuGetProcAddress("cuCtxGetDevice",
287 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
288 	if (res != 0) {
289 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
290 		return -1;
291 	}
292 
293 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
294 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
295 	if (res != 0) {
296 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
297 		return -1;
298 	}
299 
300 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
301 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
302 	if (res != 0) {
303 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
304 		return -1;
305 	}
306 
307 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
308 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
309 	if (res != 0) {
310 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
311 		return -1;
312 	}
313 
314 	res = sym_cuGetProcAddress("cuMemAlloc",
315 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
316 	if (res != 0) {
317 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
318 		return -1;
319 	}
320 
321 	res = sym_cuGetProcAddress("cuMemFree",
322 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
323 	if (res != 0) {
324 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
325 		return -1;
326 	}
327 
328 	res = sym_cuGetProcAddress("cuMemHostRegister",
329 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
330 	if (res != 0) {
331 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
332 		return -1;
333 	}
334 
335 	res = sym_cuGetProcAddress("cuMemHostUnregister",
336 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
337 	if (res != 0) {
338 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
339 		return -1;
340 	}
341 
342 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
343 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
344 	if (res != 0) {
345 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
346 		return -1;
347 	}
348 
349 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
350 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
351 	if (res != 0) {
352 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
353 		return -1;
354 	}
355 
356 	return 0;
357 }
358 
359 /* Generate a key from a memory pointer */
360 static cuda_ptr_key
361 get_hash_from_ptr(void *ptr)
362 {
363 	return (uintptr_t)ptr;
364 }
365 
366 static uint32_t
367 mem_list_count_item(void)
368 {
369 	return mem_alloc_list_last_elem;
370 }
371 
372 /* Initiate list of memory allocations if not done yet */
373 static struct mem_entry *
374 mem_list_add_item(void)
375 {
376 	/* Initiate list of memory allocations if not done yet */
377 	if (mem_alloc_list_head == NULL) {
378 		mem_alloc_list_head = rte_zmalloc(NULL,
379 				sizeof(struct mem_entry),
380 				RTE_CACHE_LINE_SIZE);
381 		if (mem_alloc_list_head == NULL) {
382 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
383 			return NULL;
384 		}
385 
386 		mem_alloc_list_head->next = NULL;
387 		mem_alloc_list_head->prev = NULL;
388 		mem_alloc_list_tail = mem_alloc_list_head;
389 	} else {
390 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
391 				sizeof(struct mem_entry),
392 				RTE_CACHE_LINE_SIZE);
393 
394 		if (mem_alloc_list_cur == NULL) {
395 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
396 			return NULL;
397 		}
398 
399 		mem_alloc_list_tail->next = mem_alloc_list_cur;
400 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
401 		mem_alloc_list_tail = mem_alloc_list_tail->next;
402 		mem_alloc_list_tail->next = NULL;
403 	}
404 
405 	mem_alloc_list_last_elem++;
406 
407 	return mem_alloc_list_tail;
408 }
409 
410 static struct mem_entry *
411 mem_list_find_item(cuda_ptr_key pk)
412 {
413 	struct mem_entry *mem_alloc_list_cur = NULL;
414 
415 	if (mem_alloc_list_head == NULL) {
416 		rte_cuda_log(ERR, "Memory list doesn't exist");
417 		return NULL;
418 	}
419 
420 	if (mem_list_count_item() == 0) {
421 		rte_cuda_log(ERR, "No items in memory list");
422 		return NULL;
423 	}
424 
425 	mem_alloc_list_cur = mem_alloc_list_head;
426 
427 	while (mem_alloc_list_cur != NULL) {
428 		if (mem_alloc_list_cur->pkey == pk)
429 			return mem_alloc_list_cur;
430 		mem_alloc_list_cur = mem_alloc_list_cur->next;
431 	}
432 
433 	return mem_alloc_list_cur;
434 }
435 
436 static int
437 mem_list_del_item(cuda_ptr_key pk)
438 {
439 	struct mem_entry *mem_alloc_list_cur = NULL;
440 
441 	mem_alloc_list_cur = mem_list_find_item(pk);
442 	if (mem_alloc_list_cur == NULL)
443 		return -EINVAL;
444 
445 	/* if key is in head */
446 	if (mem_alloc_list_cur->prev == NULL) {
447 		mem_alloc_list_head = mem_alloc_list_cur->next;
448 		if (mem_alloc_list_head != NULL)
449 			mem_alloc_list_head->prev = NULL;
450 	} else {
451 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
452 		if (mem_alloc_list_cur->next != NULL)
453 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
454 	}
455 
456 	rte_free(mem_alloc_list_cur);
457 
458 	mem_alloc_list_last_elem--;
459 
460 	return 0;
461 }
462 
463 static int
464 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
465 {
466 	int ret = 0;
467 	CUresult res;
468 	struct rte_gpu_info parent_info;
469 	CUexecAffinityParam affinityPrm;
470 	const char *err_string;
471 	struct cuda_info *private;
472 	CUcontext current_ctx;
473 	CUcontext input_ctx;
474 
475 	if (dev == NULL) {
476 		rte_errno = ENODEV;
477 		return -rte_errno;
478 	}
479 
480 	/* Child initialization time probably called by rte_gpu_add_child() */
481 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
482 			dev->mpshared->dev_private == NULL) {
483 		/* Store current ctx */
484 		res = pfn_cuCtxGetCurrent(&current_ctx);
485 		if (res != 0) {
486 			pfn_cuGetErrorString(res, &(err_string));
487 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
488 					err_string);
489 			rte_errno = EPERM;
490 			return -rte_errno;
491 		}
492 
493 		/* Set child ctx as current ctx */
494 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
495 		res = pfn_cuCtxSetCurrent(input_ctx);
496 		if (res != 0) {
497 			pfn_cuGetErrorString(res, &(err_string));
498 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
499 					err_string);
500 			rte_errno = EPERM;
501 			return -rte_errno;
502 		}
503 
504 		/*
505 		 * Ctx capacity info
506 		 */
507 
508 		/* MPS compatible */
509 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
510 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
511 		if (res != 0) {
512 			pfn_cuGetErrorString(res, &(err_string));
513 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
514 					err_string);
515 		}
516 		dev->mpshared->info.processor_count =
517 				(uint32_t)affinityPrm.param.smCount.val;
518 
519 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
520 		if (ret) {
521 			rte_errno = ENODEV;
522 			return -rte_errno;
523 		}
524 		dev->mpshared->info.total_memory = parent_info.total_memory;
525 
526 		dev->mpshared->info.page_size = parent_info.page_size;
527 
528 		/*
529 		 * GPU Device private info
530 		 */
531 		dev->mpshared->dev_private = rte_zmalloc(NULL,
532 				sizeof(struct cuda_info),
533 				RTE_CACHE_LINE_SIZE);
534 		if (dev->mpshared->dev_private == NULL) {
535 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
536 			rte_errno = EPERM;
537 			return -rte_errno;
538 		}
539 
540 		private = (struct cuda_info *)dev->mpshared->dev_private;
541 
542 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
543 		if (res != 0) {
544 			pfn_cuGetErrorString(res, &(err_string));
545 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
546 					err_string);
547 			rte_errno = EPERM;
548 			return -rte_errno;
549 		}
550 
551 		res = pfn_cuDeviceGetName(private->gpu_name,
552 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
553 		if (res != 0) {
554 			pfn_cuGetErrorString(res, &(err_string));
555 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
556 					err_string);
557 			rte_errno = EPERM;
558 			return -rte_errno;
559 		}
560 
561 		/* Restore original ctx as current ctx */
562 		res = pfn_cuCtxSetCurrent(current_ctx);
563 		if (res != 0) {
564 			pfn_cuGetErrorString(res, &(err_string));
565 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
566 					err_string);
567 			rte_errno = EPERM;
568 			return -rte_errno;
569 		}
570 	}
571 
572 	*info = dev->mpshared->info;
573 
574 	return 0;
575 }
576 
577 /*
578  * GPU Memory
579  */
580 
581 static int
582 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
583 {
584 	CUresult res;
585 	const char *err_string;
586 	CUcontext current_ctx;
587 	CUcontext input_ctx;
588 	unsigned int flag = 1;
589 
590 	if (dev == NULL)
591 		return -ENODEV;
592 
593 	/* Store current ctx */
594 	res = pfn_cuCtxGetCurrent(&current_ctx);
595 	if (res != 0) {
596 		pfn_cuGetErrorString(res, &(err_string));
597 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
598 				err_string);
599 		rte_errno = EPERM;
600 		return -rte_errno;
601 	}
602 
603 	/* Set child ctx as current ctx */
604 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
605 	res = pfn_cuCtxSetCurrent(input_ctx);
606 	if (res != 0) {
607 		pfn_cuGetErrorString(res, &(err_string));
608 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
609 				err_string);
610 		rte_errno = EPERM;
611 		return -rte_errno;
612 	}
613 
614 	/* Get next memory list item */
615 	mem_alloc_list_tail = mem_list_add_item();
616 	if (mem_alloc_list_tail == NULL) {
617 		rte_errno = EPERM;
618 		return -rte_errno;
619 	}
620 
621 	/* Allocate memory */
622 	mem_alloc_list_tail->size = size;
623 	mem_alloc_list_tail->size_orig = size + align;
624 
625 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
626 			mem_alloc_list_tail->size_orig);
627 	if (res != 0) {
628 		pfn_cuGetErrorString(res, &(err_string));
629 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
630 				err_string);
631 		rte_errno = EPERM;
632 		return -rte_errno;
633 	}
634 
635 	/* Align memory address */
636 	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
637 	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
638 		mem_alloc_list_tail->ptr_d += (align -
639 				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
640 
641 	/* GPUDirect RDMA attribute required */
642 	res = pfn_cuPointerSetAttribute(&flag,
643 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
644 			mem_alloc_list_tail->ptr_d);
645 	if (res != 0) {
646 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
647 				"GPU memory at  %"PRIu32", err %d",
648 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
649 		rte_errno = EPERM;
650 		return -rte_errno;
651 	}
652 
653 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
654 	mem_alloc_list_tail->ptr_h = NULL;
655 	mem_alloc_list_tail->dev = dev;
656 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
657 	mem_alloc_list_tail->mtype = GPU_MEM;
658 
659 	/* Restore original ctx as current ctx */
660 	res = pfn_cuCtxSetCurrent(current_ctx);
661 	if (res != 0) {
662 		pfn_cuGetErrorString(res, &(err_string));
663 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
664 				err_string);
665 		rte_errno = EPERM;
666 		return -rte_errno;
667 	}
668 
669 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
670 
671 	return 0;
672 }
673 
674 static int
675 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
676 {
677 	CUresult res;
678 	const char *err_string;
679 	CUcontext current_ctx;
680 	CUcontext input_ctx;
681 	unsigned int flag = 1;
682 	int use_ptr_h = 0;
683 
684 	if (dev == NULL)
685 		return -ENODEV;
686 
687 	/* Store current ctx */
688 	res = pfn_cuCtxGetCurrent(&current_ctx);
689 	if (res != 0) {
690 		pfn_cuGetErrorString(res, &(err_string));
691 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
692 				err_string);
693 		rte_errno = EPERM;
694 		return -rte_errno;
695 	}
696 
697 	/* Set child ctx as current ctx */
698 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
699 	res = pfn_cuCtxSetCurrent(input_ctx);
700 	if (res != 0) {
701 		pfn_cuGetErrorString(res, &(err_string));
702 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
703 				err_string);
704 		rte_errno = EPERM;
705 		return -rte_errno;
706 	}
707 
708 	/* Get next memory list item */
709 	mem_alloc_list_tail = mem_list_add_item();
710 	if (mem_alloc_list_tail == NULL) {
711 		rte_errno = EPERM;
712 		return -rte_errno;
713 	}
714 
715 	/* Allocate memory */
716 	mem_alloc_list_tail->size = size;
717 	mem_alloc_list_tail->ptr_h = ptr;
718 
719 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
720 			mem_alloc_list_tail->size,
721 			CU_MEMHOSTREGISTER_PORTABLE |
722 			CU_MEMHOSTREGISTER_DEVICEMAP);
723 	if (res != 0) {
724 		pfn_cuGetErrorString(res, &(err_string));
725 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
726 				err_string,
727 				mem_alloc_list_tail->ptr_h,
728 				mem_alloc_list_tail->size);
729 		rte_errno = EPERM;
730 		return -rte_errno;
731 	}
732 
733 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
734 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
735 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
736 	if (res != 0) {
737 		pfn_cuGetErrorString(res, &(err_string));
738 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
739 				err_string);
740 		rte_errno = EPERM;
741 		return -rte_errno;
742 	}
743 
744 	if (use_ptr_h == 0) {
745 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
746 				mem_alloc_list_tail->ptr_h, 0);
747 		if (res != 0) {
748 			pfn_cuGetErrorString(res, &(err_string));
749 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
750 					err_string);
751 			rte_errno = EPERM;
752 			return -rte_errno;
753 		}
754 
755 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
756 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
757 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
758 			rte_errno = ENOTSUP;
759 			return -rte_errno;
760 		}
761 	} else {
762 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
763 	}
764 
765 	/* GPUDirect RDMA attribute required */
766 	res = pfn_cuPointerSetAttribute(&flag,
767 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
768 			mem_alloc_list_tail->ptr_d);
769 	if (res != 0) {
770 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
771 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
772 		rte_errno = EPERM;
773 		return -rte_errno;
774 	}
775 
776 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
777 	mem_alloc_list_tail->size = size;
778 	mem_alloc_list_tail->dev = dev;
779 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
780 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
781 	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
782 
783 	/* Restore original ctx as current ctx */
784 	res = pfn_cuCtxSetCurrent(current_ctx);
785 	if (res != 0) {
786 		pfn_cuGetErrorString(res, &(err_string));
787 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
788 				err_string);
789 		rte_errno = EPERM;
790 		return -rte_errno;
791 	}
792 
793 	return 0;
794 }
795 
796 static int
797 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
798 {
799 	struct mem_entry *mem_item;
800 	cuda_ptr_key hk;
801 
802 	if (dev == NULL)
803 		return -ENODEV;
804 
805 	hk = get_hash_from_ptr((void *)ptr_in);
806 
807 	mem_item = mem_list_find_item(hk);
808 	if (mem_item == NULL) {
809 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
810 		rte_errno = EPERM;
811 		return -rte_errno;
812 	}
813 
814 	if (mem_item->mtype != GPU_MEM) {
815 		rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
816 		rte_errno = EPERM;
817 		return -rte_errno;
818 	}
819 
820 	if (mem_item->size != size)
821 		rte_cuda_log(WARNING,
822 				"Can't expose memory area with size (%zd) different from original size (%zd).",
823 				size, mem_item->size);
824 
825 	if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
826 					mem_item->size, &(mem_item->ptr_h))) {
827 		rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
828 		rte_errno = EPERM;
829 		return -rte_errno;
830 	}
831 
832 	*ptr_out = mem_item->ptr_h;
833 
834 	return 0;
835 }
836 
837 static int
838 cuda_mem_free(struct rte_gpu *dev, void *ptr)
839 {
840 	CUresult res;
841 	struct mem_entry *mem_item;
842 	const char *err_string;
843 	cuda_ptr_key hk;
844 
845 	if (dev == NULL)
846 		return -ENODEV;
847 
848 	hk = get_hash_from_ptr((void *)ptr);
849 
850 	mem_item = mem_list_find_item(hk);
851 	if (mem_item == NULL) {
852 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
853 		rte_errno = EPERM;
854 		return -rte_errno;
855 	}
856 
857 	if (mem_item->mtype == GPU_MEM) {
858 		res = pfn_cuMemFree(mem_item->ptr_orig_d);
859 		if (res != 0) {
860 			pfn_cuGetErrorString(res, &(err_string));
861 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
862 					err_string);
863 			rte_errno = EPERM;
864 			return -rte_errno;
865 		}
866 
867 		return mem_list_del_item(hk);
868 	}
869 
870 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
871 
872 	return -EPERM;
873 }
874 
875 static int
876 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
877 {
878 	CUresult res;
879 	struct mem_entry *mem_item;
880 	const char *err_string;
881 	cuda_ptr_key hk;
882 
883 	if (dev == NULL)
884 		return -ENODEV;
885 
886 	hk = get_hash_from_ptr((void *)ptr);
887 
888 	mem_item = mem_list_find_item(hk);
889 	if (mem_item == NULL) {
890 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
891 		rte_errno = EPERM;
892 		return -rte_errno;
893 	}
894 
895 	if (mem_item->mtype == CPU_REGISTERED) {
896 		res = pfn_cuMemHostUnregister(ptr);
897 		if (res != 0) {
898 			pfn_cuGetErrorString(res, &(err_string));
899 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
900 					err_string);
901 			rte_errno = EPERM;
902 			return -rte_errno;
903 		}
904 
905 		return mem_list_del_item(hk);
906 	}
907 
908 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
909 
910 	rte_errno = EPERM;
911 	return -rte_errno;
912 }
913 
914 static int
915 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
916 {
917 	struct mem_entry *mem_item;
918 	cuda_ptr_key hk;
919 
920 	if (dev == NULL)
921 		return -ENODEV;
922 
923 	hk = get_hash_from_ptr((void *)ptr_in);
924 
925 	mem_item = mem_list_find_item(hk);
926 	if (mem_item == NULL) {
927 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
928 		rte_errno = EPERM;
929 		return -rte_errno;
930 	}
931 
932 	if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
933 			mem_item->size)) {
934 		rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
935 		rte_errno = EPERM;
936 		return -rte_errno;
937 	}
938 
939 	return 0;
940 }
941 
942 static int
943 cuda_dev_close(struct rte_gpu *dev)
944 {
945 	if (dev == NULL)
946 		return -EINVAL;
947 
948 	rte_free(dev->mpshared->dev_private);
949 
950 	return 0;
951 }
952 
953 static int
954 cuda_wmb(struct rte_gpu *dev)
955 {
956 	CUresult res;
957 	const char *err_string;
958 	CUcontext current_ctx;
959 	CUcontext input_ctx;
960 	struct cuda_info *private;
961 
962 	if (dev == NULL) {
963 		rte_errno = ENODEV;
964 		return -rte_errno;
965 	}
966 
967 	private = (struct cuda_info *)dev->mpshared->dev_private;
968 
969 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
970 		/*
971 		 * No need to explicitly force the write ordering because
972 		 * the device natively supports it
973 		 */
974 		return 0;
975 	}
976 
977 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
978 		/*
979 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
980 		 * Application needs to use alternative methods.
981 		 */
982 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
983 				"Application needs to use alternative methods.");
984 
985 		rte_errno = ENOTSUP;
986 		return -rte_errno;
987 	}
988 
989 	/* Store current ctx */
990 	res = pfn_cuCtxGetCurrent(&current_ctx);
991 	if (res != 0) {
992 		pfn_cuGetErrorString(res, &(err_string));
993 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
994 				err_string);
995 		rte_errno = EPERM;
996 		return -rte_errno;
997 	}
998 
999 	/* Set child ctx as current ctx */
1000 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
1001 	res = pfn_cuCtxSetCurrent(input_ctx);
1002 	if (res != 0) {
1003 		pfn_cuGetErrorString(res, &(err_string));
1004 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1005 				err_string);
1006 		rte_errno = EPERM;
1007 		return -rte_errno;
1008 	}
1009 
1010 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1011 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1012 	if (res != 0) {
1013 		pfn_cuGetErrorString(res, &(err_string));
1014 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1015 				err_string);
1016 		rte_errno = EPERM;
1017 		return -rte_errno;
1018 	}
1019 
1020 	/* Restore original ctx as current ctx */
1021 	res = pfn_cuCtxSetCurrent(current_ctx);
1022 	if (res != 0) {
1023 		pfn_cuGetErrorString(res, &(err_string));
1024 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1025 				err_string);
1026 		rte_errno = EPERM;
1027 		return -rte_errno;
1028 	}
1029 
1030 	return 0;
1031 }
1032 
1033 static int
1034 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1035 {
1036 	struct rte_gpu *dev = NULL;
1037 	CUresult res;
1038 	CUdevice cu_dev_id;
1039 	CUcontext pctx;
1040 	char dev_name[RTE_DEV_NAME_MAX_LEN];
1041 	const char *err_string;
1042 	int processor_count = 0;
1043 	struct cuda_info *private;
1044 
1045 	if (pci_dev == NULL) {
1046 		rte_cuda_log(ERR, "NULL PCI device");
1047 		rte_errno = ENODEV;
1048 		return -rte_errno;
1049 	}
1050 
1051 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1052 
1053 	/* Allocate memory to be used privately by drivers */
1054 	dev = rte_gpu_allocate(pci_dev->device.name);
1055 	if (dev == NULL) {
1056 		rte_errno = ENODEV;
1057 		return -rte_errno;
1058 	}
1059 
1060 	/* Initialize values only for the first CUDA driver call */
1061 	if (dev->mpshared->info.dev_id == 0) {
1062 		mem_alloc_list_head = NULL;
1063 		mem_alloc_list_tail = NULL;
1064 		mem_alloc_list_last_elem = 0;
1065 
1066 		/* Load libcuda.so library */
1067 		if (cuda_loader()) {
1068 			rte_cuda_log(ERR, "CUDA Driver library not found");
1069 			rte_errno = ENOTSUP;
1070 			return -rte_errno;
1071 		}
1072 
1073 		/* Load initial CUDA functions */
1074 		if (cuda_sym_func_loader()) {
1075 			rte_cuda_log(ERR, "CUDA functions not found in library");
1076 			rte_errno = ENOTSUP;
1077 			return -rte_errno;
1078 		}
1079 
1080 		/*
1081 		 * Required to initialize the CUDA Driver.
1082 		 * Multiple calls of cuInit() will return immediately
1083 		 * without making any relevant change
1084 		 */
1085 		sym_cuInit(0);
1086 
1087 		res = sym_cuDriverGetVersion(&cuda_driver_version);
1088 		if (res != 0) {
1089 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1090 			rte_errno = ENOTSUP;
1091 			return -rte_errno;
1092 		}
1093 
1094 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1095 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1096 					"Minimum requirement is %d",
1097 					cuda_driver_version,
1098 					CUDA_DRIVER_MIN_VERSION);
1099 			rte_errno = ENOTSUP;
1100 			return -rte_errno;
1101 		}
1102 
1103 		if (cuda_pfn_func_loader()) {
1104 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1105 			rte_errno = ENOTSUP;
1106 			return -rte_errno;
1107 		}
1108 
1109 		gdrc_h = NULL;
1110 	}
1111 
1112 	/* Fill HW specific part of device structure */
1113 	dev->device = &pci_dev->device;
1114 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1115 
1116 	/* Get NVIDIA GPU Device descriptor */
1117 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1118 	if (res != 0) {
1119 		pfn_cuGetErrorString(res, &(err_string));
1120 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1121 				dev->device->name, res, err_string);
1122 		rte_errno = EPERM;
1123 		return -rte_errno;
1124 	}
1125 
1126 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1127 	if (res != 0) {
1128 		pfn_cuGetErrorString(res, &(err_string));
1129 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1130 				dev->device->name, res, err_string);
1131 		rte_errno = EPERM;
1132 		return -rte_errno;
1133 	}
1134 
1135 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1136 	if (res != 0) {
1137 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1138 		rte_errno = ENOTSUP;
1139 		return -rte_errno;
1140 	}
1141 
1142 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1143 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1144 				cuda_api_version, CUDA_API_MIN_VERSION);
1145 		rte_errno = ENOTSUP;
1146 		return -rte_errno;
1147 	}
1148 
1149 	dev->mpshared->info.context = (uint64_t)pctx;
1150 
1151 	/*
1152 	 * GPU Device generic info
1153 	 */
1154 
1155 	/* Processor count */
1156 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1157 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1158 			cu_dev_id);
1159 	if (res != 0) {
1160 		pfn_cuGetErrorString(res, &(err_string));
1161 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1162 				err_string);
1163 		rte_errno = EPERM;
1164 		return -rte_errno;
1165 	}
1166 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1167 
1168 	/* Total memory */
1169 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1170 	if (res != 0) {
1171 		pfn_cuGetErrorString(res, &(err_string));
1172 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1173 				err_string);
1174 		rte_errno = EPERM;
1175 		return -rte_errno;
1176 	}
1177 
1178 	dev->mpshared->info.page_size = (size_t)GPU_PAGE_SIZE;
1179 
1180 	/*
1181 	 * GPU Device private info
1182 	 */
1183 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1184 			sizeof(struct cuda_info),
1185 			RTE_CACHE_LINE_SIZE);
1186 	if (dev->mpshared->dev_private == NULL) {
1187 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1188 		rte_errno = EPERM;
1189 		return -rte_errno;
1190 	}
1191 
1192 	private = (struct cuda_info *)dev->mpshared->dev_private;
1193 	private->cu_dev = cu_dev_id;
1194 	res = pfn_cuDeviceGetName(private->gpu_name,
1195 			RTE_DEV_NAME_MAX_LEN,
1196 			cu_dev_id);
1197 	if (res != 0) {
1198 		pfn_cuGetErrorString(res, &(err_string));
1199 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1200 				err_string);
1201 		rte_errno = EPERM;
1202 		return -rte_errno;
1203 	}
1204 
1205 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1206 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1207 			cu_dev_id);
1208 	if (res != 0) {
1209 		pfn_cuGetErrorString(res, &(err_string));
1210 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1211 				err_string);
1212 		rte_errno = EPERM;
1213 		return -rte_errno;
1214 	}
1215 
1216 	if (private->gdr_supported == 0)
1217 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1218 				pci_dev->device.name);
1219 
1220 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1221 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1222 			cu_dev_id);
1223 	if (res != 0) {
1224 		pfn_cuGetErrorString(res, &(err_string));
1225 		rte_cuda_log(ERR,
1226 				"cuDeviceGetAttribute failed with %s",
1227 				err_string);
1228 		rte_errno = EPERM;
1229 		return -rte_errno;
1230 	}
1231 
1232 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1233 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1234 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1235 				cu_dev_id);
1236 		if (res != 0) {
1237 			pfn_cuGetErrorString(res, &(err_string));
1238 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1239 					err_string);
1240 			rte_errno = EPERM;
1241 			return -rte_errno;
1242 		}
1243 
1244 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1245 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1246 	}
1247 
1248 	dev->ops.dev_info_get = cuda_dev_info_get;
1249 	dev->ops.dev_close = cuda_dev_close;
1250 	dev->ops.mem_alloc = cuda_mem_alloc;
1251 	dev->ops.mem_free = cuda_mem_free;
1252 	dev->ops.mem_register = cuda_mem_register;
1253 	dev->ops.mem_unregister = cuda_mem_unregister;
1254 	dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1255 	dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1256 	dev->ops.wmb = cuda_wmb;
1257 
1258 	rte_gpu_complete_new(dev);
1259 
1260 	rte_cuda_debug("dev id = %u name = %s",
1261 			dev->mpshared->info.dev_id, private->gpu_name);
1262 
1263 	return 0;
1264 }
1265 
1266 static int
1267 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1268 {
1269 	struct rte_gpu *dev;
1270 	int ret;
1271 	uint8_t gpu_id;
1272 
1273 	if (pci_dev == NULL) {
1274 		rte_errno = ENODEV;
1275 		return -rte_errno;
1276 	}
1277 
1278 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1279 	if (dev == NULL) {
1280 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1281 				pci_dev->device.name);
1282 		rte_errno = ENODEV;
1283 		return -rte_errno;
1284 	}
1285 	gpu_id = dev->mpshared->info.dev_id;
1286 
1287 	/* release dev from library */
1288 	ret = rte_gpu_release(dev);
1289 	if (ret)
1290 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1291 
1292 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1293 
1294 	return 0;
1295 }
1296 
1297 static struct rte_pci_driver rte_cuda_driver = {
1298 	.id_table = pci_id_cuda_map,
1299 	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1300 	.probe = cuda_gpu_probe,
1301 	.remove = cuda_gpu_remove,
1302 };
1303 
1304 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1305 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1306 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1307