xref: /dpdk/drivers/gpu/cuda/cuda.c (revision b4409f2b3f3efaacc58c71ea0ddcb798731460a3)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_malloc.h>
8 #include <rte_pci.h>
9 #include <rte_bus_pci.h>
10 #include <rte_byteorder.h>
11 #include <rte_dev.h>
12 
13 #include <gpudev_driver.h>
14 
15 #include <cuda.h>
16 #include <cudaTypedefs.h>
17 
18 #include "common.h"
19 #include "devices.h"
20 
21 #define CUDA_DRIVER_MIN_VERSION 11040
22 #define CUDA_API_MIN_VERSION 3020
23 
/*
 * CUDA Driver functions loaded with dlsym().
 * These three bootstrap entry points are resolved by name from the
 * cudalib handle in cuda_sym_func_loader().
 */
static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
		void **pfn, int cudaVersion, uint64_t flags);

/*
 * CUDA Driver functions loaded with cuGetProcAddress for versioning.
 * Every pointer below is filled by cuda_pfn_func_loader(), which requests
 * each symbol with cuda_driver_version as the version argument.
 */
static PFN_cuGetErrorString pfn_cuGetErrorString;
static PFN_cuGetErrorName pfn_cuGetErrorName;
static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
static PFN_cuDeviceGetName pfn_cuDeviceGetName;
static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
static PFN_cuMemAlloc pfn_cuMemAlloc;
static PFN_cuMemFree pfn_cuMemFree;
static PFN_cuMemHostRegister pfn_cuMemHostRegister;
static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
51 
/* Handle returned by dlopen() for the CUDA driver library, set by cuda_loader() */
static void *cudalib;
/* NOTE(review): not referenced in the visible part of this file - presumably filled at probe time */
static unsigned int cuda_api_version;
/* Version handed to cuGetProcAddress as cudaVersion when resolving the pfn_* pointers */
static int cuda_driver_version;
/* gdrcopy handle passed to gdrcopy_pin() and gdrcopy_unpin() */
static gdr_t gdrc_h;

/* NOTE(review): unused in the visible code - presumably a cap on tracked allocations, confirm */
#define CUDA_MAX_ALLOCATION_NUM 512

/* GPU_PAGE_SIZE evaluates to 1UL << 16, i.e. 65536 bytes */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)

/* Register the cuda_logtype log type with NOTICE as its default level */
RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
63 
64 /* NVIDIA GPU address map */
65 static const struct rte_pci_id pci_id_cuda_map[] = {
66 	{
67 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
68 				NVIDIA_GPU_A40_DEVICE_ID)
69 	},
70 	{
71 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
72 				NVIDIA_GPU_A30_24GB_DEVICE_ID)
73 	},
74 	{
75 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
76 				NVIDIA_GPU_A30X_24GB_DPU_DEVICE_ID)
77 	},
78 	{
79 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
80 				NVIDIA_GPU_A10_24GB_DEVICE_ID)
81 	},
82 	{
83 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
84 				NVIDIA_GPU_A10G_DEVICE_ID)
85 	},
86 	{
87 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
88 				NVIDIA_GPU_A10M_DEVICE_ID)
89 	},
90 	{
91 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
92 				NVIDIA_GPU_A100_40GB_SXM4_DEVICE_ID)
93 	},
94 	{
95 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
96 				NVIDIA_GPU_A100_40GB_PCIE_DEVICE_ID)
97 	},
98 	{
99 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
100 				NVIDIA_GPU_A100_80GB_SXM4_DEVICE_ID)
101 	},
102 	{
103 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
104 				NVIDIA_GPU_A100_80GB_PCIE_DEVICE_ID)
105 	},
106 	{
107 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
108 				NVIDIA_GPU_A100X_80GB_DPU_DEVICE_ID)
109 	},
110 	{
111 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
112 				NVIDIA_GPU_GA100_PG506_207)
113 	},
114 	{
115 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
116 				NVIDIA_GPU_GA100_PCIE)
117 	},
118 	{
119 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
120 				NVIDIA_GPU_GA100_PG506_217)
121 	},
122 	{
123 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
124 				NVIDIA_GPU_V100_16GB_SXM2_DEVICE_ID)
125 	},
126 	{
127 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
128 				NVIDIA_GPU_V100_16GB_DGXS_DEVICE_ID)
129 	},
130 	{
131 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
132 				NVIDIA_GPU_V100_16GB_FHHL_DEVICE_ID)
133 	},
134 	{
135 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
136 				NVIDIA_GPU_V100_16GB_PCIE_DEVICE_ID)
137 	},
138 	{
139 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
140 				NVIDIA_GPU_V100_32GB_SXM2_DEVICE_ID)
141 	},
142 	{
143 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
144 				NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
145 	},
146 	{
147 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
148 				NVIDIA_GPU_V100_32GB_DGXS_DEVICE_ID)
149 	},
150 	{
151 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
152 				NVIDIA_GPU_V100_32GB_SXM3_DEVICE_ID)
153 	},
154 	{
155 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
156 				NVIDIA_GPU_V100_32GB_SXM3_H_DEVICE_ID)
157 	},
158 	{
159 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
160 				NVIDIA_GPU_V100_SXM2)
161 	},
162 	{
163 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
164 				NVIDIA_GPU_V100S_PCIE)
165 	},
166 	{
167 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
168 				NVIDIA_GPU_TITAN_V_CEO_ED)
169 	},
170 	{
171 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
172 				NVIDIA_GPU_GV100GL_PG500_216)
173 	},
174 	{
175 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
176 				NVIDIA_GPU_GV100GL_PG503_216)
177 	},
178 	{
179 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
180 				NVIDIA_GPU_TU102_TITAN_RTX)
181 	},
182 	{
183 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
184 				NVIDIA_GPU_TU102GL_QUADRO_RTX)
185 	},
186 	{
187 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
188 				NVIDIA_GPU_GV100_QUADRO_DEVICE_ID)
189 	},
190 	{
191 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
192 				NVIDIA_GPU_QUADRO_RTX_4000)
193 	},
194 	{
195 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
196 				NVIDIA_GPU_QUADRO_RTX_5000)
197 	},
198 	{
199 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
200 				NVIDIA_GPU_QUADRO_RTX_6000)
201 	},
202 	{
203 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
204 				NVIDIA_GPU_QUADRO_RTX_8000)
205 	},
206 	{
207 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
208 				NVIDIA_GPU_QUADRO_RTX_A4000)
209 	},
210 	{
211 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
212 				NVIDIA_GPU_QUADRO_RTX_A6000)
213 	},
214 	{
215 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
216 				NVIDIA_GPU_QUADRO_RTX_A5000)
217 	},
218 	{
219 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
220 				NVIDIA_GPU_QUADRO_RTX_A4500)
221 	},
222 	{
223 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
224 				NVIDIA_GPU_QUADRO_RTX_A5500)
225 	},
226 	{
227 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
228 				NVIDIA_GPU_QUADRO_RTX_A2000)
229 	},
230 	{
231 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
232 				NVIDIA_GPU_QUADRO_RTX_A2000_12GB)
233 	},
234 	{
235 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
236 				NVIDIA_GPU_T4G)
237 	},
238 	{
239 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
240 				NVIDIA_GPU_T4)
241 	},
242 	{
243 		.device_id = 0
244 	}
245 };
246 
/*
 * Device private info.
 * Stored in dev->mpshared->dev_private and released by cuda_dev_close().
 */
struct cuda_info {
	/* Device name as returned by cuDeviceGetName() */
	char gpu_name[RTE_DEV_NAME_MAX_LEN];
	/* CUDA device handle, e.g. filled by cuCtxGetDevice() for child devices */
	CUdevice cu_dev;
	/* NOTE(review): not read in the visible code - presumably GPUDirect RDMA capability flag */
	int gdr_supported;
	/* Compared with CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE in cuda_wmb() */
	int gdr_write_ordering;
	/* Compared with CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST in cuda_wmb() */
	int gdr_flush_type;
};
255 
/* Type of memory allocated by CUDA driver, recorded in each mem_entry */
enum mem_type {
	GPU_MEM = 0, /* Device memory obtained by cuda_mem_alloc() */
	CPU_REGISTERED, /* Host memory registered by cuda_mem_register() */
	GPU_REGISTERED /* Not used yet */
};
262 
/*
 * Key associated to a memory address.
 * get_hash_from_ptr() produces it by casting the pointer to uintptr_t.
 */
typedef uintptr_t cuda_ptr_key;
265 
/*
 * Single entry of the memory list.
 * Entries form a doubly linked list and are looked up by pkey.
 */
struct mem_entry {
	/* Device address handed to the user (aligned for GPU_MEM entries) */
	CUdeviceptr ptr_d;
	/* Address returned by cuMemAlloc(); this is what cuda_mem_free() releases */
	CUdeviceptr ptr_orig_d;
	/* Registered host pointer, or host mapping written by gdrcopy_pin() */
	void *ptr_h;
	/* Size requested by the caller */
	size_t size;
	/* size + align for GPU_MEM entries; left zero by cuda_mem_register() */
	size_t size_orig;
	/* Device the memory was allocated or registered on */
	struct rte_gpu *dev;
	/* CUDA context taken from dev->mpshared->info.context */
	CUcontext ctx;
	/* Lookup key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
	cuda_ptr_key pkey;
	enum mem_type mtype;
	/* gdrcopy memory handle used by gdrcopy_pin()/gdrcopy_unpin() */
	gdr_mh_t mh;
	struct mem_entry *prev;
	struct mem_entry *next;
};
281 
/* First entry of the memory list, NULL while the list is empty */
static struct mem_entry *mem_alloc_list_head;
/* Last entry of the memory list; new entries are appended after it */
static struct mem_entry *mem_alloc_list_tail;
/* Number of entries in the list (incremented on add, decremented on delete) */
static uint32_t mem_alloc_list_last_elem;
285 
286 /* Load the CUDA symbols */
287 
288 static int
289 cuda_loader(void)
290 {
291 	char cuda_path[1024];
292 
293 	if (getenv("CUDA_PATH_L") == NULL)
294 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
295 	else
296 		snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
297 
298 	cudalib = dlopen(cuda_path, RTLD_LAZY);
299 	if (cudalib == NULL) {
300 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
301 				cuda_path, getenv("CUDA_PATH_L"));
302 		return -1;
303 	}
304 
305 	return 0;
306 }
307 
308 static int
309 cuda_sym_func_loader(void)
310 {
311 	if (cudalib == NULL)
312 		return -1;
313 
314 	sym_cuInit = dlsym(cudalib, "cuInit");
315 	if (sym_cuInit == NULL) {
316 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
317 		return -1;
318 	}
319 
320 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
321 	if (sym_cuDriverGetVersion == NULL) {
322 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
323 		return -1;
324 	}
325 
326 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
327 	if (sym_cuGetProcAddress == NULL) {
328 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
329 		return -1;
330 	}
331 
332 	return 0;
333 }
334 
335 static int
336 cuda_pfn_func_loader(void)
337 {
338 	CUresult res;
339 
340 	res = sym_cuGetProcAddress("cuGetErrorString",
341 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
342 	if (res != 0) {
343 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
344 		return -1;
345 	}
346 
347 	res = sym_cuGetProcAddress("cuGetErrorName",
348 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
349 	if (res != 0) {
350 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
351 		return -1;
352 	}
353 
354 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
355 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
356 	if (res != 0) {
357 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
358 		return -1;
359 	}
360 
361 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
362 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
363 	if (res != 0) {
364 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
365 		return -1;
366 	}
367 
368 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
369 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
370 	if (res != 0) {
371 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
372 		return -1;
373 	}
374 
375 	res = sym_cuGetProcAddress("cuDeviceGetName",
376 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
377 	if (res != 0) {
378 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
379 		return -1;
380 	}
381 
382 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
383 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
384 	if (res != 0) {
385 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
386 		return -1;
387 	}
388 
389 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
390 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
391 	if (res != 0) {
392 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
393 		return -1;
394 	}
395 
396 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
397 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
398 	if (res != 0) {
399 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
400 		return -1;
401 	}
402 
403 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
404 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
405 	if (res != 0) {
406 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
407 		return -1;
408 	}
409 
410 	res = sym_cuGetProcAddress("cuCtxGetDevice",
411 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
412 	if (res != 0) {
413 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
414 		return -1;
415 	}
416 
417 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
418 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
419 	if (res != 0) {
420 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
421 		return -1;
422 	}
423 
424 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
425 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
426 	if (res != 0) {
427 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
428 		return -1;
429 	}
430 
431 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
432 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
433 	if (res != 0) {
434 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
435 		return -1;
436 	}
437 
438 	res = sym_cuGetProcAddress("cuMemAlloc",
439 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
440 	if (res != 0) {
441 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
442 		return -1;
443 	}
444 
445 	res = sym_cuGetProcAddress("cuMemFree",
446 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
447 	if (res != 0) {
448 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
449 		return -1;
450 	}
451 
452 	res = sym_cuGetProcAddress("cuMemHostRegister",
453 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
454 	if (res != 0) {
455 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
456 		return -1;
457 	}
458 
459 	res = sym_cuGetProcAddress("cuMemHostUnregister",
460 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
461 	if (res != 0) {
462 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
463 		return -1;
464 	}
465 
466 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
467 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
468 	if (res != 0) {
469 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
470 		return -1;
471 	}
472 
473 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
474 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
475 	if (res != 0) {
476 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
477 		return -1;
478 	}
479 
480 	return 0;
481 }
482 
483 /* Generate a key from a memory pointer */
484 static cuda_ptr_key
485 get_hash_from_ptr(void *ptr)
486 {
487 	return (uintptr_t)ptr;
488 }
489 
490 static uint32_t
491 mem_list_count_item(void)
492 {
493 	return mem_alloc_list_last_elem;
494 }
495 
496 /* Initiate list of memory allocations if not done yet */
497 static struct mem_entry *
498 mem_list_add_item(void)
499 {
500 	/* Initiate list of memory allocations if not done yet */
501 	if (mem_alloc_list_head == NULL) {
502 		mem_alloc_list_head = rte_zmalloc(NULL,
503 				sizeof(struct mem_entry),
504 				RTE_CACHE_LINE_SIZE);
505 		if (mem_alloc_list_head == NULL) {
506 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
507 			return NULL;
508 		}
509 
510 		mem_alloc_list_head->next = NULL;
511 		mem_alloc_list_head->prev = NULL;
512 		mem_alloc_list_tail = mem_alloc_list_head;
513 	} else {
514 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
515 				sizeof(struct mem_entry),
516 				RTE_CACHE_LINE_SIZE);
517 
518 		if (mem_alloc_list_cur == NULL) {
519 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
520 			return NULL;
521 		}
522 
523 		mem_alloc_list_tail->next = mem_alloc_list_cur;
524 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
525 		mem_alloc_list_tail = mem_alloc_list_tail->next;
526 		mem_alloc_list_tail->next = NULL;
527 	}
528 
529 	mem_alloc_list_last_elem++;
530 
531 	return mem_alloc_list_tail;
532 }
533 
534 static struct mem_entry *
535 mem_list_find_item(cuda_ptr_key pk)
536 {
537 	struct mem_entry *mem_alloc_list_cur = NULL;
538 
539 	if (mem_alloc_list_head == NULL) {
540 		rte_cuda_log(ERR, "Memory list doesn't exist");
541 		return NULL;
542 	}
543 
544 	if (mem_list_count_item() == 0) {
545 		rte_cuda_log(ERR, "No items in memory list");
546 		return NULL;
547 	}
548 
549 	mem_alloc_list_cur = mem_alloc_list_head;
550 
551 	while (mem_alloc_list_cur != NULL) {
552 		if (mem_alloc_list_cur->pkey == pk)
553 			return mem_alloc_list_cur;
554 		mem_alloc_list_cur = mem_alloc_list_cur->next;
555 	}
556 
557 	return mem_alloc_list_cur;
558 }
559 
560 static int
561 mem_list_del_item(cuda_ptr_key pk)
562 {
563 	struct mem_entry *mem_alloc_list_cur = NULL;
564 
565 	mem_alloc_list_cur = mem_list_find_item(pk);
566 	if (mem_alloc_list_cur == NULL)
567 		return -EINVAL;
568 
569 	/* if key is in head */
570 	if (mem_alloc_list_cur->prev == NULL) {
571 		mem_alloc_list_head = mem_alloc_list_cur->next;
572 		if (mem_alloc_list_head != NULL)
573 			mem_alloc_list_head->prev = NULL;
574 	} else {
575 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
576 		if (mem_alloc_list_cur->next != NULL)
577 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
578 	}
579 
580 	rte_free(mem_alloc_list_cur);
581 
582 	mem_alloc_list_last_elem--;
583 
584 	return 0;
585 }
586 
587 static int
588 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
589 {
590 	int ret = 0;
591 	CUresult res;
592 	struct rte_gpu_info parent_info;
593 	CUexecAffinityParam affinityPrm;
594 	const char *err_string;
595 	struct cuda_info *private;
596 	CUcontext current_ctx;
597 	CUcontext input_ctx;
598 
599 	if (dev == NULL) {
600 		rte_errno = ENODEV;
601 		return -rte_errno;
602 	}
603 
604 	/* Child initialization time probably called by rte_gpu_add_child() */
605 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
606 			dev->mpshared->dev_private == NULL) {
607 		/* Store current ctx */
608 		res = pfn_cuCtxGetCurrent(&current_ctx);
609 		if (res != 0) {
610 			pfn_cuGetErrorString(res, &(err_string));
611 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
612 					err_string);
613 			rte_errno = EPERM;
614 			return -rte_errno;
615 		}
616 
617 		/* Set child ctx as current ctx */
618 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
619 		res = pfn_cuCtxSetCurrent(input_ctx);
620 		if (res != 0) {
621 			pfn_cuGetErrorString(res, &(err_string));
622 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
623 					err_string);
624 			rte_errno = EPERM;
625 			return -rte_errno;
626 		}
627 
628 		/*
629 		 * Ctx capacity info
630 		 */
631 
632 		/* MPS compatible */
633 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
634 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
635 		if (res != 0) {
636 			pfn_cuGetErrorString(res, &(err_string));
637 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
638 					err_string);
639 		}
640 		dev->mpshared->info.processor_count =
641 				(uint32_t)affinityPrm.param.smCount.val;
642 
643 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
644 		if (ret) {
645 			rte_errno = ENODEV;
646 			return -rte_errno;
647 		}
648 		dev->mpshared->info.total_memory = parent_info.total_memory;
649 
650 		dev->mpshared->info.page_size = parent_info.page_size;
651 
652 		/*
653 		 * GPU Device private info
654 		 */
655 		dev->mpshared->dev_private = rte_zmalloc(NULL,
656 				sizeof(struct cuda_info),
657 				RTE_CACHE_LINE_SIZE);
658 		if (dev->mpshared->dev_private == NULL) {
659 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
660 			rte_errno = EPERM;
661 			return -rte_errno;
662 		}
663 
664 		private = (struct cuda_info *)dev->mpshared->dev_private;
665 
666 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
667 		if (res != 0) {
668 			pfn_cuGetErrorString(res, &(err_string));
669 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
670 					err_string);
671 			rte_errno = EPERM;
672 			return -rte_errno;
673 		}
674 
675 		res = pfn_cuDeviceGetName(private->gpu_name,
676 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
677 		if (res != 0) {
678 			pfn_cuGetErrorString(res, &(err_string));
679 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
680 					err_string);
681 			rte_errno = EPERM;
682 			return -rte_errno;
683 		}
684 
685 		/* Restore original ctx as current ctx */
686 		res = pfn_cuCtxSetCurrent(current_ctx);
687 		if (res != 0) {
688 			pfn_cuGetErrorString(res, &(err_string));
689 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
690 					err_string);
691 			rte_errno = EPERM;
692 			return -rte_errno;
693 		}
694 	}
695 
696 	*info = dev->mpshared->info;
697 
698 	return 0;
699 }
700 
701 /*
702  * GPU Memory
703  */
704 
705 static int
706 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
707 {
708 	CUresult res;
709 	const char *err_string;
710 	CUcontext current_ctx;
711 	CUcontext input_ctx;
712 	unsigned int flag = 1;
713 
714 	if (dev == NULL)
715 		return -ENODEV;
716 
717 	/* Store current ctx */
718 	res = pfn_cuCtxGetCurrent(&current_ctx);
719 	if (res != 0) {
720 		pfn_cuGetErrorString(res, &(err_string));
721 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
722 				err_string);
723 		rte_errno = EPERM;
724 		return -rte_errno;
725 	}
726 
727 	/* Set child ctx as current ctx */
728 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
729 	res = pfn_cuCtxSetCurrent(input_ctx);
730 	if (res != 0) {
731 		pfn_cuGetErrorString(res, &(err_string));
732 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
733 				err_string);
734 		rte_errno = EPERM;
735 		return -rte_errno;
736 	}
737 
738 	/* Get next memory list item */
739 	mem_alloc_list_tail = mem_list_add_item();
740 	if (mem_alloc_list_tail == NULL) {
741 		rte_errno = EPERM;
742 		return -rte_errno;
743 	}
744 
745 	/* Allocate memory */
746 	mem_alloc_list_tail->size = size;
747 	mem_alloc_list_tail->size_orig = size + align;
748 
749 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
750 			mem_alloc_list_tail->size_orig);
751 	if (res != 0) {
752 		pfn_cuGetErrorString(res, &(err_string));
753 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
754 				err_string);
755 		rte_errno = EPERM;
756 		return -rte_errno;
757 	}
758 
759 	/* Align memory address */
760 	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
761 	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
762 		mem_alloc_list_tail->ptr_d += (align -
763 				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
764 
765 	/* GPUDirect RDMA attribute required */
766 	res = pfn_cuPointerSetAttribute(&flag,
767 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
768 			mem_alloc_list_tail->ptr_d);
769 	if (res != 0) {
770 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
771 				"GPU memory at  %"PRIu32", err %d",
772 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
773 		rte_errno = EPERM;
774 		return -rte_errno;
775 	}
776 
777 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
778 	mem_alloc_list_tail->ptr_h = NULL;
779 	mem_alloc_list_tail->dev = dev;
780 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
781 	mem_alloc_list_tail->mtype = GPU_MEM;
782 
783 	/* Restore original ctx as current ctx */
784 	res = pfn_cuCtxSetCurrent(current_ctx);
785 	if (res != 0) {
786 		pfn_cuGetErrorString(res, &(err_string));
787 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
788 				err_string);
789 		rte_errno = EPERM;
790 		return -rte_errno;
791 	}
792 
793 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
794 
795 	return 0;
796 }
797 
798 static int
799 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
800 {
801 	CUresult res;
802 	const char *err_string;
803 	CUcontext current_ctx;
804 	CUcontext input_ctx;
805 	unsigned int flag = 1;
806 	int use_ptr_h = 0;
807 
808 	if (dev == NULL)
809 		return -ENODEV;
810 
811 	/* Store current ctx */
812 	res = pfn_cuCtxGetCurrent(&current_ctx);
813 	if (res != 0) {
814 		pfn_cuGetErrorString(res, &(err_string));
815 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
816 				err_string);
817 		rte_errno = EPERM;
818 		return -rte_errno;
819 	}
820 
821 	/* Set child ctx as current ctx */
822 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
823 	res = pfn_cuCtxSetCurrent(input_ctx);
824 	if (res != 0) {
825 		pfn_cuGetErrorString(res, &(err_string));
826 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
827 				err_string);
828 		rte_errno = EPERM;
829 		return -rte_errno;
830 	}
831 
832 	/* Get next memory list item */
833 	mem_alloc_list_tail = mem_list_add_item();
834 	if (mem_alloc_list_tail == NULL) {
835 		rte_errno = EPERM;
836 		return -rte_errno;
837 	}
838 
839 	/* Allocate memory */
840 	mem_alloc_list_tail->size = size;
841 	mem_alloc_list_tail->ptr_h = ptr;
842 
843 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
844 			mem_alloc_list_tail->size,
845 			CU_MEMHOSTREGISTER_PORTABLE |
846 			CU_MEMHOSTREGISTER_DEVICEMAP);
847 	if (res != 0) {
848 		pfn_cuGetErrorString(res, &(err_string));
849 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
850 				err_string,
851 				mem_alloc_list_tail->ptr_h,
852 				mem_alloc_list_tail->size);
853 		rte_errno = EPERM;
854 		return -rte_errno;
855 	}
856 
857 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
858 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
859 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
860 	if (res != 0) {
861 		pfn_cuGetErrorString(res, &(err_string));
862 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
863 				err_string);
864 		rte_errno = EPERM;
865 		return -rte_errno;
866 	}
867 
868 	if (use_ptr_h == 0) {
869 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
870 				mem_alloc_list_tail->ptr_h, 0);
871 		if (res != 0) {
872 			pfn_cuGetErrorString(res, &(err_string));
873 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
874 					err_string);
875 			rte_errno = EPERM;
876 			return -rte_errno;
877 		}
878 
879 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
880 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
881 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
882 			rte_errno = ENOTSUP;
883 			return -rte_errno;
884 		}
885 	} else {
886 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
887 	}
888 
889 	/* GPUDirect RDMA attribute required */
890 	res = pfn_cuPointerSetAttribute(&flag,
891 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
892 			mem_alloc_list_tail->ptr_d);
893 	if (res != 0) {
894 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
895 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
896 		rte_errno = EPERM;
897 		return -rte_errno;
898 	}
899 
900 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
901 	mem_alloc_list_tail->size = size;
902 	mem_alloc_list_tail->dev = dev;
903 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
904 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
905 	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
906 
907 	/* Restore original ctx as current ctx */
908 	res = pfn_cuCtxSetCurrent(current_ctx);
909 	if (res != 0) {
910 		pfn_cuGetErrorString(res, &(err_string));
911 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
912 				err_string);
913 		rte_errno = EPERM;
914 		return -rte_errno;
915 	}
916 
917 	return 0;
918 }
919 
920 static int
921 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
922 {
923 	struct mem_entry *mem_item;
924 	cuda_ptr_key hk;
925 
926 	if (dev == NULL)
927 		return -ENODEV;
928 
929 	hk = get_hash_from_ptr((void *)ptr_in);
930 
931 	mem_item = mem_list_find_item(hk);
932 	if (mem_item == NULL) {
933 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
934 		rte_errno = EPERM;
935 		return -rte_errno;
936 	}
937 
938 	if (mem_item->mtype != GPU_MEM) {
939 		rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
940 		rte_errno = EPERM;
941 		return -rte_errno;
942 	}
943 
944 	if (mem_item->size != size)
945 		rte_cuda_log(WARNING,
946 				"Can't expose memory area with size (%zd) different from original size (%zd).",
947 				size, mem_item->size);
948 
949 	if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
950 					mem_item->size, &(mem_item->ptr_h))) {
951 		rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
952 		rte_errno = EPERM;
953 		return -rte_errno;
954 	}
955 
956 	*ptr_out = mem_item->ptr_h;
957 
958 	return 0;
959 }
960 
961 static int
962 cuda_mem_free(struct rte_gpu *dev, void *ptr)
963 {
964 	CUresult res;
965 	struct mem_entry *mem_item;
966 	const char *err_string;
967 	cuda_ptr_key hk;
968 
969 	if (dev == NULL)
970 		return -ENODEV;
971 
972 	hk = get_hash_from_ptr((void *)ptr);
973 
974 	mem_item = mem_list_find_item(hk);
975 	if (mem_item == NULL) {
976 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
977 		rte_errno = EPERM;
978 		return -rte_errno;
979 	}
980 
981 	if (mem_item->mtype == GPU_MEM) {
982 		res = pfn_cuMemFree(mem_item->ptr_orig_d);
983 		if (res != 0) {
984 			pfn_cuGetErrorString(res, &(err_string));
985 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
986 					err_string);
987 			rte_errno = EPERM;
988 			return -rte_errno;
989 		}
990 
991 		return mem_list_del_item(hk);
992 	}
993 
994 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
995 
996 	return -EPERM;
997 }
998 
999 static int
1000 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
1001 {
1002 	CUresult res;
1003 	struct mem_entry *mem_item;
1004 	const char *err_string;
1005 	cuda_ptr_key hk;
1006 
1007 	if (dev == NULL)
1008 		return -ENODEV;
1009 
1010 	hk = get_hash_from_ptr((void *)ptr);
1011 
1012 	mem_item = mem_list_find_item(hk);
1013 	if (mem_item == NULL) {
1014 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
1015 		rte_errno = EPERM;
1016 		return -rte_errno;
1017 	}
1018 
1019 	if (mem_item->mtype == CPU_REGISTERED) {
1020 		res = pfn_cuMemHostUnregister(ptr);
1021 		if (res != 0) {
1022 			pfn_cuGetErrorString(res, &(err_string));
1023 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
1024 					err_string);
1025 			rte_errno = EPERM;
1026 			return -rte_errno;
1027 		}
1028 
1029 		return mem_list_del_item(hk);
1030 	}
1031 
1032 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
1033 
1034 	rte_errno = EPERM;
1035 	return -rte_errno;
1036 }
1037 
1038 static int
1039 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
1040 {
1041 	struct mem_entry *mem_item;
1042 	cuda_ptr_key hk;
1043 
1044 	if (dev == NULL)
1045 		return -ENODEV;
1046 
1047 	hk = get_hash_from_ptr((void *)ptr_in);
1048 
1049 	mem_item = mem_list_find_item(hk);
1050 	if (mem_item == NULL) {
1051 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
1052 		rte_errno = EPERM;
1053 		return -rte_errno;
1054 	}
1055 
1056 	if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
1057 			mem_item->size)) {
1058 		rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
1059 		rte_errno = EPERM;
1060 		return -rte_errno;
1061 	}
1062 
1063 	return 0;
1064 }
1065 
1066 static int
1067 cuda_dev_close(struct rte_gpu *dev)
1068 {
1069 	if (dev == NULL)
1070 		return -EINVAL;
1071 
1072 	rte_free(dev->mpshared->dev_private);
1073 
1074 	return 0;
1075 }
1076 
1077 static int
1078 cuda_wmb(struct rte_gpu *dev)
1079 {
1080 	CUresult res;
1081 	const char *err_string;
1082 	CUcontext current_ctx;
1083 	CUcontext input_ctx;
1084 	struct cuda_info *private;
1085 
1086 	if (dev == NULL) {
1087 		rte_errno = ENODEV;
1088 		return -rte_errno;
1089 	}
1090 
1091 	private = (struct cuda_info *)dev->mpshared->dev_private;
1092 
1093 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1094 		/*
1095 		 * No need to explicitly force the write ordering because
1096 		 * the device natively supports it
1097 		 */
1098 		return 0;
1099 	}
1100 
1101 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
1102 		/*
1103 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
1104 		 * Application needs to use alternative methods.
1105 		 */
1106 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
1107 				"Application needs to use alternative methods.");
1108 
1109 		rte_errno = ENOTSUP;
1110 		return -rte_errno;
1111 	}
1112 
1113 	/* Store current ctx */
1114 	res = pfn_cuCtxGetCurrent(&current_ctx);
1115 	if (res != 0) {
1116 		pfn_cuGetErrorString(res, &(err_string));
1117 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
1118 				err_string);
1119 		rte_errno = EPERM;
1120 		return -rte_errno;
1121 	}
1122 
1123 	/* Set child ctx as current ctx */
1124 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
1125 	res = pfn_cuCtxSetCurrent(input_ctx);
1126 	if (res != 0) {
1127 		pfn_cuGetErrorString(res, &(err_string));
1128 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1129 				err_string);
1130 		rte_errno = EPERM;
1131 		return -rte_errno;
1132 	}
1133 
1134 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1135 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1136 	if (res != 0) {
1137 		pfn_cuGetErrorString(res, &(err_string));
1138 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1139 				err_string);
1140 		rte_errno = EPERM;
1141 		return -rte_errno;
1142 	}
1143 
1144 	/* Restore original ctx as current ctx */
1145 	res = pfn_cuCtxSetCurrent(current_ctx);
1146 	if (res != 0) {
1147 		pfn_cuGetErrorString(res, &(err_string));
1148 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1149 				err_string);
1150 		rte_errno = EPERM;
1151 		return -rte_errno;
1152 	}
1153 
1154 	return 0;
1155 }
1156 
1157 static int
1158 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1159 {
1160 	struct rte_gpu *dev = NULL;
1161 	CUresult res;
1162 	CUdevice cu_dev_id;
1163 	CUcontext pctx;
1164 	char dev_name[RTE_DEV_NAME_MAX_LEN];
1165 	const char *err_string;
1166 	int processor_count = 0;
1167 	struct cuda_info *private;
1168 
1169 	if (pci_dev == NULL) {
1170 		rte_cuda_log(ERR, "NULL PCI device");
1171 		rte_errno = ENODEV;
1172 		return -rte_errno;
1173 	}
1174 
1175 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1176 
1177 	/* Allocate memory to be used privately by drivers */
1178 	dev = rte_gpu_allocate(pci_dev->device.name);
1179 	if (dev == NULL) {
1180 		rte_errno = ENODEV;
1181 		return -rte_errno;
1182 	}
1183 
1184 	/* Initialize values only for the first CUDA driver call */
1185 	if (dev->mpshared->info.dev_id == 0) {
1186 		mem_alloc_list_head = NULL;
1187 		mem_alloc_list_tail = NULL;
1188 		mem_alloc_list_last_elem = 0;
1189 
1190 		/* Load libcuda.so library */
1191 		if (cuda_loader()) {
1192 			rte_cuda_log(ERR, "CUDA Driver library not found");
1193 			rte_errno = ENOTSUP;
1194 			return -rte_errno;
1195 		}
1196 
1197 		/* Load initial CUDA functions */
1198 		if (cuda_sym_func_loader()) {
1199 			rte_cuda_log(ERR, "CUDA functions not found in library");
1200 			rte_errno = ENOTSUP;
1201 			return -rte_errno;
1202 		}
1203 
1204 		/*
1205 		 * Required to initialize the CUDA Driver.
1206 		 * Multiple calls of cuInit() will return immediately
1207 		 * without making any relevant change
1208 		 */
1209 		sym_cuInit(0);
1210 
1211 		res = sym_cuDriverGetVersion(&cuda_driver_version);
1212 		if (res != 0) {
1213 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1214 			rte_errno = ENOTSUP;
1215 			return -rte_errno;
1216 		}
1217 
1218 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1219 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1220 					"Minimum requirement is %d",
1221 					cuda_driver_version,
1222 					CUDA_DRIVER_MIN_VERSION);
1223 			rte_errno = ENOTSUP;
1224 			return -rte_errno;
1225 		}
1226 
1227 		if (cuda_pfn_func_loader()) {
1228 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1229 			rte_errno = ENOTSUP;
1230 			return -rte_errno;
1231 		}
1232 
1233 		gdrc_h = NULL;
1234 	}
1235 
1236 	/* Fill HW specific part of device structure */
1237 	dev->device = &pci_dev->device;
1238 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1239 
1240 	/* Get NVIDIA GPU Device descriptor */
1241 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1242 	if (res != 0) {
1243 		pfn_cuGetErrorString(res, &(err_string));
1244 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1245 				dev->device->name, res, err_string);
1246 		rte_errno = EPERM;
1247 		return -rte_errno;
1248 	}
1249 
1250 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1251 	if (res != 0) {
1252 		pfn_cuGetErrorString(res, &(err_string));
1253 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1254 				dev->device->name, res, err_string);
1255 		rte_errno = EPERM;
1256 		return -rte_errno;
1257 	}
1258 
1259 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1260 	if (res != 0) {
1261 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1262 		rte_errno = ENOTSUP;
1263 		return -rte_errno;
1264 	}
1265 
1266 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1267 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1268 				cuda_api_version, CUDA_API_MIN_VERSION);
1269 		rte_errno = ENOTSUP;
1270 		return -rte_errno;
1271 	}
1272 
1273 	dev->mpshared->info.context = (uint64_t)pctx;
1274 
1275 	/*
1276 	 * GPU Device generic info
1277 	 */
1278 
1279 	/* Processor count */
1280 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1281 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1282 			cu_dev_id);
1283 	if (res != 0) {
1284 		pfn_cuGetErrorString(res, &(err_string));
1285 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1286 				err_string);
1287 		rte_errno = EPERM;
1288 		return -rte_errno;
1289 	}
1290 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1291 
1292 	/* Total memory */
1293 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1294 	if (res != 0) {
1295 		pfn_cuGetErrorString(res, &(err_string));
1296 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1297 				err_string);
1298 		rte_errno = EPERM;
1299 		return -rte_errno;
1300 	}
1301 
1302 	dev->mpshared->info.page_size = (size_t)GPU_PAGE_SIZE;
1303 
1304 	/*
1305 	 * GPU Device private info
1306 	 */
1307 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1308 			sizeof(struct cuda_info),
1309 			RTE_CACHE_LINE_SIZE);
1310 	if (dev->mpshared->dev_private == NULL) {
1311 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1312 		rte_errno = EPERM;
1313 		return -rte_errno;
1314 	}
1315 
1316 	private = (struct cuda_info *)dev->mpshared->dev_private;
1317 	private->cu_dev = cu_dev_id;
1318 	res = pfn_cuDeviceGetName(private->gpu_name,
1319 			RTE_DEV_NAME_MAX_LEN,
1320 			cu_dev_id);
1321 	if (res != 0) {
1322 		pfn_cuGetErrorString(res, &(err_string));
1323 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1324 				err_string);
1325 		rte_errno = EPERM;
1326 		return -rte_errno;
1327 	}
1328 
1329 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1330 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1331 			cu_dev_id);
1332 	if (res != 0) {
1333 		pfn_cuGetErrorString(res, &(err_string));
1334 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1335 				err_string);
1336 		rte_errno = EPERM;
1337 		return -rte_errno;
1338 	}
1339 
1340 	if (private->gdr_supported == 0)
1341 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1342 				pci_dev->device.name);
1343 
1344 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1345 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1346 			cu_dev_id);
1347 	if (res != 0) {
1348 		pfn_cuGetErrorString(res, &(err_string));
1349 		rte_cuda_log(ERR,
1350 				"cuDeviceGetAttribute failed with %s",
1351 				err_string);
1352 		rte_errno = EPERM;
1353 		return -rte_errno;
1354 	}
1355 
1356 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1357 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1358 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1359 				cu_dev_id);
1360 		if (res != 0) {
1361 			pfn_cuGetErrorString(res, &(err_string));
1362 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1363 					err_string);
1364 			rte_errno = EPERM;
1365 			return -rte_errno;
1366 		}
1367 
1368 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1369 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1370 	}
1371 
1372 	dev->ops.dev_info_get = cuda_dev_info_get;
1373 	dev->ops.dev_close = cuda_dev_close;
1374 	dev->ops.mem_alloc = cuda_mem_alloc;
1375 	dev->ops.mem_free = cuda_mem_free;
1376 	dev->ops.mem_register = cuda_mem_register;
1377 	dev->ops.mem_unregister = cuda_mem_unregister;
1378 	dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1379 	dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1380 	dev->ops.wmb = cuda_wmb;
1381 
1382 	rte_gpu_complete_new(dev);
1383 
1384 	rte_cuda_debug("dev id = %u name = %s",
1385 			dev->mpshared->info.dev_id, private->gpu_name);
1386 
1387 	return 0;
1388 }
1389 
1390 static int
1391 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1392 {
1393 	struct rte_gpu *dev;
1394 	int ret;
1395 	uint8_t gpu_id;
1396 
1397 	if (pci_dev == NULL) {
1398 		rte_errno = ENODEV;
1399 		return -rte_errno;
1400 	}
1401 
1402 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1403 	if (dev == NULL) {
1404 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1405 				pci_dev->device.name);
1406 		rte_errno = ENODEV;
1407 		return -rte_errno;
1408 	}
1409 	gpu_id = dev->mpshared->info.dev_id;
1410 
1411 	/* release dev from library */
1412 	ret = rte_gpu_release(dev);
1413 	if (ret)
1414 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1415 
1416 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1417 
1418 	return 0;
1419 }
1420 
1421 static struct rte_pci_driver rte_cuda_driver = {
1422 	.id_table = pci_id_cuda_map,
1423 	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1424 	.probe = cuda_gpu_probe,
1425 	.remove = cuda_gpu_remove,
1426 };
1427 
1428 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1429 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1430 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1431