xref: /dpdk/drivers/gpu/cuda/cuda.c (revision 2490bb897182f57de80fd924dd3ae48dda819b8c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_common.h>
8 #include <rte_log.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
11 #include <rte_pci.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
14 #include <rte_dev.h>
15 
16 #include <gpudev_driver.h>
17 #include <cuda.h>
18 #include <cudaTypedefs.h>
19 
/*
 * Lowest CUDA driver version accepted by the probe; compared with the value
 * returned by cuDriverGetVersion().
 * NOTE(review): 11040 presumably encodes CUDA 11.4 - confirm.
 */
#define CUDA_DRIVER_MIN_VERSION 11040
/*
 * Lowest CUDA API version accepted by the probe; compared with the value
 * returned by cuCtxGetApiVersion() for the primary context.
 */
#define CUDA_API_MIN_VERSION 3020
22 
/*
 * CUDA Driver functions loaded with dlsym().
 * They are resolved by cuda_sym_func_loader() from the library handle
 * opened by cuda_loader().
 */
static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
		void **pfn, int cudaVersion, uint64_t flags);

/*
 * CUDA Driver functions loaded with cuGetProcAddress for versioning.
 * They are resolved by cuda_pfn_func_loader(), which passes
 * cuda_driver_version as the requested version.
 */
static PFN_cuGetErrorString pfn_cuGetErrorString;
static PFN_cuGetErrorName pfn_cuGetErrorName;
static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
static PFN_cuDeviceGetName pfn_cuDeviceGetName;
static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
static PFN_cuMemAlloc pfn_cuMemAlloc;
static PFN_cuMemFree pfn_cuMemFree;
static PFN_cuMemHostRegister pfn_cuMemHostRegister;
static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
50 
/* Handle of the CUDA driver library, as returned by dlopen() in cuda_loader() */
static void *cudalib;
/* API version reported by cuCtxGetApiVersion() during probe */
static unsigned int cuda_api_version;
/* Driver version reported by cuDriverGetVersion() during probe */
static int cuda_driver_version;
54 
/* NVIDIA GPU vendor */
#define NVIDIA_GPU_VENDOR_ID (0x10de)

/* NVIDIA GPU device IDs, listed in pci_id_cuda_map */
#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)

#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)

#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)

#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)

/* NOTE(review): not referenced by the code visible here - confirm it is still needed */
#define CUDA_MAX_ALLOCATION_NUM 512

/*
 * GPU page granularity: 1 << 16 bytes.
 * NOTE(review): neither macro is referenced by the code visible here.
 */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
74 
/* Log type of this driver, registered with NOTICE as default level */
static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);

/*
 * Helper macro for logging.
 * Emits fmt at RTE_LOG_<level> on cuda_logtype and appends the trailing
 * newline, so callers must not add one themselves.
 */
#define rte_cuda_log(level, fmt, ...) \
	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)

/* Debug logging helper: prefixes the message with the source line and function name */
#define rte_cuda_debug(fmt, ...) \
	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
		##__VA_ARGS__)
84 
/*
 * NVIDIA GPU address map.
 * PCI vendor/device pairs handled by this driver; the table is referenced by
 * rte_cuda_driver.id_table and by RTE_PMD_REGISTER_PCI_TABLE().
 */
static const struct rte_pci_id pci_id_cuda_map[] = {
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_40GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A100_80GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A30_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_A10_24GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_32GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_V100_16GB_DEVICE_ID)
	},
	{
		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
				NVIDIA_GPU_T4_16GB_DEVICE_ID)
	},
	/* All-zero last entry; presumably the sentinel ending the table - confirm against the PCI bus code */
	{
		.device_id = 0
	}
};
119 
/*
 * Device private info.
 * Allocated with rte_zmalloc() and stored in dev->mpshared->dev_private,
 * either by the probe or by cuda_dev_info_get() for a child device.
 */
struct cuda_info {
	/* Device name as returned by cuDeviceGetName() */
	char gpu_name[RTE_DEV_NAME_MAX_LEN];
	/* CUDA device handle */
	CUdevice cu_dev;
	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED (set by the probe) */
	int gdr_supported;
	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING (set by the probe) */
	int gdr_write_ordering;
	/*
	 * Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS;
	 * the probe queries it only when gdr_write_ordering is
	 * CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE.
	 */
	int gdr_flush_type;
};
128 
/* Kind of memory area tracked in the driver memory list */
enum mem_type {
	GPU_MEM = 0, /* Allocated by cuda_mem_alloc() through cuMemAlloc */
	CPU_REGISTERED, /* Host memory registered by cuda_mem_register() */
	GPU_REGISTERED /* Not used yet */
};
135 
/*
 * Key associated to a memory address.
 * get_hash_from_ptr() produces it by converting the address to an integer.
 */
typedef uintptr_t cuda_ptr_key;
138 
/* Single entry of the memory list (doubly linked) */
struct mem_entry {
	/* Device address of the area */
	CUdeviceptr ptr_d;
	/* Host address for registered host memory, NULL for GPU allocations */
	void *ptr_h;
	/* Size passed by the caller at allocation/registration time */
	size_t size;
	/* Device the area was allocated or registered on */
	struct rte_gpu *dev;
	/* CUDA context taken from dev->mpshared->info.context */
	CUcontext ctx;
	/* Lookup key: ptr_d for GPU_MEM entries, ptr_h for CPU_REGISTERED ones */
	cuda_ptr_key pkey;
	/* Kind of memory described by this entry */
	enum mem_type mtype;
	/* Neighbours in the list; NULL at the respective end */
	struct mem_entry *prev;
	struct mem_entry *next;
};
151 
/* First entry of the memory list, NULL when the list is empty */
static struct mem_entry *mem_alloc_list_head;
/* Last entry of the memory list; new entries are appended after it */
static struct mem_entry *mem_alloc_list_tail;
/* Number of entries in the list (incremented on add, decremented on delete) */
static uint32_t mem_alloc_list_last_elem;
155 
156 /* Load the CUDA symbols */
157 
158 static int
159 cuda_loader(void)
160 {
161 	char cuda_path[1024];
162 
163 	if (getenv("CUDA_PATH_L") == NULL)
164 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
165 	else
166 		snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
167 
168 	cudalib = dlopen(cuda_path, RTLD_LAZY);
169 	if (cudalib == NULL) {
170 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
171 				cuda_path, getenv("CUDA_PATH_L"));
172 		return -1;
173 	}
174 
175 	return 0;
176 }
177 
178 static int
179 cuda_sym_func_loader(void)
180 {
181 	if (cudalib == NULL)
182 		return -1;
183 
184 	sym_cuInit = dlsym(cudalib, "cuInit");
185 	if (sym_cuInit == NULL) {
186 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
187 		return -1;
188 	}
189 
190 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
191 	if (sym_cuDriverGetVersion == NULL) {
192 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
193 		return -1;
194 	}
195 
196 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
197 	if (sym_cuGetProcAddress == NULL) {
198 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
199 		return -1;
200 	}
201 
202 	return 0;
203 }
204 
205 static int
206 cuda_pfn_func_loader(void)
207 {
208 	CUresult res;
209 
210 	res = sym_cuGetProcAddress("cuGetErrorString",
211 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
212 	if (res != 0) {
213 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
214 		return -1;
215 	}
216 
217 	res = sym_cuGetProcAddress("cuGetErrorName",
218 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
219 	if (res != 0) {
220 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
221 		return -1;
222 	}
223 
224 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
225 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
226 	if (res != 0) {
227 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
228 		return -1;
229 	}
230 
231 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
232 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
233 	if (res != 0) {
234 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
235 		return -1;
236 	}
237 
238 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
239 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
240 	if (res != 0) {
241 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
242 		return -1;
243 	}
244 
245 	res = sym_cuGetProcAddress("cuDeviceGetName",
246 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
247 	if (res != 0) {
248 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
249 		return -1;
250 	}
251 
252 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
253 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
254 	if (res != 0) {
255 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
256 		return -1;
257 	}
258 
259 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
260 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
261 	if (res != 0) {
262 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
263 		return -1;
264 	}
265 
266 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
267 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
268 	if (res != 0) {
269 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
270 		return -1;
271 	}
272 
273 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
274 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
275 	if (res != 0) {
276 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
277 		return -1;
278 	}
279 
280 	res = sym_cuGetProcAddress("cuCtxGetDevice",
281 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
282 	if (res != 0) {
283 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
284 		return -1;
285 	}
286 
287 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
288 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
289 	if (res != 0) {
290 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
291 		return -1;
292 	}
293 
294 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
295 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
296 	if (res != 0) {
297 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
298 		return -1;
299 	}
300 
301 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
302 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
303 	if (res != 0) {
304 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
305 		return -1;
306 	}
307 
308 	res = sym_cuGetProcAddress("cuMemAlloc",
309 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
310 	if (res != 0) {
311 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
312 		return -1;
313 	}
314 
315 	res = sym_cuGetProcAddress("cuMemFree",
316 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
317 	if (res != 0) {
318 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
319 		return -1;
320 	}
321 
322 	res = sym_cuGetProcAddress("cuMemHostRegister",
323 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
324 	if (res != 0) {
325 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
326 		return -1;
327 	}
328 
329 	res = sym_cuGetProcAddress("cuMemHostUnregister",
330 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
331 	if (res != 0) {
332 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
333 		return -1;
334 	}
335 
336 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
337 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
338 	if (res != 0) {
339 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
340 		return -1;
341 	}
342 
343 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
344 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
345 	if (res != 0) {
346 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
347 		return -1;
348 	}
349 
350 	return 0;
351 }
352 
353 /* Generate a key from a memory pointer */
354 static cuda_ptr_key
355 get_hash_from_ptr(void *ptr)
356 {
357 	return (uintptr_t)ptr;
358 }
359 
360 static uint32_t
361 mem_list_count_item(void)
362 {
363 	return mem_alloc_list_last_elem;
364 }
365 
366 /* Initiate list of memory allocations if not done yet */
367 static struct mem_entry *
368 mem_list_add_item(void)
369 {
370 	/* Initiate list of memory allocations if not done yet */
371 	if (mem_alloc_list_head == NULL) {
372 		mem_alloc_list_head = rte_zmalloc(NULL,
373 				sizeof(struct mem_entry),
374 				RTE_CACHE_LINE_SIZE);
375 		if (mem_alloc_list_head == NULL) {
376 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
377 			return NULL;
378 		}
379 
380 		mem_alloc_list_head->next = NULL;
381 		mem_alloc_list_head->prev = NULL;
382 		mem_alloc_list_tail = mem_alloc_list_head;
383 	} else {
384 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
385 				sizeof(struct mem_entry),
386 				RTE_CACHE_LINE_SIZE);
387 
388 		if (mem_alloc_list_cur == NULL) {
389 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
390 			return NULL;
391 		}
392 
393 		mem_alloc_list_tail->next = mem_alloc_list_cur;
394 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
395 		mem_alloc_list_tail = mem_alloc_list_tail->next;
396 		mem_alloc_list_tail->next = NULL;
397 	}
398 
399 	mem_alloc_list_last_elem++;
400 
401 	return mem_alloc_list_tail;
402 }
403 
404 static struct mem_entry *
405 mem_list_find_item(cuda_ptr_key pk)
406 {
407 	struct mem_entry *mem_alloc_list_cur = NULL;
408 
409 	if (mem_alloc_list_head == NULL) {
410 		rte_cuda_log(ERR, "Memory list doesn't exist");
411 		return NULL;
412 	}
413 
414 	if (mem_list_count_item() == 0) {
415 		rte_cuda_log(ERR, "No items in memory list");
416 		return NULL;
417 	}
418 
419 	mem_alloc_list_cur = mem_alloc_list_head;
420 
421 	while (mem_alloc_list_cur != NULL) {
422 		if (mem_alloc_list_cur->pkey == pk)
423 			return mem_alloc_list_cur;
424 		mem_alloc_list_cur = mem_alloc_list_cur->next;
425 	}
426 
427 	return mem_alloc_list_cur;
428 }
429 
430 static int
431 mem_list_del_item(cuda_ptr_key pk)
432 {
433 	struct mem_entry *mem_alloc_list_cur = NULL;
434 
435 	mem_alloc_list_cur = mem_list_find_item(pk);
436 	if (mem_alloc_list_cur == NULL)
437 		return -EINVAL;
438 
439 	/* if key is in head */
440 	if (mem_alloc_list_cur->prev == NULL)
441 		mem_alloc_list_head = mem_alloc_list_cur->next;
442 	else {
443 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
444 		if (mem_alloc_list_cur->next != NULL)
445 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
446 	}
447 
448 	rte_free(mem_alloc_list_cur);
449 
450 	mem_alloc_list_last_elem--;
451 
452 	return 0;
453 }
454 
455 static int
456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
457 {
458 	int ret = 0;
459 	CUresult res;
460 	struct rte_gpu_info parent_info;
461 	CUexecAffinityParam affinityPrm;
462 	const char *err_string;
463 	struct cuda_info *private;
464 	CUcontext current_ctx;
465 	CUcontext input_ctx;
466 
467 	if (dev == NULL)
468 		return -ENODEV;
469 
470 	/* Child initialization time probably called by rte_gpu_add_child() */
471 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
472 			dev->mpshared->dev_private == NULL) {
473 		/* Store current ctx */
474 		res = pfn_cuCtxGetCurrent(&current_ctx);
475 		if (res != 0) {
476 			pfn_cuGetErrorString(res, &(err_string));
477 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
478 					err_string);
479 			return -EPERM;
480 		}
481 
482 		/* Set child ctx as current ctx */
483 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
484 		res = pfn_cuCtxSetCurrent(input_ctx);
485 		if (res != 0) {
486 			pfn_cuGetErrorString(res, &(err_string));
487 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
488 					err_string);
489 			return -EPERM;
490 		}
491 
492 		/*
493 		 * Ctx capacity info
494 		 */
495 
496 		/* MPS compatible */
497 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
498 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
499 		if (res != 0) {
500 			pfn_cuGetErrorString(res, &(err_string));
501 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
502 					err_string);
503 		}
504 		dev->mpshared->info.processor_count =
505 				(uint32_t)affinityPrm.param.smCount.val;
506 
507 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
508 		if (ret)
509 			return -ENODEV;
510 		dev->mpshared->info.total_memory = parent_info.total_memory;
511 
512 		/*
513 		 * GPU Device private info
514 		 */
515 		dev->mpshared->dev_private = rte_zmalloc(NULL,
516 				sizeof(struct cuda_info),
517 				RTE_CACHE_LINE_SIZE);
518 		if (dev->mpshared->dev_private == NULL) {
519 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
520 			return -EPERM;
521 		}
522 
523 		private = (struct cuda_info *)dev->mpshared->dev_private;
524 
525 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
526 		if (res != 0) {
527 			pfn_cuGetErrorString(res, &(err_string));
528 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
529 					err_string);
530 			return -EPERM;
531 		}
532 
533 		res = pfn_cuDeviceGetName(private->gpu_name,
534 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
535 		if (res != 0) {
536 			pfn_cuGetErrorString(res, &(err_string));
537 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
538 					err_string);
539 			return -EPERM;
540 		}
541 
542 		/* Restore original ctx as current ctx */
543 		res = pfn_cuCtxSetCurrent(current_ctx);
544 		if (res != 0) {
545 			pfn_cuGetErrorString(res, &(err_string));
546 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
547 					err_string);
548 			return -EPERM;
549 		}
550 	}
551 
552 	*info = dev->mpshared->info;
553 
554 	return 0;
555 }
556 
557 /*
558  * GPU Memory
559  */
560 
561 static int
562 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
563 {
564 	CUresult res;
565 	const char *err_string;
566 	CUcontext current_ctx;
567 	CUcontext input_ctx;
568 	unsigned int flag = 1;
569 
570 	if (dev == NULL)
571 		return -ENODEV;
572 	if (size == 0)
573 		return -EINVAL;
574 
575 	/* Store current ctx */
576 	res = pfn_cuCtxGetCurrent(&current_ctx);
577 	if (res != 0) {
578 		pfn_cuGetErrorString(res, &(err_string));
579 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
580 				err_string);
581 		return -EPERM;
582 	}
583 
584 	/* Set child ctx as current ctx */
585 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
586 	res = pfn_cuCtxSetCurrent(input_ctx);
587 	if (res != 0) {
588 		pfn_cuGetErrorString(res, &(err_string));
589 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
590 				err_string);
591 		return -EPERM;
592 	}
593 
594 	/* Get next memory list item */
595 	mem_alloc_list_tail = mem_list_add_item();
596 	if (mem_alloc_list_tail == NULL)
597 		return -ENOMEM;
598 
599 	/* Allocate memory */
600 	mem_alloc_list_tail->size = size;
601 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
602 			mem_alloc_list_tail->size);
603 	if (res != 0) {
604 		pfn_cuGetErrorString(res, &(err_string));
605 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
606 				err_string);
607 		return -EPERM;
608 	}
609 
610 	/* GPUDirect RDMA attribute required */
611 	res = pfn_cuPointerSetAttribute(&flag,
612 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
613 			mem_alloc_list_tail->ptr_d);
614 	if (res != 0) {
615 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
616 				"GPU memory at  %"PRIu32", err %d",
617 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
618 		return -EPERM;
619 	}
620 
621 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
622 	mem_alloc_list_tail->ptr_h = NULL;
623 	mem_alloc_list_tail->size = size;
624 	mem_alloc_list_tail->dev = dev;
625 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
626 	mem_alloc_list_tail->mtype = GPU_MEM;
627 
628 	/* Restore original ctx as current ctx */
629 	res = pfn_cuCtxSetCurrent(current_ctx);
630 	if (res != 0) {
631 		pfn_cuGetErrorString(res, &(err_string));
632 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
633 				err_string);
634 		return -EPERM;
635 	}
636 
637 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
638 
639 	return 0;
640 }
641 
642 static int
643 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
644 {
645 	CUresult res;
646 	const char *err_string;
647 	CUcontext current_ctx;
648 	CUcontext input_ctx;
649 	unsigned int flag = 1;
650 	int use_ptr_h = 0;
651 
652 	if (dev == NULL)
653 		return -ENODEV;
654 
655 	if (size == 0 || ptr == NULL)
656 		return -EINVAL;
657 
658 	/* Store current ctx */
659 	res = pfn_cuCtxGetCurrent(&current_ctx);
660 	if (res != 0) {
661 		pfn_cuGetErrorString(res, &(err_string));
662 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
663 				err_string);
664 		return -EPERM;
665 	}
666 
667 	/* Set child ctx as current ctx */
668 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
669 	res = pfn_cuCtxSetCurrent(input_ctx);
670 	if (res != 0) {
671 		pfn_cuGetErrorString(res, &(err_string));
672 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
673 				err_string);
674 		return -EPERM;
675 	}
676 
677 	/* Get next memory list item */
678 	mem_alloc_list_tail = mem_list_add_item();
679 	if (mem_alloc_list_tail == NULL)
680 		return -ENOMEM;
681 
682 	/* Allocate memory */
683 	mem_alloc_list_tail->size = size;
684 	mem_alloc_list_tail->ptr_h = ptr;
685 
686 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
687 			mem_alloc_list_tail->size,
688 			CU_MEMHOSTREGISTER_PORTABLE |
689 			CU_MEMHOSTREGISTER_DEVICEMAP);
690 	if (res != 0) {
691 		pfn_cuGetErrorString(res, &(err_string));
692 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
693 				err_string,
694 				mem_alloc_list_tail->ptr_h,
695 				mem_alloc_list_tail->size);
696 		return -EPERM;
697 	}
698 
699 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
700 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
701 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
702 	if (res != 0) {
703 		pfn_cuGetErrorString(res, &(err_string));
704 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
705 				err_string);
706 		return -EPERM;
707 	}
708 
709 	if (use_ptr_h == 0) {
710 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
711 				mem_alloc_list_tail->ptr_h, 0);
712 		if (res != 0) {
713 			pfn_cuGetErrorString(res, &(err_string));
714 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
715 					err_string);
716 			return -EPERM;
717 		}
718 
719 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
720 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
721 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
722 			return -ENOTSUP;
723 		}
724 	} else {
725 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
726 	}
727 
728 	/* GPUDirect RDMA attribute required */
729 	res = pfn_cuPointerSetAttribute(&flag,
730 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
731 			mem_alloc_list_tail->ptr_d);
732 	if (res != 0) {
733 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
734 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
735 		return -EPERM;
736 	}
737 
738 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
739 	mem_alloc_list_tail->size = size;
740 	mem_alloc_list_tail->dev = dev;
741 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
742 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
743 
744 	/* Restore original ctx as current ctx */
745 	res = pfn_cuCtxSetCurrent(current_ctx);
746 	if (res != 0) {
747 		pfn_cuGetErrorString(res, &(err_string));
748 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
749 				err_string);
750 		return -EPERM;
751 	}
752 
753 	return 0;
754 }
755 
756 static int
757 cuda_mem_free(struct rte_gpu *dev, void *ptr)
758 {
759 	CUresult res;
760 	struct mem_entry *mem_item;
761 	const char *err_string;
762 	cuda_ptr_key hk;
763 
764 	if (dev == NULL)
765 		return -ENODEV;
766 
767 	if (ptr == NULL)
768 		return -EINVAL;
769 
770 	hk = get_hash_from_ptr((void *)ptr);
771 
772 	mem_item = mem_list_find_item(hk);
773 	if (mem_item == NULL) {
774 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
775 		return -EPERM;
776 	}
777 
778 	if (mem_item->mtype == GPU_MEM) {
779 		res = pfn_cuMemFree(mem_item->ptr_d);
780 		if (res != 0) {
781 			pfn_cuGetErrorString(res, &(err_string));
782 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
783 					err_string);
784 			return -EPERM;
785 		}
786 
787 		return mem_list_del_item(hk);
788 	}
789 
790 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
791 
792 	return -EPERM;
793 }
794 
795 static int
796 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
797 {
798 	CUresult res;
799 	struct mem_entry *mem_item;
800 	const char *err_string;
801 	cuda_ptr_key hk;
802 
803 	if (dev == NULL)
804 		return -ENODEV;
805 
806 	if (ptr == NULL)
807 		return -EINVAL;
808 
809 	hk = get_hash_from_ptr((void *)ptr);
810 
811 	mem_item = mem_list_find_item(hk);
812 	if (mem_item == NULL) {
813 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
814 		return -EPERM;
815 	}
816 
817 	if (mem_item->mtype == CPU_REGISTERED) {
818 		res = pfn_cuMemHostUnregister(ptr);
819 		if (res != 0) {
820 			pfn_cuGetErrorString(res, &(err_string));
821 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
822 					err_string);
823 			return -EPERM;
824 		}
825 
826 		return mem_list_del_item(hk);
827 	}
828 
829 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
830 
831 	return -EPERM;
832 }
833 
834 static int
835 cuda_dev_close(struct rte_gpu *dev)
836 {
837 	if (dev == NULL)
838 		return -EINVAL;
839 
840 	rte_free(dev->mpshared->dev_private);
841 
842 	return 0;
843 }
844 
845 static int
846 cuda_wmb(struct rte_gpu *dev)
847 {
848 	CUresult res;
849 	const char *err_string;
850 	CUcontext current_ctx;
851 	CUcontext input_ctx;
852 	struct cuda_info *private;
853 
854 	if (dev == NULL)
855 		return -ENODEV;
856 
857 	private = (struct cuda_info *)dev->mpshared->dev_private;
858 
859 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
860 		/*
861 		 * No need to explicitly force the write ordering because
862 		 * the device natively supports it
863 		 */
864 		return 0;
865 	}
866 
867 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
868 		/*
869 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
870 		 * Application needs to use alternative methods.
871 		 */
872 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
873 				"Application needs to use alternative methods.");
874 		return -ENOTSUP;
875 	}
876 
877 	/* Store current ctx */
878 	res = pfn_cuCtxGetCurrent(&current_ctx);
879 	if (res != 0) {
880 		pfn_cuGetErrorString(res, &(err_string));
881 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
882 				err_string);
883 		return -EPERM;
884 	}
885 
886 	/* Set child ctx as current ctx */
887 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
888 	res = pfn_cuCtxSetCurrent(input_ctx);
889 	if (res != 0) {
890 		pfn_cuGetErrorString(res, &(err_string));
891 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
892 				err_string);
893 		return -EPERM;
894 	}
895 
896 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
897 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
898 	if (res != 0) {
899 		pfn_cuGetErrorString(res, &(err_string));
900 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
901 				err_string);
902 		return -EPERM;
903 	}
904 
905 	/* Restore original ctx as current ctx */
906 	res = pfn_cuCtxSetCurrent(current_ctx);
907 	if (res != 0) {
908 		pfn_cuGetErrorString(res, &(err_string));
909 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
910 				err_string);
911 		return -EPERM;
912 	}
913 
914 	return 0;
915 }
916 
917 static int
918 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
919 {
920 	struct rte_gpu *dev = NULL;
921 	CUresult res;
922 	CUdevice cu_dev_id;
923 	CUcontext pctx;
924 	char dev_name[RTE_DEV_NAME_MAX_LEN];
925 	const char *err_string;
926 	int processor_count = 0;
927 	struct cuda_info *private;
928 
929 	if (pci_dev == NULL) {
930 		rte_cuda_log(ERR, "NULL PCI device");
931 		return -EINVAL;
932 	}
933 
934 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
935 
936 	/* Allocate memory to be used privately by drivers */
937 	dev = rte_gpu_allocate(pci_dev->device.name);
938 	if (dev == NULL)
939 		return -ENODEV;
940 
941 	/* Initialize values only for the first CUDA driver call */
942 	if (dev->mpshared->info.dev_id == 0) {
943 		mem_alloc_list_head = NULL;
944 		mem_alloc_list_tail = NULL;
945 		mem_alloc_list_last_elem = 0;
946 
947 		/* Load libcuda.so library */
948 		if (cuda_loader()) {
949 			rte_cuda_log(ERR, "CUDA Driver library not found");
950 			return -ENOTSUP;
951 		}
952 
953 		/* Load initial CUDA functions */
954 		if (cuda_sym_func_loader()) {
955 			rte_cuda_log(ERR, "CUDA functions not found in library");
956 			return -ENOTSUP;
957 		}
958 
959 		/*
960 		 * Required to initialize the CUDA Driver.
961 		 * Multiple calls of cuInit() will return immediately
962 		 * without making any relevant change
963 		 */
964 		sym_cuInit(0);
965 
966 		res = sym_cuDriverGetVersion(&cuda_driver_version);
967 		if (res != 0) {
968 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
969 			return -ENOTSUP;
970 		}
971 
972 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
973 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
974 					"Minimum requirement is %d",
975 					cuda_driver_version,
976 					CUDA_DRIVER_MIN_VERSION);
977 			return -ENOTSUP;
978 		}
979 
980 		if (cuda_pfn_func_loader()) {
981 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
982 			return -ENOTSUP;
983 		}
984 	}
985 
986 	/* Fill HW specific part of device structure */
987 	dev->device = &pci_dev->device;
988 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
989 
990 	/* Get NVIDIA GPU Device descriptor */
991 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
992 	if (res != 0) {
993 		pfn_cuGetErrorString(res, &(err_string));
994 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
995 				dev->device->name, res, err_string);
996 		return -EPERM;
997 	}
998 
999 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1000 	if (res != 0) {
1001 		pfn_cuGetErrorString(res, &(err_string));
1002 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1003 				dev->device->name, res, err_string);
1004 		return -EPERM;
1005 	}
1006 
1007 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1008 	if (res != 0) {
1009 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1010 		return -ENOTSUP;
1011 	}
1012 
1013 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1014 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1015 				cuda_api_version, CUDA_API_MIN_VERSION);
1016 		return -ENOTSUP;
1017 	}
1018 
1019 	dev->mpshared->info.context = (uint64_t)pctx;
1020 
1021 	/*
1022 	 * GPU Device generic info
1023 	 */
1024 
1025 	/* Processor count */
1026 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1027 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1028 			cu_dev_id);
1029 	if (res != 0) {
1030 		pfn_cuGetErrorString(res, &(err_string));
1031 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1032 				err_string);
1033 		return -EPERM;
1034 	}
1035 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1036 
1037 	/* Total memory */
1038 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1039 	if (res != 0) {
1040 		pfn_cuGetErrorString(res, &(err_string));
1041 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1042 				err_string);
1043 		return -EPERM;
1044 	}
1045 
1046 	/*
1047 	 * GPU Device private info
1048 	 */
1049 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1050 			sizeof(struct cuda_info),
1051 			RTE_CACHE_LINE_SIZE);
1052 	if (dev->mpshared->dev_private == NULL) {
1053 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1054 		return -ENOMEM;
1055 	}
1056 
1057 	private = (struct cuda_info *)dev->mpshared->dev_private;
1058 	private->cu_dev = cu_dev_id;
1059 	res = pfn_cuDeviceGetName(private->gpu_name,
1060 			RTE_DEV_NAME_MAX_LEN,
1061 			cu_dev_id);
1062 	if (res != 0) {
1063 		pfn_cuGetErrorString(res, &(err_string));
1064 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1065 				err_string);
1066 		return -EPERM;
1067 	}
1068 
1069 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1070 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1071 			cu_dev_id);
1072 	if (res != 0) {
1073 		pfn_cuGetErrorString(res, &(err_string));
1074 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1075 				err_string);
1076 		return -EPERM;
1077 	}
1078 
1079 	if (private->gdr_supported == 0)
1080 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1081 				pci_dev->device.name);
1082 
1083 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1084 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1085 			cu_dev_id);
1086 	if (res != 0) {
1087 		pfn_cuGetErrorString(res, &(err_string));
1088 		rte_cuda_log(ERR,
1089 				"cuDeviceGetAttribute failed with %s",
1090 				err_string);
1091 		return -EPERM;
1092 	}
1093 
1094 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1095 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1096 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1097 				cu_dev_id);
1098 		if (res != 0) {
1099 			pfn_cuGetErrorString(res, &(err_string));
1100 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1101 					err_string);
1102 			return -EPERM;
1103 		}
1104 
1105 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1106 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1107 	}
1108 
1109 	dev->ops.dev_info_get = cuda_dev_info_get;
1110 	dev->ops.dev_close = cuda_dev_close;
1111 	dev->ops.mem_alloc = cuda_mem_alloc;
1112 	dev->ops.mem_free = cuda_mem_free;
1113 	dev->ops.mem_register = cuda_mem_register;
1114 	dev->ops.mem_unregister = cuda_mem_unregister;
1115 	dev->ops.wmb = cuda_wmb;
1116 
1117 	rte_gpu_complete_new(dev);
1118 
1119 	rte_cuda_debug("dev id = %u name = %s",
1120 			dev->mpshared->info.dev_id, private->gpu_name);
1121 
1122 	return 0;
1123 }
1124 
1125 static int
1126 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1127 {
1128 	struct rte_gpu *dev;
1129 	int ret;
1130 	uint8_t gpu_id;
1131 
1132 	if (pci_dev == NULL)
1133 		return -EINVAL;
1134 
1135 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1136 	if (dev == NULL) {
1137 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1138 				pci_dev->device.name);
1139 		return -ENODEV;
1140 	}
1141 	gpu_id = dev->mpshared->info.dev_id;
1142 
1143 	/* release dev from library */
1144 	ret = rte_gpu_release(dev);
1145 	if (ret)
1146 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1147 
1148 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1149 
1150 	return 0;
1151 }
1152 
/*
 * PCI driver descriptor: matches the devices listed in pci_id_cuda_map and
 * routes probe/remove to cuda_gpu_probe()/cuda_gpu_remove().
 */
static struct rte_pci_driver rte_cuda_driver = {
	.id_table = pci_id_cuda_map,
	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
	.probe = cuda_gpu_probe,
	.remove = cuda_gpu_remove,
};

/* Register the driver, its PCI id table and its kernel module dependencies under the name gpu_cuda */
RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1163