xref: /dpdk/drivers/gpu/cuda/cuda.c (revision f4eac3a09c51a1a2dab1f2fd3a10fe0619286a0d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4 
5 #include <dlfcn.h>
6 
7 #include <rte_common.h>
8 #include <rte_log.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
11 #include <rte_pci.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
14 #include <rte_dev.h>
15 
16 #include <gpudev_driver.h>
17 #include <cuda.h>
18 #include <cudaTypedefs.h>
19 
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
22 
23 /* CUDA Driver functions loaded with dlsym() */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 		void **pfn, int cudaVersion, uint64_t flags);
28 
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
50 
51 static void *cudalib;
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
54 
55 /* NVIDIA GPU vendor */
56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
57 
58 /* NVIDIA GPU device IDs */
59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
61 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)
62 
63 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
64 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
65 
66 #define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
67 #define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
68 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
69 
70 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
71 
72 #define CUDA_MAX_ALLOCATION_NUM 512
73 
74 #define GPU_PAGE_SHIFT 16
75 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
76 
77 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
78 
79 /* Helper macro for logging */
80 #define rte_cuda_log(level, fmt, ...) \
81 	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
82 
83 #define rte_cuda_debug(fmt, ...) \
84 	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
85 		##__VA_ARGS__)
86 
87 /* NVIDIA GPU address map */
88 static const struct rte_pci_id pci_id_cuda_map[] = {
89 	{
90 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
91 				NVIDIA_GPU_A100_40GB_DEVICE_ID)
92 	},
93 	{
94 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
95 				NVIDIA_GPU_A100_80GB_DEVICE_ID)
96 	},
97 	{
98 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
99 				NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
100 	},
101 	{
102 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
103 				NVIDIA_GPU_A30_24GB_DEVICE_ID)
104 	},
105 	{
106 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
107 				NVIDIA_GPU_A10_24GB_DEVICE_ID)
108 	},
109 	{
110 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
111 				NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
112 	},
113 	{
114 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
115 				NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
116 	},
117 	{
118 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
119 				NVIDIA_GPU_V100_16GB_DEVICE_ID)
120 	},
121 	{
122 		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
123 				NVIDIA_GPU_T4_16GB_DEVICE_ID)
124 	},
125 	{
126 		.device_id = 0
127 	}
128 };
129 
130 /* Device private info */
131 struct cuda_info {
132 	char gpu_name[RTE_DEV_NAME_MAX_LEN];
133 	CUdevice cu_dev;
134 	int gdr_supported;
135 	int gdr_write_ordering;
136 	int gdr_flush_type;
137 };
138 
139 /* Type of memory allocated by CUDA driver */
enum mem_type {
	/* Device memory obtained with cuMemAlloc() */
	GPU_MEM = 0,
	/* Host memory registered with cuMemHostRegister() */
	CPU_REGISTERED = 1,
	/* Not used yet */
	GPU_REGISTERED = 2
};
145 
146 /* key associated to a memory address */
147 typedef uintptr_t cuda_ptr_key;
148 
149 /* Single entry of the memory list */
150 struct mem_entry {
151 	CUdeviceptr ptr_d;
152 	CUdeviceptr ptr_orig_d;
153 	void *ptr_h;
154 	size_t size;
155 	size_t size_orig;
156 	struct rte_gpu *dev;
157 	CUcontext ctx;
158 	cuda_ptr_key pkey;
159 	enum mem_type mtype;
160 	struct mem_entry *prev;
161 	struct mem_entry *next;
162 };
163 
164 static struct mem_entry *mem_alloc_list_head;
165 static struct mem_entry *mem_alloc_list_tail;
166 static uint32_t mem_alloc_list_last_elem;
167 
168 /* Load the CUDA symbols */
169 
170 static int
171 cuda_loader(void)
172 {
173 	char cuda_path[1024];
174 
175 	if (getenv("CUDA_PATH_L") == NULL)
176 		snprintf(cuda_path, 1024, "%s", "libcuda.so");
177 	else
178 		snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
179 
180 	cudalib = dlopen(cuda_path, RTLD_LAZY);
181 	if (cudalib == NULL) {
182 		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
183 				cuda_path, getenv("CUDA_PATH_L"));
184 		return -1;
185 	}
186 
187 	return 0;
188 }
189 
190 static int
191 cuda_sym_func_loader(void)
192 {
193 	if (cudalib == NULL)
194 		return -1;
195 
196 	sym_cuInit = dlsym(cudalib, "cuInit");
197 	if (sym_cuInit == NULL) {
198 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
199 		return -1;
200 	}
201 
202 	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
203 	if (sym_cuDriverGetVersion == NULL) {
204 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
205 		return -1;
206 	}
207 
208 	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
209 	if (sym_cuGetProcAddress == NULL) {
210 		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
211 		return -1;
212 	}
213 
214 	return 0;
215 }
216 
217 static int
218 cuda_pfn_func_loader(void)
219 {
220 	CUresult res;
221 
222 	res = sym_cuGetProcAddress("cuGetErrorString",
223 			(void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
224 	if (res != 0) {
225 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
226 		return -1;
227 	}
228 
229 	res = sym_cuGetProcAddress("cuGetErrorName",
230 			(void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
231 	if (res != 0) {
232 		rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
233 		return -1;
234 	}
235 
236 	res = sym_cuGetProcAddress("cuPointerSetAttribute",
237 			(void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
238 	if (res != 0) {
239 		rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
240 		return -1;
241 	}
242 
243 	res = sym_cuGetProcAddress("cuDeviceGetAttribute",
244 			(void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
245 	if (res != 0) {
246 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
247 		return -1;
248 	}
249 
250 	res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
251 			(void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
252 	if (res != 0) {
253 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
254 		return -1;
255 	}
256 
257 	res = sym_cuGetProcAddress("cuDeviceGetName",
258 			(void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
259 	if (res != 0) {
260 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
261 		return -1;
262 	}
263 
264 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
265 			(void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
266 	if (res != 0) {
267 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
268 		return -1;
269 	}
270 
271 	res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
272 			(void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
273 	if (res != 0) {
274 		rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
275 		return -1;
276 	}
277 
278 	res = sym_cuGetProcAddress("cuDeviceTotalMem",
279 			(void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
280 	if (res != 0) {
281 		rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
282 		return -1;
283 	}
284 
285 	res = sym_cuGetProcAddress("cuCtxGetApiVersion",
286 			(void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
287 	if (res != 0) {
288 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
289 		return -1;
290 	}
291 
292 	res = sym_cuGetProcAddress("cuCtxGetDevice",
293 			(void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
294 	if (res != 0) {
295 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
296 		return -1;
297 	}
298 
299 	res = sym_cuGetProcAddress("cuCtxSetCurrent",
300 			(void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
301 	if (res != 0) {
302 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
303 		return -1;
304 	}
305 
306 	res = sym_cuGetProcAddress("cuCtxGetCurrent",
307 			(void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
308 	if (res != 0) {
309 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
310 		return -1;
311 	}
312 
313 	res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
314 			(void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
315 	if (res != 0) {
316 		rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
317 		return -1;
318 	}
319 
320 	res = sym_cuGetProcAddress("cuMemAlloc",
321 			(void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
322 	if (res != 0) {
323 		rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
324 		return -1;
325 	}
326 
327 	res = sym_cuGetProcAddress("cuMemFree",
328 			(void **)(&pfn_cuMemFree), cuda_driver_version, 0);
329 	if (res != 0) {
330 		rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
331 		return -1;
332 	}
333 
334 	res = sym_cuGetProcAddress("cuMemHostRegister",
335 			(void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
336 	if (res != 0) {
337 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
338 		return -1;
339 	}
340 
341 	res = sym_cuGetProcAddress("cuMemHostUnregister",
342 			(void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
343 	if (res != 0) {
344 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
345 		return -1;
346 	}
347 
348 	res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
349 			(void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
350 	if (res != 0) {
351 		rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
352 		return -1;
353 	}
354 
355 	res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
356 			(void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
357 	if (res != 0) {
358 		rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
359 		return -1;
360 	}
361 
362 	return 0;
363 }
364 
365 /* Generate a key from a memory pointer */
366 static cuda_ptr_key
367 get_hash_from_ptr(void *ptr)
368 {
369 	return (uintptr_t)ptr;
370 }
371 
372 static uint32_t
373 mem_list_count_item(void)
374 {
375 	return mem_alloc_list_last_elem;
376 }
377 
378 /* Initiate list of memory allocations if not done yet */
379 static struct mem_entry *
380 mem_list_add_item(void)
381 {
382 	/* Initiate list of memory allocations if not done yet */
383 	if (mem_alloc_list_head == NULL) {
384 		mem_alloc_list_head = rte_zmalloc(NULL,
385 				sizeof(struct mem_entry),
386 				RTE_CACHE_LINE_SIZE);
387 		if (mem_alloc_list_head == NULL) {
388 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
389 			return NULL;
390 		}
391 
392 		mem_alloc_list_head->next = NULL;
393 		mem_alloc_list_head->prev = NULL;
394 		mem_alloc_list_tail = mem_alloc_list_head;
395 	} else {
396 		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
397 				sizeof(struct mem_entry),
398 				RTE_CACHE_LINE_SIZE);
399 
400 		if (mem_alloc_list_cur == NULL) {
401 			rte_cuda_log(ERR, "Failed to allocate memory for memory list");
402 			return NULL;
403 		}
404 
405 		mem_alloc_list_tail->next = mem_alloc_list_cur;
406 		mem_alloc_list_cur->prev = mem_alloc_list_tail;
407 		mem_alloc_list_tail = mem_alloc_list_tail->next;
408 		mem_alloc_list_tail->next = NULL;
409 	}
410 
411 	mem_alloc_list_last_elem++;
412 
413 	return mem_alloc_list_tail;
414 }
415 
416 static struct mem_entry *
417 mem_list_find_item(cuda_ptr_key pk)
418 {
419 	struct mem_entry *mem_alloc_list_cur = NULL;
420 
421 	if (mem_alloc_list_head == NULL) {
422 		rte_cuda_log(ERR, "Memory list doesn't exist");
423 		return NULL;
424 	}
425 
426 	if (mem_list_count_item() == 0) {
427 		rte_cuda_log(ERR, "No items in memory list");
428 		return NULL;
429 	}
430 
431 	mem_alloc_list_cur = mem_alloc_list_head;
432 
433 	while (mem_alloc_list_cur != NULL) {
434 		if (mem_alloc_list_cur->pkey == pk)
435 			return mem_alloc_list_cur;
436 		mem_alloc_list_cur = mem_alloc_list_cur->next;
437 	}
438 
439 	return mem_alloc_list_cur;
440 }
441 
442 static int
443 mem_list_del_item(cuda_ptr_key pk)
444 {
445 	struct mem_entry *mem_alloc_list_cur = NULL;
446 
447 	mem_alloc_list_cur = mem_list_find_item(pk);
448 	if (mem_alloc_list_cur == NULL)
449 		return -EINVAL;
450 
451 	/* if key is in head */
452 	if (mem_alloc_list_cur->prev == NULL) {
453 		mem_alloc_list_head = mem_alloc_list_cur->next;
454 		if (mem_alloc_list_head != NULL)
455 			mem_alloc_list_head->prev = NULL;
456 	} else {
457 		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
458 		if (mem_alloc_list_cur->next != NULL)
459 			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
460 	}
461 
462 	rte_free(mem_alloc_list_cur);
463 
464 	mem_alloc_list_last_elem--;
465 
466 	return 0;
467 }
468 
469 static int
470 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
471 {
472 	int ret = 0;
473 	CUresult res;
474 	struct rte_gpu_info parent_info;
475 	CUexecAffinityParam affinityPrm;
476 	const char *err_string;
477 	struct cuda_info *private;
478 	CUcontext current_ctx;
479 	CUcontext input_ctx;
480 
481 	if (dev == NULL) {
482 		rte_errno = ENODEV;
483 		return -rte_errno;
484 	}
485 
486 	/* Child initialization time probably called by rte_gpu_add_child() */
487 	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
488 			dev->mpshared->dev_private == NULL) {
489 		/* Store current ctx */
490 		res = pfn_cuCtxGetCurrent(&current_ctx);
491 		if (res != 0) {
492 			pfn_cuGetErrorString(res, &(err_string));
493 			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
494 					err_string);
495 			rte_errno = EPERM;
496 			return -rte_errno;
497 		}
498 
499 		/* Set child ctx as current ctx */
500 		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
501 		res = pfn_cuCtxSetCurrent(input_ctx);
502 		if (res != 0) {
503 			pfn_cuGetErrorString(res, &(err_string));
504 			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
505 					err_string);
506 			rte_errno = EPERM;
507 			return -rte_errno;
508 		}
509 
510 		/*
511 		 * Ctx capacity info
512 		 */
513 
514 		/* MPS compatible */
515 		res = pfn_cuCtxGetExecAffinity(&affinityPrm,
516 				CU_EXEC_AFFINITY_TYPE_SM_COUNT);
517 		if (res != 0) {
518 			pfn_cuGetErrorString(res, &(err_string));
519 			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
520 					err_string);
521 		}
522 		dev->mpshared->info.processor_count =
523 				(uint32_t)affinityPrm.param.smCount.val;
524 
525 		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
526 		if (ret) {
527 			rte_errno = ENODEV;
528 			return -rte_errno;
529 		}
530 		dev->mpshared->info.total_memory = parent_info.total_memory;
531 
532 		/*
533 		 * GPU Device private info
534 		 */
535 		dev->mpshared->dev_private = rte_zmalloc(NULL,
536 				sizeof(struct cuda_info),
537 				RTE_CACHE_LINE_SIZE);
538 		if (dev->mpshared->dev_private == NULL) {
539 			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
540 			rte_errno = EPERM;
541 			return -rte_errno;
542 		}
543 
544 		private = (struct cuda_info *)dev->mpshared->dev_private;
545 
546 		res = pfn_cuCtxGetDevice(&(private->cu_dev));
547 		if (res != 0) {
548 			pfn_cuGetErrorString(res, &(err_string));
549 			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
550 					err_string);
551 			rte_errno = EPERM;
552 			return -rte_errno;
553 		}
554 
555 		res = pfn_cuDeviceGetName(private->gpu_name,
556 				RTE_DEV_NAME_MAX_LEN, private->cu_dev);
557 		if (res != 0) {
558 			pfn_cuGetErrorString(res, &(err_string));
559 			rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
560 					err_string);
561 			rte_errno = EPERM;
562 			return -rte_errno;
563 		}
564 
565 		/* Restore original ctx as current ctx */
566 		res = pfn_cuCtxSetCurrent(current_ctx);
567 		if (res != 0) {
568 			pfn_cuGetErrorString(res, &(err_string));
569 			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
570 					err_string);
571 			rte_errno = EPERM;
572 			return -rte_errno;
573 		}
574 	}
575 
576 	*info = dev->mpshared->info;
577 
578 	return 0;
579 }
580 
581 /*
582  * GPU Memory
583  */
584 
585 static int
586 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
587 {
588 	CUresult res;
589 	const char *err_string;
590 	CUcontext current_ctx;
591 	CUcontext input_ctx;
592 	unsigned int flag = 1;
593 
594 	if (dev == NULL)
595 		return -ENODEV;
596 
597 	/* Store current ctx */
598 	res = pfn_cuCtxGetCurrent(&current_ctx);
599 	if (res != 0) {
600 		pfn_cuGetErrorString(res, &(err_string));
601 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
602 				err_string);
603 		rte_errno = EPERM;
604 		return -rte_errno;
605 	}
606 
607 	/* Set child ctx as current ctx */
608 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
609 	res = pfn_cuCtxSetCurrent(input_ctx);
610 	if (res != 0) {
611 		pfn_cuGetErrorString(res, &(err_string));
612 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
613 				err_string);
614 		rte_errno = EPERM;
615 		return -rte_errno;
616 	}
617 
618 	/* Get next memory list item */
619 	mem_alloc_list_tail = mem_list_add_item();
620 	if (mem_alloc_list_tail == NULL) {
621 		rte_errno = EPERM;
622 		return -rte_errno;
623 	}
624 
625 	/* Allocate memory */
626 	mem_alloc_list_tail->size = size;
627 	mem_alloc_list_tail->size_orig = size + align;
628 
629 	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
630 			mem_alloc_list_tail->size_orig);
631 	if (res != 0) {
632 		pfn_cuGetErrorString(res, &(err_string));
633 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
634 				err_string);
635 		rte_errno = EPERM;
636 		return -rte_errno;
637 	}
638 
639 	/* Align memory address */
640 	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
641 	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
642 		mem_alloc_list_tail->ptr_d += (align -
643 				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
644 
645 	/* GPUDirect RDMA attribute required */
646 	res = pfn_cuPointerSetAttribute(&flag,
647 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
648 			mem_alloc_list_tail->ptr_d);
649 	if (res != 0) {
650 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
651 				"GPU memory at  %"PRIu32", err %d",
652 				(uint32_t)mem_alloc_list_tail->ptr_d, res);
653 		rte_errno = EPERM;
654 		return -rte_errno;
655 	}
656 
657 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
658 	mem_alloc_list_tail->ptr_h = NULL;
659 	mem_alloc_list_tail->dev = dev;
660 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
661 	mem_alloc_list_tail->mtype = GPU_MEM;
662 
663 	/* Restore original ctx as current ctx */
664 	res = pfn_cuCtxSetCurrent(current_ctx);
665 	if (res != 0) {
666 		pfn_cuGetErrorString(res, &(err_string));
667 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
668 				err_string);
669 		rte_errno = EPERM;
670 		return -rte_errno;
671 	}
672 
673 	*ptr = (void *)mem_alloc_list_tail->ptr_d;
674 
675 	return 0;
676 }
677 
678 static int
679 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
680 {
681 	CUresult res;
682 	const char *err_string;
683 	CUcontext current_ctx;
684 	CUcontext input_ctx;
685 	unsigned int flag = 1;
686 	int use_ptr_h = 0;
687 
688 	if (dev == NULL)
689 		return -ENODEV;
690 
691 	/* Store current ctx */
692 	res = pfn_cuCtxGetCurrent(&current_ctx);
693 	if (res != 0) {
694 		pfn_cuGetErrorString(res, &(err_string));
695 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
696 				err_string);
697 		rte_errno = EPERM;
698 		return -rte_errno;
699 	}
700 
701 	/* Set child ctx as current ctx */
702 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
703 	res = pfn_cuCtxSetCurrent(input_ctx);
704 	if (res != 0) {
705 		pfn_cuGetErrorString(res, &(err_string));
706 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
707 				err_string);
708 		rte_errno = EPERM;
709 		return -rte_errno;
710 	}
711 
712 	/* Get next memory list item */
713 	mem_alloc_list_tail = mem_list_add_item();
714 	if (mem_alloc_list_tail == NULL) {
715 		rte_errno = EPERM;
716 		return -rte_errno;
717 	}
718 
719 	/* Allocate memory */
720 	mem_alloc_list_tail->size = size;
721 	mem_alloc_list_tail->ptr_h = ptr;
722 
723 	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
724 			mem_alloc_list_tail->size,
725 			CU_MEMHOSTREGISTER_PORTABLE |
726 			CU_MEMHOSTREGISTER_DEVICEMAP);
727 	if (res != 0) {
728 		pfn_cuGetErrorString(res, &(err_string));
729 		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
730 				err_string,
731 				mem_alloc_list_tail->ptr_h,
732 				mem_alloc_list_tail->size);
733 		rte_errno = EPERM;
734 		return -rte_errno;
735 	}
736 
737 	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
738 			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
739 			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
740 	if (res != 0) {
741 		pfn_cuGetErrorString(res, &(err_string));
742 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
743 				err_string);
744 		rte_errno = EPERM;
745 		return -rte_errno;
746 	}
747 
748 	if (use_ptr_h == 0) {
749 		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
750 				mem_alloc_list_tail->ptr_h, 0);
751 		if (res != 0) {
752 			pfn_cuGetErrorString(res, &(err_string));
753 			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
754 					err_string);
755 			rte_errno = EPERM;
756 			return -rte_errno;
757 		}
758 
759 		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
760 				(uintptr_t)mem_alloc_list_tail->ptr_h) {
761 			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
762 			rte_errno = ENOTSUP;
763 			return -rte_errno;
764 		}
765 	} else {
766 		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
767 	}
768 
769 	/* GPUDirect RDMA attribute required */
770 	res = pfn_cuPointerSetAttribute(&flag,
771 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
772 			mem_alloc_list_tail->ptr_d);
773 	if (res != 0) {
774 		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
775 				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
776 		rte_errno = EPERM;
777 		return -rte_errno;
778 	}
779 
780 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
781 	mem_alloc_list_tail->size = size;
782 	mem_alloc_list_tail->dev = dev;
783 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
784 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
785 	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
786 
787 	/* Restore original ctx as current ctx */
788 	res = pfn_cuCtxSetCurrent(current_ctx);
789 	if (res != 0) {
790 		pfn_cuGetErrorString(res, &(err_string));
791 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
792 				err_string);
793 		rte_errno = EPERM;
794 		return -rte_errno;
795 	}
796 
797 	return 0;
798 }
799 
800 static int
801 cuda_mem_free(struct rte_gpu *dev, void *ptr)
802 {
803 	CUresult res;
804 	struct mem_entry *mem_item;
805 	const char *err_string;
806 	cuda_ptr_key hk;
807 
808 	if (dev == NULL)
809 		return -ENODEV;
810 
811 	hk = get_hash_from_ptr((void *)ptr);
812 
813 	mem_item = mem_list_find_item(hk);
814 	if (mem_item == NULL) {
815 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
816 		rte_errno = EPERM;
817 		return -rte_errno;
818 	}
819 
820 	if (mem_item->mtype == GPU_MEM) {
821 		res = pfn_cuMemFree(mem_item->ptr_orig_d);
822 		if (res != 0) {
823 			pfn_cuGetErrorString(res, &(err_string));
824 			rte_cuda_log(ERR, "cuMemFree current failed with %s",
825 					err_string);
826 			rte_errno = EPERM;
827 			return -rte_errno;
828 		}
829 
830 		return mem_list_del_item(hk);
831 	}
832 
833 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
834 
835 	return -EPERM;
836 }
837 
838 static int
839 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
840 {
841 	CUresult res;
842 	struct mem_entry *mem_item;
843 	const char *err_string;
844 	cuda_ptr_key hk;
845 
846 	if (dev == NULL)
847 		return -ENODEV;
848 
849 	hk = get_hash_from_ptr((void *)ptr);
850 
851 	mem_item = mem_list_find_item(hk);
852 	if (mem_item == NULL) {
853 		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
854 		rte_errno = EPERM;
855 		return -rte_errno;
856 	}
857 
858 	if (mem_item->mtype == CPU_REGISTERED) {
859 		res = pfn_cuMemHostUnregister(ptr);
860 		if (res != 0) {
861 			pfn_cuGetErrorString(res, &(err_string));
862 			rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
863 					err_string);
864 			rte_errno = EPERM;
865 			return -rte_errno;
866 		}
867 
868 		return mem_list_del_item(hk);
869 	}
870 
871 	rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
872 
873 	rte_errno = EPERM;
874 	return -rte_errno;
875 }
876 
877 static int
878 cuda_dev_close(struct rte_gpu *dev)
879 {
880 	if (dev == NULL)
881 		return -EINVAL;
882 
883 	rte_free(dev->mpshared->dev_private);
884 
885 	return 0;
886 }
887 
888 static int
889 cuda_wmb(struct rte_gpu *dev)
890 {
891 	CUresult res;
892 	const char *err_string;
893 	CUcontext current_ctx;
894 	CUcontext input_ctx;
895 	struct cuda_info *private;
896 
897 	if (dev == NULL) {
898 		rte_errno = ENODEV;
899 		return -rte_errno;
900 	}
901 
902 	private = (struct cuda_info *)dev->mpshared->dev_private;
903 
904 	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
905 		/*
906 		 * No need to explicitly force the write ordering because
907 		 * the device natively supports it
908 		 */
909 		return 0;
910 	}
911 
912 	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
913 		/*
914 		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
915 		 * Application needs to use alternative methods.
916 		 */
917 		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
918 				"Application needs to use alternative methods.");
919 
920 		rte_errno = ENOTSUP;
921 		return -rte_errno;
922 	}
923 
924 	/* Store current ctx */
925 	res = pfn_cuCtxGetCurrent(&current_ctx);
926 	if (res != 0) {
927 		pfn_cuGetErrorString(res, &(err_string));
928 		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
929 				err_string);
930 		rte_errno = EPERM;
931 		return -rte_errno;
932 	}
933 
934 	/* Set child ctx as current ctx */
935 	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
936 	res = pfn_cuCtxSetCurrent(input_ctx);
937 	if (res != 0) {
938 		pfn_cuGetErrorString(res, &(err_string));
939 		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
940 				err_string);
941 		rte_errno = EPERM;
942 		return -rte_errno;
943 	}
944 
945 	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
946 			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
947 	if (res != 0) {
948 		pfn_cuGetErrorString(res, &(err_string));
949 		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
950 				err_string);
951 		rte_errno = EPERM;
952 		return -rte_errno;
953 	}
954 
955 	/* Restore original ctx as current ctx */
956 	res = pfn_cuCtxSetCurrent(current_ctx);
957 	if (res != 0) {
958 		pfn_cuGetErrorString(res, &(err_string));
959 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
960 				err_string);
961 		rte_errno = EPERM;
962 		return -rte_errno;
963 	}
964 
965 	return 0;
966 }
967 
968 static int
969 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
970 {
971 	struct rte_gpu *dev = NULL;
972 	CUresult res;
973 	CUdevice cu_dev_id;
974 	CUcontext pctx;
975 	char dev_name[RTE_DEV_NAME_MAX_LEN];
976 	const char *err_string;
977 	int processor_count = 0;
978 	struct cuda_info *private;
979 
980 	if (pci_dev == NULL) {
981 		rte_cuda_log(ERR, "NULL PCI device");
982 		rte_errno = ENODEV;
983 		return -rte_errno;
984 	}
985 
986 	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
987 
988 	/* Allocate memory to be used privately by drivers */
989 	dev = rte_gpu_allocate(pci_dev->device.name);
990 	if (dev == NULL) {
991 		rte_errno = ENODEV;
992 		return -rte_errno;
993 	}
994 
995 	/* Initialize values only for the first CUDA driver call */
996 	if (dev->mpshared->info.dev_id == 0) {
997 		mem_alloc_list_head = NULL;
998 		mem_alloc_list_tail = NULL;
999 		mem_alloc_list_last_elem = 0;
1000 
1001 		/* Load libcuda.so library */
1002 		if (cuda_loader()) {
1003 			rte_cuda_log(ERR, "CUDA Driver library not found");
1004 			rte_errno = ENOTSUP;
1005 			return -rte_errno;
1006 		}
1007 
1008 		/* Load initial CUDA functions */
1009 		if (cuda_sym_func_loader()) {
1010 			rte_cuda_log(ERR, "CUDA functions not found in library");
1011 			rte_errno = ENOTSUP;
1012 			return -rte_errno;
1013 		}
1014 
1015 		/*
1016 		 * Required to initialize the CUDA Driver.
1017 		 * Multiple calls of cuInit() will return immediately
1018 		 * without making any relevant change
1019 		 */
1020 		sym_cuInit(0);
1021 
1022 		res = sym_cuDriverGetVersion(&cuda_driver_version);
1023 		if (res != 0) {
1024 			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1025 			rte_errno = ENOTSUP;
1026 			return -rte_errno;
1027 		}
1028 
1029 		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1030 			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1031 					"Minimum requirement is %d",
1032 					cuda_driver_version,
1033 					CUDA_DRIVER_MIN_VERSION);
1034 			rte_errno = ENOTSUP;
1035 			return -rte_errno;
1036 		}
1037 
1038 		if (cuda_pfn_func_loader()) {
1039 			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1040 			rte_errno = ENOTSUP;
1041 			return -rte_errno;
1042 		}
1043 	}
1044 
1045 	/* Fill HW specific part of device structure */
1046 	dev->device = &pci_dev->device;
1047 	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1048 
1049 	/* Get NVIDIA GPU Device descriptor */
1050 	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1051 	if (res != 0) {
1052 		pfn_cuGetErrorString(res, &(err_string));
1053 		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1054 				dev->device->name, res, err_string);
1055 		rte_errno = EPERM;
1056 		return -rte_errno;
1057 	}
1058 
1059 	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1060 	if (res != 0) {
1061 		pfn_cuGetErrorString(res, &(err_string));
1062 		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1063 				dev->device->name, res, err_string);
1064 		rte_errno = EPERM;
1065 		return -rte_errno;
1066 	}
1067 
1068 	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1069 	if (res != 0) {
1070 		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1071 		rte_errno = ENOTSUP;
1072 		return -rte_errno;
1073 	}
1074 
1075 	if (cuda_api_version < CUDA_API_MIN_VERSION) {
1076 		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1077 				cuda_api_version, CUDA_API_MIN_VERSION);
1078 		rte_errno = ENOTSUP;
1079 		return -rte_errno;
1080 	}
1081 
1082 	dev->mpshared->info.context = (uint64_t)pctx;
1083 
1084 	/*
1085 	 * GPU Device generic info
1086 	 */
1087 
1088 	/* Processor count */
1089 	res = pfn_cuDeviceGetAttribute(&(processor_count),
1090 			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1091 			cu_dev_id);
1092 	if (res != 0) {
1093 		pfn_cuGetErrorString(res, &(err_string));
1094 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1095 				err_string);
1096 		rte_errno = EPERM;
1097 		return -rte_errno;
1098 	}
1099 	dev->mpshared->info.processor_count = (uint32_t)processor_count;
1100 
1101 	/* Total memory */
1102 	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1103 	if (res != 0) {
1104 		pfn_cuGetErrorString(res, &(err_string));
1105 		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1106 				err_string);
1107 		rte_errno = EPERM;
1108 		return -rte_errno;
1109 	}
1110 
1111 	/*
1112 	 * GPU Device private info
1113 	 */
1114 	dev->mpshared->dev_private = rte_zmalloc(NULL,
1115 			sizeof(struct cuda_info),
1116 			RTE_CACHE_LINE_SIZE);
1117 	if (dev->mpshared->dev_private == NULL) {
1118 		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1119 		rte_errno = EPERM;
1120 		return -rte_errno;
1121 	}
1122 
1123 	private = (struct cuda_info *)dev->mpshared->dev_private;
1124 	private->cu_dev = cu_dev_id;
1125 	res = pfn_cuDeviceGetName(private->gpu_name,
1126 			RTE_DEV_NAME_MAX_LEN,
1127 			cu_dev_id);
1128 	if (res != 0) {
1129 		pfn_cuGetErrorString(res, &(err_string));
1130 		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1131 				err_string);
1132 		rte_errno = EPERM;
1133 		return -rte_errno;
1134 	}
1135 
1136 	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1137 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1138 			cu_dev_id);
1139 	if (res != 0) {
1140 		pfn_cuGetErrorString(res, &(err_string));
1141 		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1142 				err_string);
1143 		rte_errno = EPERM;
1144 		return -rte_errno;
1145 	}
1146 
1147 	if (private->gdr_supported == 0)
1148 		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1149 				pci_dev->device.name);
1150 
1151 	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1152 			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1153 			cu_dev_id);
1154 	if (res != 0) {
1155 		pfn_cuGetErrorString(res, &(err_string));
1156 		rte_cuda_log(ERR,
1157 				"cuDeviceGetAttribute failed with %s",
1158 				err_string);
1159 		rte_errno = EPERM;
1160 		return -rte_errno;
1161 	}
1162 
1163 	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1164 		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1165 				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1166 				cu_dev_id);
1167 		if (res != 0) {
1168 			pfn_cuGetErrorString(res, &(err_string));
1169 			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1170 					err_string);
1171 			rte_errno = EPERM;
1172 			return -rte_errno;
1173 		}
1174 
1175 		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1176 			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1177 	}
1178 
1179 	dev->ops.dev_info_get = cuda_dev_info_get;
1180 	dev->ops.dev_close = cuda_dev_close;
1181 	dev->ops.mem_alloc = cuda_mem_alloc;
1182 	dev->ops.mem_free = cuda_mem_free;
1183 	dev->ops.mem_register = cuda_mem_register;
1184 	dev->ops.mem_unregister = cuda_mem_unregister;
1185 	dev->ops.mem_cpu_map = NULL;
1186 	dev->ops.mem_cpu_unmap = NULL;
1187 	dev->ops.wmb = cuda_wmb;
1188 
1189 	rte_gpu_complete_new(dev);
1190 
1191 	rte_cuda_debug("dev id = %u name = %s",
1192 			dev->mpshared->info.dev_id, private->gpu_name);
1193 
1194 	return 0;
1195 }
1196 
1197 static int
1198 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1199 {
1200 	struct rte_gpu *dev;
1201 	int ret;
1202 	uint8_t gpu_id;
1203 
1204 	if (pci_dev == NULL) {
1205 		rte_errno = ENODEV;
1206 		return -rte_errno;
1207 	}
1208 
1209 	dev = rte_gpu_get_by_name(pci_dev->device.name);
1210 	if (dev == NULL) {
1211 		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1212 				pci_dev->device.name);
1213 		rte_errno = ENODEV;
1214 		return -rte_errno;
1215 	}
1216 	gpu_id = dev->mpshared->info.dev_id;
1217 
1218 	/* release dev from library */
1219 	ret = rte_gpu_release(dev);
1220 	if (ret)
1221 		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1222 
1223 	rte_cuda_debug("Destroyed dev = %u", gpu_id);
1224 
1225 	return 0;
1226 }
1227 
1228 static struct rte_pci_driver rte_cuda_driver = {
1229 	.id_table = pci_id_cuda_map,
1230 	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1231 	.probe = cuda_gpu_probe,
1232 	.remove = cuda_gpu_remove,
1233 };
1234 
1235 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1236 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1237 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1238