1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3 */
4
5 #include <dlfcn.h>
6
7 #include <rte_malloc.h>
8 #include <rte_pci.h>
9 #include <bus_pci_driver.h>
10 #include <rte_byteorder.h>
11 #include <dev_driver.h>
12
13 #include <gpudev_driver.h>
14
15 #include <cuda.h>
16 #include <cudaTypedefs.h>
17
18 #include "common.h"
19 #include "devices.h"
20
21 #define CUDA_DRIVER_MIN_VERSION 11040
22 #define CUDA_API_MIN_VERSION 3020
23
/*
 * CUDA Driver functions loaded with dlsym().
 * These three are resolved directly from libcuda.so; everything else is
 * resolved through cuGetProcAddress so the right versioned entry point
 * is picked for the running driver.
 */
static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
		void **pfn, int cudaVersion, uint64_t flags);

/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
static PFN_cuGetErrorString pfn_cuGetErrorString;
static PFN_cuGetErrorName pfn_cuGetErrorName;
static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
static PFN_cuDeviceGetName pfn_cuDeviceGetName;
static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
static PFN_cuMemAlloc pfn_cuMemAlloc;
static PFN_cuMemFree pfn_cuMemFree;
static PFN_cuMemHostRegister pfn_cuMemHostRegister;
static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;

/* Handle of the dlopen()'ed libcuda.so */
static void *cudalib;
/* CUDA API version reported for the primary context (see CUDA_API_MIN_VERSION) */
static unsigned int cuda_api_version;
/* CUDA driver version from cuDriverGetVersion (see CUDA_DRIVER_MIN_VERSION) */
static int cuda_driver_version;
/* gdrcopy library handle used to CPU-map GPU memory */
static gdr_t gdrc_h;

#define CUDA_MAX_ALLOCATION_NUM 512

#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)

RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
63
64 /* NVIDIA GPU address map */
65 static const struct rte_pci_id pci_id_cuda_map[] = {
66 {
67 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
68 NVIDIA_GPU_A40_DEVICE_ID)
69 },
70 {
71 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
72 NVIDIA_GPU_A30_24GB_DEVICE_ID)
73 },
74 {
75 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
76 NVIDIA_GPU_A30X_24GB_DPU_DEVICE_ID)
77 },
78 {
79 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
80 NVIDIA_GPU_A10_24GB_DEVICE_ID)
81 },
82 {
83 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
84 NVIDIA_GPU_A10G_DEVICE_ID)
85 },
86 {
87 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
88 NVIDIA_GPU_A10M_DEVICE_ID)
89 },
90 {
91 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
92 NVIDIA_GPU_A100_40GB_SXM4_DEVICE_ID)
93 },
94 {
95 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
96 NVIDIA_GPU_A100_40GB_PCIE_DEVICE_ID)
97 },
98 {
99 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
100 NVIDIA_GPU_A100_80GB_SXM4_DEVICE_ID)
101 },
102 {
103 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
104 NVIDIA_GPU_A100_80GB_PCIE_DEVICE_ID)
105 },
106 {
107 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
108 NVIDIA_GPU_A100X_80GB_DPU_DEVICE_ID)
109 },
110 {
111 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
112 NVIDIA_GPU_GA100_PG506_207)
113 },
114 {
115 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
116 NVIDIA_GPU_GA100_PCIE)
117 },
118 {
119 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
120 NVIDIA_GPU_GA100_PG506_217)
121 },
122 {
123 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
124 NVIDIA_GPU_V100_16GB_SXM2_DEVICE_ID)
125 },
126 {
127 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
128 NVIDIA_GPU_V100_16GB_DGXS_DEVICE_ID)
129 },
130 {
131 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
132 NVIDIA_GPU_V100_16GB_FHHL_DEVICE_ID)
133 },
134 {
135 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
136 NVIDIA_GPU_V100_16GB_PCIE_DEVICE_ID)
137 },
138 {
139 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
140 NVIDIA_GPU_V100_32GB_SXM2_DEVICE_ID)
141 },
142 {
143 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
144 NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
145 },
146 {
147 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
148 NVIDIA_GPU_V100_32GB_DGXS_DEVICE_ID)
149 },
150 {
151 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
152 NVIDIA_GPU_V100_32GB_SXM3_DEVICE_ID)
153 },
154 {
155 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
156 NVIDIA_GPU_V100_32GB_SXM3_H_DEVICE_ID)
157 },
158 {
159 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
160 NVIDIA_GPU_V100_SXM2)
161 },
162 {
163 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
164 NVIDIA_GPU_V100S_PCIE)
165 },
166 {
167 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
168 NVIDIA_GPU_TITAN_V_CEO_ED)
169 },
170 {
171 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
172 NVIDIA_GPU_GV100GL_PG500_216)
173 },
174 {
175 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
176 NVIDIA_GPU_GV100GL_PG503_216)
177 },
178 {
179 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
180 NVIDIA_GPU_TU102_TITAN_RTX)
181 },
182 {
183 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
184 NVIDIA_GPU_TU102GL_QUADRO_RTX)
185 },
186 {
187 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
188 NVIDIA_GPU_GV100_QUADRO_DEVICE_ID)
189 },
190 {
191 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
192 NVIDIA_GPU_QUADRO_RTX_4000)
193 },
194 {
195 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
196 NVIDIA_GPU_QUADRO_RTX_5000)
197 },
198 {
199 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
200 NVIDIA_GPU_QUADRO_RTX_6000)
201 },
202 {
203 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
204 NVIDIA_GPU_QUADRO_RTX_8000)
205 },
206 {
207 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
208 NVIDIA_GPU_QUADRO_RTX_A4000)
209 },
210 {
211 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
212 NVIDIA_GPU_QUADRO_RTX_A6000)
213 },
214 {
215 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
216 NVIDIA_GPU_QUADRO_RTX_A5000)
217 },
218 {
219 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
220 NVIDIA_GPU_QUADRO_RTX_A4500)
221 },
222 {
223 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
224 NVIDIA_GPU_QUADRO_RTX_A5500)
225 },
226 {
227 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
228 NVIDIA_GPU_QUADRO_RTX_A2000)
229 },
230 {
231 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
232 NVIDIA_GPU_QUADRO_RTX_A2000_12GB)
233 },
234 {
235 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
236 NVIDIA_GPU_T4G)
237 },
238 {
239 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
240 NVIDIA_GPU_T4)
241 },
242 {
243 .device_id = 0
244 }
245 };
246
/* Device private info, stored in dev->mpshared->dev_private */
struct cuda_info {
	char gpu_name[RTE_DEV_NAME_MAX_LEN];	/* from cuDeviceGetName() */
	CUdevice cu_dev;			/* CUDA device handle */
	int gdr_supported;			/* GPUDirect RDMA supported attribute */
	int gdr_write_ordering;			/* CU_GPU_DIRECT_RDMA_WRITES_ORDERING_* value */
	int gdr_flush_type;			/* CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_* value */
};

/* Type of memory allocated by CUDA driver */
enum mem_type {
	GPU_MEM = 0,		/* device memory from cuMemAlloc */
	CPU_REGISTERED,		/* host memory registered with cuMemHostRegister */
	GPU_REGISTERED		/* device memory CPU-mapped through gdrcopy */
};

/* key associated to a memory address (the address itself, see get_hash_from_ptr) */
typedef uintptr_t cuda_ptr_key;

/* Single entry of the memory list (doubly-linked) */
struct mem_entry {
	CUdeviceptr ptr_d;		/* aligned device address returned to the user */
	CUdeviceptr ptr_orig_d;		/* unaligned address from cuMemAlloc, used for cuMemFree */
	void *ptr_h;			/* host address (registered or gdrcopy-mapped), if any */
	size_t size;			/* size requested by the user */
	size_t size_orig;		/* size actually allocated (size + alignment slack) */
	struct rte_gpu *dev;		/* owning GPU device */
	CUcontext ctx;			/* CUDA context the memory belongs to */
	cuda_ptr_key pkey;		/* lookup key for mem_list_find_item() */
	enum mem_type mtype;
	gdr_mh_t mh;			/* gdrcopy mapping handle (GPU_REGISTERED only) */
	struct mem_entry *prev;
	struct mem_entry *next;
};

static struct mem_entry *mem_alloc_list_head;
static struct mem_entry *mem_alloc_list_tail;
static uint32_t mem_alloc_list_last_elem;	/* number of entries in the list */
285
286 /* Load the CUDA symbols */
287
288 static int
cuda_loader(void)289 cuda_loader(void)
290 {
291 char cuda_path[1024];
292
293 if (getenv("CUDA_PATH_L") == NULL)
294 snprintf(cuda_path, 1024, "%s", "libcuda.so");
295 else
296 snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
297
298 cudalib = dlopen(cuda_path, RTLD_LAZY);
299 if (cudalib == NULL) {
300 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
301 cuda_path, getenv("CUDA_PATH_L"));
302 return -1;
303 }
304
305 return 0;
306 }
307
308 static int
cuda_sym_func_loader(void)309 cuda_sym_func_loader(void)
310 {
311 if (cudalib == NULL)
312 return -1;
313
314 sym_cuInit = dlsym(cudalib, "cuInit");
315 if (sym_cuInit == NULL) {
316 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
317 return -1;
318 }
319
320 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
321 if (sym_cuDriverGetVersion == NULL) {
322 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
323 return -1;
324 }
325
326 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
327 if (sym_cuGetProcAddress == NULL) {
328 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
329 return -1;
330 }
331
332 return 0;
333 }
334
335 static int
cuda_pfn_func_loader(void)336 cuda_pfn_func_loader(void)
337 {
338 CUresult res;
339
340 res = sym_cuGetProcAddress("cuGetErrorString",
341 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
342 if (res != 0) {
343 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
344 return -1;
345 }
346
347 res = sym_cuGetProcAddress("cuGetErrorName",
348 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
349 if (res != 0) {
350 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
351 return -1;
352 }
353
354 res = sym_cuGetProcAddress("cuPointerSetAttribute",
355 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
356 if (res != 0) {
357 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
358 return -1;
359 }
360
361 res = sym_cuGetProcAddress("cuDeviceGetAttribute",
362 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
363 if (res != 0) {
364 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
365 return -1;
366 }
367
368 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
369 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
370 if (res != 0) {
371 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
372 return -1;
373 }
374
375 res = sym_cuGetProcAddress("cuDeviceGetName",
376 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
377 if (res != 0) {
378 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
379 return -1;
380 }
381
382 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
383 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
384 if (res != 0) {
385 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
386 return -1;
387 }
388
389 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
390 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
391 if (res != 0) {
392 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
393 return -1;
394 }
395
396 res = sym_cuGetProcAddress("cuDeviceTotalMem",
397 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
398 if (res != 0) {
399 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
400 return -1;
401 }
402
403 res = sym_cuGetProcAddress("cuCtxGetApiVersion",
404 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
405 if (res != 0) {
406 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
407 return -1;
408 }
409
410 res = sym_cuGetProcAddress("cuCtxGetDevice",
411 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
412 if (res != 0) {
413 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
414 return -1;
415 }
416
417 res = sym_cuGetProcAddress("cuCtxSetCurrent",
418 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
419 if (res != 0) {
420 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
421 return -1;
422 }
423
424 res = sym_cuGetProcAddress("cuCtxGetCurrent",
425 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
426 if (res != 0) {
427 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
428 return -1;
429 }
430
431 res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
432 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
433 if (res != 0) {
434 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
435 return -1;
436 }
437
438 res = sym_cuGetProcAddress("cuMemAlloc",
439 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
440 if (res != 0) {
441 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
442 return -1;
443 }
444
445 res = sym_cuGetProcAddress("cuMemFree",
446 (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
447 if (res != 0) {
448 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
449 return -1;
450 }
451
452 res = sym_cuGetProcAddress("cuMemHostRegister",
453 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
454 if (res != 0) {
455 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
456 return -1;
457 }
458
459 res = sym_cuGetProcAddress("cuMemHostUnregister",
460 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
461 if (res != 0) {
462 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
463 return -1;
464 }
465
466 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
467 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
468 if (res != 0) {
469 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
470 return -1;
471 }
472
473 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
474 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
475 if (res != 0) {
476 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
477 return -1;
478 }
479
480 return 0;
481 }
482
483 /* Generate a key from a memory pointer */
484 static cuda_ptr_key
get_hash_from_ptr(void * ptr)485 get_hash_from_ptr(void *ptr)
486 {
487 return (uintptr_t)ptr;
488 }
489
490 static uint32_t
mem_list_count_item(void)491 mem_list_count_item(void)
492 {
493 return mem_alloc_list_last_elem;
494 }
495
496 /* Initiate list of memory allocations if not done yet */
497 static struct mem_entry *
mem_list_add_item(void)498 mem_list_add_item(void)
499 {
500 /* Initiate list of memory allocations if not done yet */
501 if (mem_alloc_list_head == NULL) {
502 mem_alloc_list_head = rte_zmalloc(NULL,
503 sizeof(struct mem_entry),
504 RTE_CACHE_LINE_SIZE);
505 if (mem_alloc_list_head == NULL) {
506 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
507 return NULL;
508 }
509
510 mem_alloc_list_head->next = NULL;
511 mem_alloc_list_head->prev = NULL;
512 mem_alloc_list_tail = mem_alloc_list_head;
513 } else {
514 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
515 sizeof(struct mem_entry),
516 RTE_CACHE_LINE_SIZE);
517
518 if (mem_alloc_list_cur == NULL) {
519 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
520 return NULL;
521 }
522
523 mem_alloc_list_tail->next = mem_alloc_list_cur;
524 mem_alloc_list_cur->prev = mem_alloc_list_tail;
525 mem_alloc_list_tail = mem_alloc_list_tail->next;
526 mem_alloc_list_tail->next = NULL;
527 }
528
529 mem_alloc_list_last_elem++;
530
531 return mem_alloc_list_tail;
532 }
533
534 static struct mem_entry *
mem_list_find_item(cuda_ptr_key pk)535 mem_list_find_item(cuda_ptr_key pk)
536 {
537 struct mem_entry *mem_alloc_list_cur = NULL;
538
539 if (mem_alloc_list_head == NULL) {
540 rte_cuda_log(ERR, "Memory list doesn't exist");
541 return NULL;
542 }
543
544 if (mem_list_count_item() == 0) {
545 rte_cuda_log(ERR, "No items in memory list");
546 return NULL;
547 }
548
549 mem_alloc_list_cur = mem_alloc_list_head;
550
551 while (mem_alloc_list_cur != NULL) {
552 if (mem_alloc_list_cur->pkey == pk)
553 return mem_alloc_list_cur;
554 mem_alloc_list_cur = mem_alloc_list_cur->next;
555 }
556
557 return mem_alloc_list_cur;
558 }
559
560 static int
mem_list_del_item(cuda_ptr_key pk)561 mem_list_del_item(cuda_ptr_key pk)
562 {
563 struct mem_entry *mem_alloc_list_cur = NULL;
564
565 mem_alloc_list_cur = mem_list_find_item(pk);
566 if (mem_alloc_list_cur == NULL)
567 return -EINVAL;
568
569 /* if key is in head */
570 if (mem_alloc_list_cur->prev == NULL) {
571 mem_alloc_list_head = mem_alloc_list_cur->next;
572 if (mem_alloc_list_head != NULL)
573 mem_alloc_list_head->prev = NULL;
574 } else {
575 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
576 if (mem_alloc_list_cur->next != NULL)
577 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
578 }
579
580 rte_free(mem_alloc_list_cur);
581
582 mem_alloc_list_last_elem--;
583
584 return 0;
585 }
586
587 static int
cuda_dev_info_get(struct rte_gpu * dev,struct rte_gpu_info * info)588 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
589 {
590 int ret = 0;
591 CUresult res;
592 struct rte_gpu_info parent_info;
593 CUexecAffinityParam affinityPrm;
594 const char *err_string;
595 struct cuda_info *private;
596 CUcontext current_ctx;
597 CUcontext input_ctx;
598
599 if (dev == NULL) {
600 rte_errno = ENODEV;
601 return -rte_errno;
602 }
603
604 /* Child initialization time probably called by rte_gpu_add_child() */
605 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
606 dev->mpshared->dev_private == NULL) {
607 /* Store current ctx */
608 res = pfn_cuCtxGetCurrent(¤t_ctx);
609 if (res != 0) {
610 pfn_cuGetErrorString(res, &(err_string));
611 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
612 err_string);
613 rte_errno = EPERM;
614 return -rte_errno;
615 }
616
617 /* Set child ctx as current ctx */
618 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
619 res = pfn_cuCtxSetCurrent(input_ctx);
620 if (res != 0) {
621 pfn_cuGetErrorString(res, &(err_string));
622 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
623 err_string);
624 rte_errno = EPERM;
625 return -rte_errno;
626 }
627
628 /*
629 * Ctx capacity info
630 */
631
632 /* MPS compatible */
633 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
634 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
635 if (res != 0) {
636 pfn_cuGetErrorString(res, &(err_string));
637 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
638 err_string);
639 }
640 dev->mpshared->info.processor_count =
641 (uint32_t)affinityPrm.param.smCount.val;
642
643 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
644 if (ret) {
645 rte_errno = ENODEV;
646 return -rte_errno;
647 }
648 dev->mpshared->info.total_memory = parent_info.total_memory;
649
650 dev->mpshared->info.page_size = parent_info.page_size;
651
652 /*
653 * GPU Device private info
654 */
655 dev->mpshared->dev_private = rte_zmalloc(NULL,
656 sizeof(struct cuda_info),
657 RTE_CACHE_LINE_SIZE);
658 if (dev->mpshared->dev_private == NULL) {
659 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
660 rte_errno = EPERM;
661 return -rte_errno;
662 }
663
664 private = (struct cuda_info *)dev->mpshared->dev_private;
665
666 res = pfn_cuCtxGetDevice(&(private->cu_dev));
667 if (res != 0) {
668 pfn_cuGetErrorString(res, &(err_string));
669 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
670 err_string);
671 rte_errno = EPERM;
672 return -rte_errno;
673 }
674
675 res = pfn_cuDeviceGetName(private->gpu_name,
676 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
677 if (res != 0) {
678 pfn_cuGetErrorString(res, &(err_string));
679 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
680 err_string);
681 rte_errno = EPERM;
682 return -rte_errno;
683 }
684
685 /* Restore original ctx as current ctx */
686 res = pfn_cuCtxSetCurrent(current_ctx);
687 if (res != 0) {
688 pfn_cuGetErrorString(res, &(err_string));
689 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
690 err_string);
691 rte_errno = EPERM;
692 return -rte_errno;
693 }
694 }
695
696 *info = dev->mpshared->info;
697
698 return 0;
699 }
700
701 /*
702 * GPU Memory
703 */
704
705 static int
cuda_mem_alloc(struct rte_gpu * dev,size_t size,unsigned int align,void ** ptr)706 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
707 {
708 CUresult res;
709 const char *err_string;
710 CUcontext current_ctx;
711 CUcontext input_ctx;
712 unsigned int flag = 1;
713
714 if (dev == NULL)
715 return -ENODEV;
716
717 /* Store current ctx */
718 res = pfn_cuCtxGetCurrent(¤t_ctx);
719 if (res != 0) {
720 pfn_cuGetErrorString(res, &(err_string));
721 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
722 err_string);
723 rte_errno = EPERM;
724 return -rte_errno;
725 }
726
727 /* Set child ctx as current ctx */
728 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
729 res = pfn_cuCtxSetCurrent(input_ctx);
730 if (res != 0) {
731 pfn_cuGetErrorString(res, &(err_string));
732 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
733 err_string);
734 rte_errno = EPERM;
735 return -rte_errno;
736 }
737
738 /* Get next memory list item */
739 mem_alloc_list_tail = mem_list_add_item();
740 if (mem_alloc_list_tail == NULL) {
741 rte_errno = EPERM;
742 return -rte_errno;
743 }
744
745 /* Allocate memory */
746 mem_alloc_list_tail->size = size;
747 mem_alloc_list_tail->size_orig = size + align;
748
749 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
750 mem_alloc_list_tail->size_orig);
751 if (res != 0) {
752 pfn_cuGetErrorString(res, &(err_string));
753 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
754 err_string);
755 rte_errno = EPERM;
756 return -rte_errno;
757 }
758
759 /* Align memory address */
760 mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
761 if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
762 mem_alloc_list_tail->ptr_d += (align -
763 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
764
765 /* GPUDirect RDMA attribute required */
766 res = pfn_cuPointerSetAttribute(&flag,
767 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
768 mem_alloc_list_tail->ptr_d);
769 if (res != 0) {
770 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
771 "GPU memory at %"PRIu32", err %d",
772 (uint32_t)mem_alloc_list_tail->ptr_d, res);
773 rte_errno = EPERM;
774 return -rte_errno;
775 }
776
777 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
778 mem_alloc_list_tail->ptr_h = NULL;
779 mem_alloc_list_tail->dev = dev;
780 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
781 mem_alloc_list_tail->mtype = GPU_MEM;
782
783 /* Restore original ctx as current ctx */
784 res = pfn_cuCtxSetCurrent(current_ctx);
785 if (res != 0) {
786 pfn_cuGetErrorString(res, &(err_string));
787 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
788 err_string);
789 rte_errno = EPERM;
790 return -rte_errno;
791 }
792
793 *ptr = (void *)mem_alloc_list_tail->ptr_d;
794
795 return 0;
796 }
797
/*
 * gpudev mem_register callback: register 'size' bytes of host memory at
 * 'ptr' with the CUDA driver so the GPU can access it (zero-copy).
 *
 * Registers with PORTABLE|DEVICEMAP, then resolves the device-visible
 * address: on platforms where the host pointer can be used directly
 * (CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM) it is
 * reused, otherwise cuMemHostGetDevicePointer must return the same
 * address or the call fails with ENOTSUP. Sets SYNC_MEMOPS for GPUDirect
 * RDMA. Returns 0 on success, -ENODEV / -rte_errno on failure.
 */
static int
cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
{
	CUresult res;
	const char *err_string;
	CUcontext current_ctx;
	CUcontext input_ctx;
	unsigned int flag = 1;
	int use_ptr_h = 0;

	if (dev == NULL)
		return -ENODEV;

	/* Store current ctx so it can be restored before returning */
	res = pfn_cuCtxGetCurrent(&current_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Set child ctx as current ctx */
	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
	res = pfn_cuCtxSetCurrent(input_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Get next memory list item */
	mem_alloc_list_tail = mem_list_add_item();
	if (mem_alloc_list_tail == NULL) {
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Record the registration in the memory list */
	mem_alloc_list_tail->size = size;
	mem_alloc_list_tail->ptr_h = ptr;

	/* NOTE(review): error paths below leave the list entry in place, the
	 * host memory registered, and current_ctx not restored — confirm intended.
	 */
	res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
			mem_alloc_list_tail->size,
			CU_MEMHOSTREGISTER_PORTABLE |
			CU_MEMHOSTREGISTER_DEVICEMAP);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
				err_string,
				mem_alloc_list_tail->ptr_h,
				mem_alloc_list_tail->size);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Can the GPU use the host pointer directly? */
	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	if (use_ptr_h == 0) {
		res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
				mem_alloc_list_tail->ptr_h, 0);
		if (res != 0) {
			pfn_cuGetErrorString(res, &(err_string));
			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
					err_string);
			rte_errno = EPERM;
			return -rte_errno;
		}

		/* The driver only supports unified host/device addresses here */
		if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
				(uintptr_t)mem_alloc_list_tail->ptr_h) {
			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
			rte_errno = ENOTSUP;
			return -rte_errno;
		}
	} else {
		mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
	}

	/* GPUDirect RDMA attribute required */
	res = pfn_cuPointerSetAttribute(&flag,
			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
			mem_alloc_list_tail->ptr_d);
	if (res != 0) {
		/* NOTE(review): (uint32_t) cast truncates the device address in the log */
		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
				", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Key the entry by the host address the caller will pass to unregister */
	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
	mem_alloc_list_tail->size = size;
	mem_alloc_list_tail->dev = dev;
	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
	mem_alloc_list_tail->mtype = CPU_REGISTERED;
	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;

	/* Restore original ctx as current ctx */
	res = pfn_cuCtxSetCurrent(current_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	return 0;
}
919
920 static int
cuda_mem_cpu_map(struct rte_gpu * dev,__rte_unused size_t size,void * ptr_in,void ** ptr_out)921 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
922 {
923 struct mem_entry *mem_item;
924 cuda_ptr_key hk;
925
926 if (dev == NULL)
927 return -ENODEV;
928
929 hk = get_hash_from_ptr((void *)ptr_in);
930
931 mem_item = mem_list_find_item(hk);
932 if (mem_item == NULL) {
933 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
934 rte_errno = EPERM;
935 return -rte_errno;
936 }
937
938 if (mem_item->mtype != GPU_MEM) {
939 rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
940 rte_errno = EPERM;
941 return -rte_errno;
942 }
943
944 if (mem_item->size != size)
945 rte_cuda_log(WARNING,
946 "Can't expose memory area with size (%zd) different from original size (%zd).",
947 size, mem_item->size);
948
949 if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
950 mem_item->size, &(mem_item->ptr_h))) {
951 rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
952 rte_errno = EPERM;
953 return -rte_errno;
954 }
955
956 mem_item->mtype = GPU_REGISTERED;
957 *ptr_out = mem_item->ptr_h;
958
959 return 0;
960 }
961
962 static int
cuda_mem_unregister(struct rte_gpu * dev,void * ptr)963 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
964 {
965 CUresult res;
966 struct mem_entry *mem_item;
967 const char *err_string;
968 cuda_ptr_key hk;
969
970 if (dev == NULL)
971 return -ENODEV;
972
973 hk = get_hash_from_ptr((void *)ptr);
974
975 mem_item = mem_list_find_item(hk);
976 if (mem_item == NULL) {
977 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
978 rte_errno = EPERM;
979 return -rte_errno;
980 }
981
982 if (mem_item->mtype == CPU_REGISTERED) {
983 res = pfn_cuMemHostUnregister(ptr);
984 if (res != 0) {
985 pfn_cuGetErrorString(res, &(err_string));
986 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
987 err_string);
988 rte_errno = EPERM;
989 return -rte_errno;
990 }
991
992 return mem_list_del_item(hk);
993 }
994
995 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
996
997 rte_errno = EPERM;
998 return -rte_errno;
999 }
1000
1001 static int
cuda_mem_cpu_unmap(struct rte_gpu * dev,void * ptr_in)1002 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
1003 {
1004 struct mem_entry *mem_item;
1005 cuda_ptr_key hk;
1006
1007 if (dev == NULL)
1008 return -ENODEV;
1009
1010 hk = get_hash_from_ptr((void *)ptr_in);
1011
1012 mem_item = mem_list_find_item(hk);
1013 if (mem_item == NULL) {
1014 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
1015 rte_errno = EPERM;
1016 return -rte_errno;
1017 }
1018
1019 if (mem_item->mtype == GPU_REGISTERED) {
1020 if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
1021 mem_item->size)) {
1022 rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
1023 rte_errno = EPERM;
1024 return -rte_errno;
1025 }
1026
1027 mem_item->mtype = GPU_MEM;
1028 } else {
1029 rte_errno = EPERM;
1030 return -rte_errno;
1031 }
1032
1033 return 0;
1034 }
1035
1036 static int
cuda_mem_free(struct rte_gpu * dev,void * ptr)1037 cuda_mem_free(struct rte_gpu *dev, void *ptr)
1038 {
1039 CUresult res;
1040 struct mem_entry *mem_item;
1041 const char *err_string;
1042 cuda_ptr_key hk;
1043
1044 if (dev == NULL)
1045 return -ENODEV;
1046
1047 hk = get_hash_from_ptr((void *)ptr);
1048
1049 mem_item = mem_list_find_item(hk);
1050 if (mem_item == NULL) {
1051 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
1052 rte_errno = EPERM;
1053 return -rte_errno;
1054 }
1055
1056 /*
1057 * If a GPU memory area that's CPU mapped is being freed
1058 * without calling cpu_unmap, force the unmapping.
1059 */
1060 if (mem_item->mtype == GPU_REGISTERED)
1061 cuda_mem_cpu_unmap(dev, ptr);
1062
1063 if (mem_item->mtype == GPU_MEM) {
1064 res = pfn_cuMemFree(mem_item->ptr_orig_d);
1065 if (res != 0) {
1066 pfn_cuGetErrorString(res, &(err_string));
1067 rte_cuda_log(ERR, "cuMemFree current failed with %s",
1068 err_string);
1069 rte_errno = EPERM;
1070 return -rte_errno;
1071 }
1072
1073 return mem_list_del_item(hk);
1074 }
1075
1076 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
1077
1078 return -EPERM;
1079 }
1080
1081 static int
cuda_dev_close(struct rte_gpu * dev)1082 cuda_dev_close(struct rte_gpu *dev)
1083 {
1084 if (dev == NULL)
1085 return -EINVAL;
1086
1087 rte_free(dev->mpshared->dev_private);
1088
1089 return 0;
1090 }
1091
/*
 * gpudev wmb callback: make prior GPUDirect RDMA writes to GPU memory
 * visible in order.
 *
 * If the device natively orders GDR writes, nothing is done. Otherwise,
 * if the device supports host-initiated flushes, cuFlushGPUDirectRDMAWrites
 * is issued in the device's context (saving/restoring the caller's context);
 * if not, ENOTSUP is returned and the application must use another method.
 * Returns 0 on success, -rte_errno (ENODEV/ENOTSUP/EPERM) on failure.
 */
static int
cuda_wmb(struct rte_gpu *dev)
{
	CUresult res;
	const char *err_string;
	CUcontext current_ctx;
	CUcontext input_ctx;
	struct cuda_info *private;

	if (dev == NULL) {
		rte_errno = ENODEV;
		return -rte_errno;
	}

	/* NOTE(review): dev_private is dereferenced without a NULL check —
	 * presumably probe always populates it before this callback; confirm.
	 */
	private = (struct cuda_info *)dev->mpshared->dev_private;

	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
		/*
		 * No need to explicitly force the write ordering because
		 * the device natively supports it
		 */
		return 0;
	}

	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
		/*
		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
		 * Application needs to use alternative methods.
		 */
		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
				"Application needs to use alternative methods.");

		rte_errno = ENOTSUP;
		return -rte_errno;
	}

	/* Store current ctx so it can be restored before returning */
	res = pfn_cuCtxGetCurrent(&current_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Set child ctx as current ctx */
	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
	res = pfn_cuCtxSetCurrent(input_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Flush writes for all devices, targeting the current (device) ctx */
	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
			CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Restore original ctx as current ctx */
	res = pfn_cuCtxSetCurrent(current_ctx);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	return 0;
}
1171
/*
 * PCI bus probe callback for NVIDIA GPU devices.
 *
 * Allocates a gpudev device for the given PCI device, performs one-time
 * CUDA driver library loading/initialization on the first probed device,
 * retains the GPU's primary CUDA context, fills in the generic and
 * driver-private device info, and registers the gpudev ops.
 *
 * @param pci_drv
 *   PCI driver (unused).
 * @param pci_dev
 *   PCI device being probed.
 * @return
 *   0 on success, negative rte_errno (ENODEV/ENOTSUP/EPERM) on failure.
 */
static int
cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
{
	struct rte_gpu *dev = NULL;
	CUresult res;
	CUdevice cu_dev_id;
	CUcontext pctx;
	char dev_name[RTE_DEV_NAME_MAX_LEN];
	const char *err_string;
	int processor_count = 0;
	struct cuda_info *private;

	if (pci_dev == NULL) {
		rte_cuda_log(ERR, "NULL PCI device");
		rte_errno = ENODEV;
		return -rte_errno;
	}

	/* NOTE(review): dev_name is filled here but the code below uses
	 * pci_dev->device.name instead — confirm whether this buffer is
	 * actually needed.
	 */
	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));

	/* Allocate memory to be used privately by drivers */
	dev = rte_gpu_allocate(pci_dev->device.name);
	if (dev == NULL) {
		rte_errno = ENODEV;
		return -rte_errno;
	}

	/* Initialize values only for the first CUDA driver call */
	/* NOTE(review): on the error returns below, the just-allocated dev is
	 * not released — confirm whether rte_gpu_release() should be called
	 * on these early-exit paths to avoid leaking the gpudev slot.
	 */
	if (dev->mpshared->info.dev_id == 0) {
		/* Reset the driver-wide allocation tracking list. */
		mem_alloc_list_head = NULL;
		mem_alloc_list_tail = NULL;
		mem_alloc_list_last_elem = 0;

		/* Load libcuda.so library */
		if (cuda_loader()) {
			rte_cuda_log(ERR, "CUDA Driver library not found");
			rte_errno = ENOTSUP;
			return -rte_errno;
		}

		/* Load initial CUDA functions */
		if (cuda_sym_func_loader()) {
			rte_cuda_log(ERR, "CUDA functions not found in library");
			rte_errno = ENOTSUP;
			return -rte_errno;
		}

		/*
		 * Required to initialize the CUDA Driver.
		 * Multiple calls of cuInit() will return immediately
		 * without making any relevant change
		 */
		sym_cuInit(0);

		res = sym_cuDriverGetVersion(&cuda_driver_version);
		if (res != 0) {
			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
			rte_errno = ENOTSUP;
			return -rte_errno;
		}

		/* Enforce the minimum driver version (CUDA_DRIVER_MIN_VERSION). */
		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
					"Minimum requirement is %d",
					cuda_driver_version,
					CUDA_DRIVER_MIN_VERSION);
			rte_errno = ENOTSUP;
			return -rte_errno;
		}

		/* Resolve the versioned pfn_* entry points via cuGetProcAddress. */
		if (cuda_pfn_func_loader()) {
			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
			rte_errno = ENOTSUP;
			return -rte_errno;
		}

		gdrc_h = NULL;
	}

	/* Fill HW specific part of device structure */
	dev->device = &pci_dev->device;
	dev->mpshared->info.numa_node = pci_dev->device.numa_node;

	/* Get NVIDIA GPU Device descriptor */
	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
				dev->device->name, res, err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Retain the device's primary context; shared with the application. */
	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
				dev->device->name, res, err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
	if (res != 0) {
		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
		rte_errno = ENOTSUP;
		return -rte_errno;
	}

	/* Enforce the minimum context API version (CUDA_API_MIN_VERSION). */
	if (cuda_api_version < CUDA_API_MIN_VERSION) {
		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
				cuda_api_version, CUDA_API_MIN_VERSION);
		rte_errno = ENOTSUP;
		return -rte_errno;
	}

	/* Context handle is stored as an integer in the mp-shared info so
	 * secondary processes can reconstruct it (see cuda_wmb()).
	 */
	dev->mpshared->info.context = (uint64_t)pctx;

	/*
	 * GPU Device generic info
	 */

	/* Processor count */
	res = pfn_cuDeviceGetAttribute(&(processor_count),
			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}
	dev->mpshared->info.processor_count = (uint32_t)processor_count;

	/* Total memory */
	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	dev->mpshared->info.page_size = (size_t)GPU_PAGE_SIZE;

	/*
	 * GPU Device private info
	 */
	dev->mpshared->dev_private = rte_zmalloc(NULL,
			sizeof(struct cuda_info),
			RTE_CACHE_LINE_SIZE);
	if (dev->mpshared->dev_private == NULL) {
		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* NOTE(review): from here on, error paths leak dev_private as well as
	 * dev — confirm whether rte_free()/rte_gpu_release() cleanup is
	 * needed before returning.
	 */
	private = (struct cuda_info *)dev->mpshared->dev_private;
	private->cu_dev = cu_dev_id;
	res = pfn_cuDeviceGetName(private->gpu_name,
			RTE_DEV_NAME_MAX_LEN,
			cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Query GPUDirect RDMA capability; cached for later mem ops. */
	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
			cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	if (private->gdr_supported == 0)
		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
				pci_dev->device.name);

	/* Query native GDR write ordering; consumed by cuda_wmb(). */
	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
			cu_dev_id);
	if (res != 0) {
		pfn_cuGetErrorString(res, &(err_string));
		rte_cuda_log(ERR,
				"cuDeviceGetAttribute failed with %s",
				err_string);
		rte_errno = EPERM;
		return -rte_errno;
	}

	/* Without native ordering, check whether a host-side flush is
	 * available as a fallback (used by cuda_wmb()).
	 */
	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
				CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
				cu_dev_id);
		if (res != 0) {
			pfn_cuGetErrorString(res, &(err_string));
			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
					err_string);
			rte_errno = EPERM;
			return -rte_errno;
		}

		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
	}

	/* Register the gpudev callbacks implemented by this driver. */
	dev->ops.dev_info_get = cuda_dev_info_get;
	dev->ops.dev_close = cuda_dev_close;
	dev->ops.mem_alloc = cuda_mem_alloc;
	dev->ops.mem_free = cuda_mem_free;
	dev->ops.mem_register = cuda_mem_register;
	dev->ops.mem_unregister = cuda_mem_unregister;
	dev->ops.mem_cpu_map = cuda_mem_cpu_map;
	dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
	dev->ops.wmb = cuda_wmb;

	/* Announce the fully-initialized device to the gpudev library. */
	rte_gpu_complete_new(dev);

	rte_cuda_debug("dev id = %u name = %s",
			dev->mpshared->info.dev_id, private->gpu_name);

	return 0;
}
1404
1405 static int
cuda_gpu_remove(struct rte_pci_device * pci_dev)1406 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1407 {
1408 struct rte_gpu *dev;
1409 int ret;
1410 uint8_t gpu_id;
1411
1412 if (pci_dev == NULL) {
1413 rte_errno = ENODEV;
1414 return -rte_errno;
1415 }
1416
1417 dev = rte_gpu_get_by_name(pci_dev->device.name);
1418 if (dev == NULL) {
1419 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1420 pci_dev->device.name);
1421 rte_errno = ENODEV;
1422 return -rte_errno;
1423 }
1424 gpu_id = dev->mpshared->info.dev_id;
1425
1426 /* release dev from library */
1427 ret = rte_gpu_release(dev);
1428 if (ret)
1429 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1430
1431 rte_cuda_debug("Destroyed dev = %u", gpu_id);
1432
1433 return 0;
1434 }
1435
/* PCI driver descriptor: matches the NVIDIA device IDs in pci_id_cuda_map
 * and wires the probe/remove callbacks above. RTE_PCI_DRV_WC_ACTIVATE
 * requests write-combined mapping of the device BARs.
 */
static struct rte_pci_driver rte_cuda_driver = {
	.id_table = pci_id_cuda_map,
	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
	.probe = cuda_gpu_probe,
	.remove = cuda_gpu_remove,
};

/* Register the driver with the PCI bus and export its device ID table. */
RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
/* Kernel module dependencies: NVIDIA driver plus a peer-memory module. */
RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
1446