1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2021 NVIDIA Corporation & Affiliates 3 */ 4 5 #include <dlfcn.h> 6 7 #include <rte_common.h> 8 #include <rte_log.h> 9 #include <rte_malloc.h> 10 #include <rte_errno.h> 11 #include <rte_pci.h> 12 #include <rte_bus_pci.h> 13 #include <rte_byteorder.h> 14 #include <rte_dev.h> 15 16 #include <gpudev_driver.h> 17 #include <cuda.h> 18 #include <cudaTypedefs.h> 19 20 #define CUDA_DRIVER_MIN_VERSION 11040 21 #define CUDA_API_MIN_VERSION 3020 22 23 /* CUDA Driver functions loaded with dlsym() */ 24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags); 25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion); 26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol, 27 void **pfn, int cudaVersion, uint64_t flags); 28 29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ 30 static PFN_cuGetErrorString pfn_cuGetErrorString; 31 static PFN_cuGetErrorName pfn_cuGetErrorName; 32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute; 33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute; 34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId; 35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain; 36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease; 37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem; 38 static PFN_cuDeviceGetName pfn_cuDeviceGetName; 39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion; 40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent; 41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent; 42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice; 43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity; 44 static PFN_cuMemAlloc pfn_cuMemAlloc; 45 static PFN_cuMemFree pfn_cuMemFree; 46 static PFN_cuMemHostRegister pfn_cuMemHostRegister; 47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister; 48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer; 49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites; 50 51 static void *cudalib; 52 static unsigned int cuda_api_version; 53 static int cuda_driver_version; 54 55 /* NVIDIA GPU vendor */ 56 #define NVIDIA_GPU_VENDOR_ID (0x10de) 57 58 /* NVIDIA GPU device IDs */ 59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1) 60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5) 61 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8) 62 63 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7) 64 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236) 65 66 #define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5) 67 #define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6) 68 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4) 69 70 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8) 71 72 #define CUDA_MAX_ALLOCATION_NUM 512 73 74 #define GPU_PAGE_SHIFT 16 75 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) 76 77 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE); 78 79 /* Helper macro for logging */ 80 #define rte_cuda_log(level, fmt, ...) \ 81 rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__) 82 83 #define rte_cuda_debug(fmt, ...) \ 84 rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \ 85 ##__VA_ARGS__) 86 87 /* NVIDIA GPU address map */ 88 static const struct rte_pci_id pci_id_cuda_map[] = { 89 { 90 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 91 NVIDIA_GPU_A100_40GB_DEVICE_ID) 92 }, 93 { 94 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 95 NVIDIA_GPU_A100_80GB_DEVICE_ID) 96 }, 97 { 98 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 99 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID) 100 }, 101 { 102 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 103 NVIDIA_GPU_A30_24GB_DEVICE_ID) 104 }, 105 { 106 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 107 NVIDIA_GPU_A10_24GB_DEVICE_ID) 108 }, 109 { 110 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 111 NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID) 112 }, 113 { 114 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 115 NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID) 116 }, 117 { 118 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 119 NVIDIA_GPU_V100_16GB_DEVICE_ID) 120 }, 121 { 122 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, 123 NVIDIA_GPU_T4_16GB_DEVICE_ID) 124 }, 125 { 126 .device_id = 0 127 } 128 }; 129 130 /* Device private info */ 131 struct cuda_info { 132 char gpu_name[RTE_DEV_NAME_MAX_LEN]; 133 CUdevice cu_dev; 134 int gdr_supported; 135 int gdr_write_ordering; 136 int gdr_flush_type; 137 }; 138 139 /* Type of memory allocated by CUDA driver */ 140 enum mem_type { 141 GPU_MEM = 0, 142 CPU_REGISTERED, 143 GPU_REGISTERED /* Not used yet */ 144 }; 145 146 /* key associated to a memory address */ 147 typedef uintptr_t cuda_ptr_key; 148 149 /* Single entry of the memory list */ 150 struct mem_entry { 151 CUdeviceptr ptr_d; 152 CUdeviceptr ptr_orig_d; 153 void *ptr_h; 154 size_t size; 155 size_t size_orig; 156 struct rte_gpu *dev; 157 CUcontext ctx; 158 cuda_ptr_key pkey; 159 enum mem_type mtype; 160 struct mem_entry *prev; 161 struct mem_entry *next; 162 }; 163 164 static struct mem_entry *mem_alloc_list_head; 165 static struct mem_entry *mem_alloc_list_tail; 166 static uint32_t mem_alloc_list_last_elem; 167 168 /* Load the CUDA symbols */ 169 170 static int 171 cuda_loader(void) 172 { 173 char cuda_path[1024]; 174 175 if (getenv("CUDA_PATH_L") == NULL) 176 snprintf(cuda_path, 1024, "%s", "libcuda.so"); 177 else 178 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so"); 179 180 cudalib = dlopen(cuda_path, RTLD_LAZY); 181 if (cudalib == NULL) { 182 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)", 183 cuda_path, getenv("CUDA_PATH_L")); 184 return -1; 185 } 186 187 return 0; 188 } 189 190 static int 191 cuda_sym_func_loader(void) 192 { 193 if (cudalib == NULL) 194 return -1; 195 196 sym_cuInit = dlsym(cudalib, "cuInit"); 197 if (sym_cuInit == NULL) { 198 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit"); 199 return -1; 200 } 201 202 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion"); 203 if (sym_cuDriverGetVersion == NULL) { 204 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion"); 205 return -1; 206 } 207 208 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress"); 209 if (sym_cuGetProcAddress == NULL) { 210 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress"); 211 return -1; 212 } 213 214 return 0; 215 } 216 217 static int 218 cuda_pfn_func_loader(void) 219 { 220 CUresult res; 221 222 res = sym_cuGetProcAddress("cuGetErrorString", 223 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0); 224 if (res != 0) { 225 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res); 226 return -1; 227 } 228 229 res = sym_cuGetProcAddress("cuGetErrorName", 230 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0); 231 if (res != 0) { 232 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res); 233 return -1; 234 } 235 236 res = sym_cuGetProcAddress("cuPointerSetAttribute", 237 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0); 238 if (res != 0) { 239 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res); 240 return -1; 241 } 242 243 res = sym_cuGetProcAddress("cuDeviceGetAttribute", 244 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0); 245 if (res != 0) { 246 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res); 247 return -1; 248 } 249 250 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId", 251 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0); 252 if (res != 0) { 253 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res); 254 return -1; 255 } 256 257 res = sym_cuGetProcAddress("cuDeviceGetName", 258 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0); 259 if (res != 0) { 260 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res); 261 return -1; 262 } 263 264 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain", 265 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0); 266 if (res != 0) { 267 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res); 268 return -1; 269 } 270 271 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease", 272 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0); 273 if (res != 0) { 274 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res); 275 return -1; 276 } 277 278 res = sym_cuGetProcAddress("cuDeviceTotalMem", 279 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0); 280 if (res != 0) { 281 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res); 282 return -1; 283 } 284 285 res = sym_cuGetProcAddress("cuCtxGetApiVersion", 286 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0); 287 if (res != 0) { 288 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res); 289 return -1; 290 } 291 292 res = sym_cuGetProcAddress("cuCtxGetDevice", 293 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0); 294 if (res != 0) { 295 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res); 296 return -1; 297 } 298 299 res = sym_cuGetProcAddress("cuCtxSetCurrent", 300 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0); 301 if (res != 0) { 302 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res); 303 return -1; 304 } 305 306 res = sym_cuGetProcAddress("cuCtxGetCurrent", 307 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0); 308 if (res != 0) { 309 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res); 310 return -1; 311 } 312 313 res = sym_cuGetProcAddress("cuCtxGetExecAffinity", 314 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0); 315 if (res != 0) { 316 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res); 317 return -1; 318 } 319 320 res = sym_cuGetProcAddress("cuMemAlloc", 321 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0); 322 if (res != 0) { 323 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res); 324 return -1; 325 } 326 327 res = sym_cuGetProcAddress("cuMemFree", 328 (void **)(&pfn_cuMemFree), cuda_driver_version, 0); 329 if (res != 0) { 330 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res); 331 return -1; 332 } 333 334 res = sym_cuGetProcAddress("cuMemHostRegister", 335 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0); 336 if (res != 0) { 337 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res); 338 return -1; 339 } 340 341 res = sym_cuGetProcAddress("cuMemHostUnregister", 342 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0); 343 if (res != 0) { 344 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res); 345 return -1; 346 } 347 348 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer", 349 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0); 350 if (res != 0) { 351 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res); 352 return -1; 353 } 354 355 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites", 356 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0); 357 if (res != 0) { 358 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res); 359 return -1; 360 } 361 362 return 0; 363 } 364 365 /* Generate a key from a memory pointer */ 366 static cuda_ptr_key 367 get_hash_from_ptr(void *ptr) 368 { 369 return (uintptr_t)ptr; 370 } 371 372 static uint32_t 373 mem_list_count_item(void) 374 { 375 return mem_alloc_list_last_elem; 376 } 377 378 /* Initiate list of memory allocations if not done yet */ 379 static struct mem_entry * 380 mem_list_add_item(void) 381 { 382 /* Initiate list of memory allocations if not done yet */ 383 if (mem_alloc_list_head == NULL) { 384 mem_alloc_list_head = rte_zmalloc(NULL, 385 sizeof(struct mem_entry), 386 RTE_CACHE_LINE_SIZE); 387 if (mem_alloc_list_head == NULL) { 388 rte_cuda_log(ERR, "Failed to allocate memory for memory list"); 389 return NULL; 390 } 391 392 mem_alloc_list_head->next = NULL; 393 mem_alloc_list_head->prev = NULL; 394 mem_alloc_list_tail = mem_alloc_list_head; 395 } else { 396 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL, 397 sizeof(struct mem_entry), 398 RTE_CACHE_LINE_SIZE); 399 400 if (mem_alloc_list_cur == NULL) { 401 rte_cuda_log(ERR, "Failed to allocate memory for memory list"); 402 return NULL; 403 } 404 405 mem_alloc_list_tail->next = mem_alloc_list_cur; 406 mem_alloc_list_cur->prev = mem_alloc_list_tail; 407 mem_alloc_list_tail = mem_alloc_list_tail->next; 408 mem_alloc_list_tail->next = NULL; 409 } 410 411 mem_alloc_list_last_elem++; 412 413 return mem_alloc_list_tail; 414 } 415 416 static struct mem_entry * 417 mem_list_find_item(cuda_ptr_key pk) 418 { 419 struct mem_entry *mem_alloc_list_cur = NULL; 420 421 if (mem_alloc_list_head == NULL) { 422 rte_cuda_log(ERR, "Memory list doesn't exist"); 423 return NULL; 424 } 425 426 if (mem_list_count_item() == 0) { 427 rte_cuda_log(ERR, "No items in memory list"); 428 return NULL; 429 } 430 431 mem_alloc_list_cur = mem_alloc_list_head; 432 433 while (mem_alloc_list_cur != NULL) { 434 if (mem_alloc_list_cur->pkey == pk) 435 return mem_alloc_list_cur; 436 mem_alloc_list_cur = mem_alloc_list_cur->next; 437 } 438 439 return mem_alloc_list_cur; 440 } 441 442 static int 443 mem_list_del_item(cuda_ptr_key pk) 444 { 445 struct mem_entry *mem_alloc_list_cur = NULL; 446 447 mem_alloc_list_cur = mem_list_find_item(pk); 448 if (mem_alloc_list_cur == NULL) 449 return -EINVAL; 450 451 /* if key is in head */ 452 if (mem_alloc_list_cur->prev == NULL) { 453 mem_alloc_list_head = mem_alloc_list_cur->next; 454 if (mem_alloc_list_head != NULL) 455 mem_alloc_list_head->prev = NULL; 456 } else { 457 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next; 458 if (mem_alloc_list_cur->next != NULL) 459 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev; 460 } 461 462 rte_free(mem_alloc_list_cur); 463 464 mem_alloc_list_last_elem--; 465 466 return 0; 467 } 468 469 static int 470 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info) 471 { 472 int ret = 0; 473 CUresult res; 474 struct rte_gpu_info parent_info; 475 CUexecAffinityParam affinityPrm; 476 const char *err_string; 477 struct cuda_info *private; 478 CUcontext current_ctx; 479 CUcontext input_ctx; 480 481 if (dev == NULL) { 482 rte_errno = ENODEV; 483 return -rte_errno; 484 } 485 486 /* Child initialization time probably called by rte_gpu_add_child() */ 487 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE && 488 dev->mpshared->dev_private == NULL) { 489 /* Store current ctx */ 490 res = pfn_cuCtxGetCurrent(¤t_ctx); 491 if (res != 0) { 492 pfn_cuGetErrorString(res, &(err_string)); 493 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", 494 err_string); 495 rte_errno = EPERM; 496 return -rte_errno; 497 } 498 499 /* Set child ctx as current ctx */ 500 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 501 res = pfn_cuCtxSetCurrent(input_ctx); 502 if (res != 0) { 503 pfn_cuGetErrorString(res, &(err_string)); 504 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s", 505 err_string); 506 rte_errno = EPERM; 507 return -rte_errno; 508 } 509 510 /* 511 * Ctx capacity info 512 */ 513 514 /* MPS compatible */ 515 res = pfn_cuCtxGetExecAffinity(&affinityPrm, 516 CU_EXEC_AFFINITY_TYPE_SM_COUNT); 517 if (res != 0) { 518 pfn_cuGetErrorString(res, &(err_string)); 519 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s", 520 err_string); 521 } 522 dev->mpshared->info.processor_count = 523 (uint32_t)affinityPrm.param.smCount.val; 524 525 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info); 526 if (ret) { 527 rte_errno = ENODEV; 528 return -rte_errno; 529 } 530 dev->mpshared->info.total_memory = parent_info.total_memory; 531 532 /* 533 * GPU Device private info 534 */ 535 dev->mpshared->dev_private = rte_zmalloc(NULL, 536 sizeof(struct cuda_info), 537 RTE_CACHE_LINE_SIZE); 538 if (dev->mpshared->dev_private == NULL) { 539 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private"); 540 rte_errno = EPERM; 541 return -rte_errno; 542 } 543 544 private = (struct cuda_info *)dev->mpshared->dev_private; 545 546 res = pfn_cuCtxGetDevice(&(private->cu_dev)); 547 if (res != 0) { 548 pfn_cuGetErrorString(res, &(err_string)); 549 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s", 550 err_string); 551 rte_errno = EPERM; 552 return -rte_errno; 553 } 554 555 res = pfn_cuDeviceGetName(private->gpu_name, 556 RTE_DEV_NAME_MAX_LEN, private->cu_dev); 557 if (res != 0) { 558 pfn_cuGetErrorString(res, &(err_string)); 559 rte_cuda_log(ERR, "cuDeviceGetName failed with %s", 560 err_string); 561 rte_errno = EPERM; 562 return -rte_errno; 563 } 564 565 /* Restore original ctx as current ctx */ 566 res = pfn_cuCtxSetCurrent(current_ctx); 567 if (res != 0) { 568 pfn_cuGetErrorString(res, &(err_string)); 569 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s", 570 err_string); 571 rte_errno = EPERM; 572 return -rte_errno; 573 } 574 } 575 576 *info = dev->mpshared->info; 577 578 return 0; 579 } 580 581 /* 582 * GPU Memory 583 */ 584 585 static int 586 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr) 587 { 588 CUresult res; 589 const char *err_string; 590 CUcontext current_ctx; 591 CUcontext input_ctx; 592 unsigned int flag = 1; 593 594 if (dev == NULL) 595 return -ENODEV; 596 597 /* Store current ctx */ 598 res = pfn_cuCtxGetCurrent(¤t_ctx); 599 if (res != 0) { 600 pfn_cuGetErrorString(res, &(err_string)); 601 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", 602 err_string); 603 rte_errno = EPERM; 604 return -rte_errno; 605 } 606 607 /* Set child ctx as current ctx */ 608 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 609 res = pfn_cuCtxSetCurrent(input_ctx); 610 if (res != 0) { 611 pfn_cuGetErrorString(res, &(err_string)); 612 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s", 613 err_string); 614 rte_errno = EPERM; 615 return -rte_errno; 616 } 617 618 /* Get next memory list item */ 619 mem_alloc_list_tail = mem_list_add_item(); 620 if (mem_alloc_list_tail == NULL) { 621 rte_errno = EPERM; 622 return -rte_errno; 623 } 624 625 /* Allocate memory */ 626 mem_alloc_list_tail->size = size; 627 mem_alloc_list_tail->size_orig = size + align; 628 629 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d), 630 mem_alloc_list_tail->size_orig); 631 if (res != 0) { 632 pfn_cuGetErrorString(res, &(err_string)); 633 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s", 634 err_string); 635 rte_errno = EPERM; 636 return -rte_errno; 637 } 638 639 /* Align memory address */ 640 mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d; 641 if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align) 642 mem_alloc_list_tail->ptr_d += (align - 643 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align)); 644 645 /* GPUDirect RDMA attribute required */ 646 res = pfn_cuPointerSetAttribute(&flag, 647 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, 648 mem_alloc_list_tail->ptr_d); 649 if (res != 0) { 650 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for " 651 "GPU memory at %"PRIu32", err %d", 652 (uint32_t)mem_alloc_list_tail->ptr_d, res); 653 rte_errno = EPERM; 654 return -rte_errno; 655 } 656 657 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d); 658 mem_alloc_list_tail->ptr_h = NULL; 659 mem_alloc_list_tail->dev = dev; 660 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 661 mem_alloc_list_tail->mtype = GPU_MEM; 662 663 /* Restore original ctx as current ctx */ 664 res = pfn_cuCtxSetCurrent(current_ctx); 665 if (res != 0) { 666 pfn_cuGetErrorString(res, &(err_string)); 667 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s", 668 err_string); 669 rte_errno = EPERM; 670 return -rte_errno; 671 } 672 673 *ptr = (void *)mem_alloc_list_tail->ptr_d; 674 675 return 0; 676 } 677 678 static int 679 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr) 680 { 681 CUresult res; 682 const char *err_string; 683 CUcontext current_ctx; 684 CUcontext input_ctx; 685 unsigned int flag = 1; 686 int use_ptr_h = 0; 687 688 if (dev == NULL) 689 return -ENODEV; 690 691 /* Store current ctx */ 692 res = pfn_cuCtxGetCurrent(¤t_ctx); 693 if (res != 0) { 694 pfn_cuGetErrorString(res, &(err_string)); 695 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", 696 err_string); 697 rte_errno = EPERM; 698 return -rte_errno; 699 } 700 701 /* Set child ctx as current ctx */ 702 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 703 res = pfn_cuCtxSetCurrent(input_ctx); 704 if (res != 0) { 705 pfn_cuGetErrorString(res, &(err_string)); 706 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s", 707 err_string); 708 rte_errno = EPERM; 709 return -rte_errno; 710 } 711 712 /* Get next memory list item */ 713 mem_alloc_list_tail = mem_list_add_item(); 714 if (mem_alloc_list_tail == NULL) { 715 rte_errno = EPERM; 716 return -rte_errno; 717 } 718 719 /* Allocate memory */ 720 mem_alloc_list_tail->size = size; 721 mem_alloc_list_tail->ptr_h = ptr; 722 723 res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h, 724 mem_alloc_list_tail->size, 725 CU_MEMHOSTREGISTER_PORTABLE | 726 CU_MEMHOSTREGISTER_DEVICEMAP); 727 if (res != 0) { 728 pfn_cuGetErrorString(res, &(err_string)); 729 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd", 730 err_string, 731 mem_alloc_list_tail->ptr_h, 732 mem_alloc_list_tail->size); 733 rte_errno = EPERM; 734 return -rte_errno; 735 } 736 737 res = pfn_cuDeviceGetAttribute(&(use_ptr_h), 738 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, 739 ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev); 740 if (res != 0) { 741 pfn_cuGetErrorString(res, &(err_string)); 742 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s", 743 err_string); 744 rte_errno = EPERM; 745 return -rte_errno; 746 } 747 748 if (use_ptr_h == 0) { 749 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d), 750 mem_alloc_list_tail->ptr_h, 0); 751 if (res != 0) { 752 pfn_cuGetErrorString(res, &(err_string)); 753 rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s", 754 err_string); 755 rte_errno = EPERM; 756 return -rte_errno; 757 } 758 759 if ((uintptr_t)mem_alloc_list_tail->ptr_d != 760 (uintptr_t)mem_alloc_list_tail->ptr_h) { 761 rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer"); 762 rte_errno = ENOTSUP; 763 return -rte_errno; 764 } 765 } else { 766 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h; 767 } 768 769 /* GPUDirect RDMA attribute required */ 770 res = pfn_cuPointerSetAttribute(&flag, 771 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, 772 mem_alloc_list_tail->ptr_d); 773 if (res != 0) { 774 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32 775 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res); 776 rte_errno = EPERM; 777 return -rte_errno; 778 } 779 780 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h); 781 mem_alloc_list_tail->size = size; 782 mem_alloc_list_tail->dev = dev; 783 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 784 mem_alloc_list_tail->mtype = CPU_REGISTERED; 785 mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d; 786 787 /* Restore original ctx as current ctx */ 788 res = pfn_cuCtxSetCurrent(current_ctx); 789 if (res != 0) { 790 pfn_cuGetErrorString(res, &(err_string)); 791 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s", 792 err_string); 793 rte_errno = EPERM; 794 return -rte_errno; 795 } 796 797 return 0; 798 } 799 800 static int 801 cuda_mem_free(struct rte_gpu *dev, void *ptr) 802 { 803 CUresult res; 804 struct mem_entry *mem_item; 805 const char *err_string; 806 cuda_ptr_key hk; 807 808 if (dev == NULL) 809 return -ENODEV; 810 811 hk = get_hash_from_ptr((void *)ptr); 812 813 mem_item = mem_list_find_item(hk); 814 if (mem_item == NULL) { 815 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr); 816 rte_errno = EPERM; 817 return -rte_errno; 818 } 819 820 if (mem_item->mtype == GPU_MEM) { 821 res = pfn_cuMemFree(mem_item->ptr_orig_d); 822 if (res != 0) { 823 pfn_cuGetErrorString(res, &(err_string)); 824 rte_cuda_log(ERR, "cuMemFree current failed with %s", 825 err_string); 826 rte_errno = EPERM; 827 return -rte_errno; 828 } 829 830 return mem_list_del_item(hk); 831 } 832 833 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype); 834 835 return -EPERM; 836 } 837 838 static int 839 cuda_mem_unregister(struct rte_gpu *dev, void *ptr) 840 { 841 CUresult res; 842 struct mem_entry *mem_item; 843 const char *err_string; 844 cuda_ptr_key hk; 845 846 if (dev == NULL) 847 return -ENODEV; 848 849 hk = get_hash_from_ptr((void *)ptr); 850 851 mem_item = mem_list_find_item(hk); 852 if (mem_item == NULL) { 853 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr); 854 rte_errno = EPERM; 855 return -rte_errno; 856 } 857 858 if (mem_item->mtype == CPU_REGISTERED) { 859 res = pfn_cuMemHostUnregister(ptr); 860 if (res != 0) { 861 pfn_cuGetErrorString(res, &(err_string)); 862 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s", 863 err_string); 864 rte_errno = EPERM; 865 return -rte_errno; 866 } 867 868 return mem_list_del_item(hk); 869 } 870 871 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype); 872 873 rte_errno = EPERM; 874 return -rte_errno; 875 } 876 877 static int 878 cuda_dev_close(struct rte_gpu *dev) 879 { 880 if (dev == NULL) 881 return -EINVAL; 882 883 rte_free(dev->mpshared->dev_private); 884 885 return 0; 886 } 887 888 static int 889 cuda_wmb(struct rte_gpu *dev) 890 { 891 CUresult res; 892 const char *err_string; 893 CUcontext current_ctx; 894 CUcontext input_ctx; 895 struct cuda_info *private; 896 897 if (dev == NULL) { 898 rte_errno = ENODEV; 899 return -rte_errno; 900 } 901 902 private = (struct cuda_info *)dev->mpshared->dev_private; 903 904 if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) { 905 /* 906 * No need to explicitly force the write ordering because 907 * the device natively supports it 908 */ 909 return 0; 910 } 911 912 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) { 913 /* 914 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function. 915 * Application needs to use alternative methods. 916 */ 917 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function." 918 "Application needs to use alternative methods."); 919 920 rte_errno = ENOTSUP; 921 return -rte_errno; 922 } 923 924 /* Store current ctx */ 925 res = pfn_cuCtxGetCurrent(¤t_ctx); 926 if (res != 0) { 927 pfn_cuGetErrorString(res, &(err_string)); 928 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", 929 err_string); 930 rte_errno = EPERM; 931 return -rte_errno; 932 } 933 934 /* Set child ctx as current ctx */ 935 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context); 936 res = pfn_cuCtxSetCurrent(input_ctx); 937 if (res != 0) { 938 pfn_cuGetErrorString(res, &(err_string)); 939 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s", 940 err_string); 941 rte_errno = EPERM; 942 return -rte_errno; 943 } 944 945 res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX, 946 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES); 947 if (res != 0) { 948 pfn_cuGetErrorString(res, &(err_string)); 949 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s", 950 err_string); 951 rte_errno = EPERM; 952 return -rte_errno; 953 } 954 955 /* Restore original ctx as current ctx */ 956 res = pfn_cuCtxSetCurrent(current_ctx); 957 if (res != 0) { 958 pfn_cuGetErrorString(res, &(err_string)); 959 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s", 960 err_string); 961 rte_errno = EPERM; 962 return -rte_errno; 963 } 964 965 return 0; 966 } 967 968 static int 969 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) 970 { 971 struct rte_gpu *dev = NULL; 972 CUresult res; 973 CUdevice cu_dev_id; 974 CUcontext pctx; 975 char dev_name[RTE_DEV_NAME_MAX_LEN]; 976 const char *err_string; 977 int processor_count = 0; 978 struct cuda_info *private; 979 980 if (pci_dev == NULL) { 981 rte_cuda_log(ERR, "NULL PCI device"); 982 rte_errno = ENODEV; 983 return -rte_errno; 984 } 985 986 rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name)); 987 988 /* Allocate memory to be used privately by drivers */ 989 dev = rte_gpu_allocate(pci_dev->device.name); 990 if (dev == NULL) { 991 rte_errno = ENODEV; 992 return -rte_errno; 993 } 994 995 /* Initialize values only for the first CUDA driver call */ 996 if (dev->mpshared->info.dev_id == 0) { 997 mem_alloc_list_head = NULL; 998 mem_alloc_list_tail = NULL; 999 mem_alloc_list_last_elem = 0; 1000 1001 /* Load libcuda.so library */ 1002 if (cuda_loader()) { 1003 rte_cuda_log(ERR, "CUDA Driver library not found"); 1004 rte_errno = ENOTSUP; 1005 return -rte_errno; 1006 } 1007 1008 /* Load initial CUDA functions */ 1009 if (cuda_sym_func_loader()) { 1010 rte_cuda_log(ERR, "CUDA functions not found in library"); 1011 rte_errno = ENOTSUP; 1012 return -rte_errno; 1013 } 1014 1015 /* 1016 * Required to initialize the CUDA Driver. 1017 * Multiple calls of cuInit() will return immediately 1018 * without making any relevant change 1019 */ 1020 sym_cuInit(0); 1021 1022 res = sym_cuDriverGetVersion(&cuda_driver_version); 1023 if (res != 0) { 1024 rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res); 1025 rte_errno = ENOTSUP; 1026 return -rte_errno; 1027 } 1028 1029 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) { 1030 rte_cuda_log(ERR, "CUDA Driver version found is %d. " 1031 "Minimum requirement is %d", 1032 cuda_driver_version, 1033 CUDA_DRIVER_MIN_VERSION); 1034 rte_errno = ENOTSUP; 1035 return -rte_errno; 1036 } 1037 1038 if (cuda_pfn_func_loader()) { 1039 rte_cuda_log(ERR, "CUDA PFN functions not found in library"); 1040 rte_errno = ENOTSUP; 1041 return -rte_errno; 1042 } 1043 } 1044 1045 /* Fill HW specific part of device structure */ 1046 dev->device = &pci_dev->device; 1047 dev->mpshared->info.numa_node = pci_dev->device.numa_node; 1048 1049 /* Get NVIDIA GPU Device descriptor */ 1050 res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name); 1051 if (res != 0) { 1052 pfn_cuGetErrorString(res, &(err_string)); 1053 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s", 1054 dev->device->name, res, err_string); 1055 rte_errno = EPERM; 1056 return -rte_errno; 1057 } 1058 1059 res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id); 1060 if (res != 0) { 1061 pfn_cuGetErrorString(res, &(err_string)); 1062 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s", 1063 dev->device->name, res, err_string); 1064 rte_errno = EPERM; 1065 return -rte_errno; 1066 } 1067 1068 res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version); 1069 if (res != 0) { 1070 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res); 1071 rte_errno = ENOTSUP; 1072 return -rte_errno; 1073 } 1074 1075 if (cuda_api_version < CUDA_API_MIN_VERSION) { 1076 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d", 1077 cuda_api_version, CUDA_API_MIN_VERSION); 1078 rte_errno = ENOTSUP; 1079 return -rte_errno; 1080 } 1081 1082 dev->mpshared->info.context = (uint64_t)pctx; 1083 1084 /* 1085 * GPU Device generic info 1086 */ 1087 1088 /* Processor count */ 1089 res = pfn_cuDeviceGetAttribute(&(processor_count), 1090 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 1091 cu_dev_id); 1092 if (res != 0) { 1093 pfn_cuGetErrorString(res, &(err_string)); 1094 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s", 1095 err_string); 1096 rte_errno = EPERM; 1097 return -rte_errno; 1098 } 1099 dev->mpshared->info.processor_count = (uint32_t)processor_count; 1100 1101 /* Total memory */ 1102 res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id); 1103 if (res != 0) { 1104 pfn_cuGetErrorString(res, &(err_string)); 1105 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s", 1106 err_string); 1107 rte_errno = EPERM; 1108 return -rte_errno; 1109 } 1110 1111 /* 1112 * GPU Device private info 1113 */ 1114 dev->mpshared->dev_private = rte_zmalloc(NULL, 1115 sizeof(struct cuda_info), 1116 RTE_CACHE_LINE_SIZE); 1117 if (dev->mpshared->dev_private == NULL) { 1118 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private"); 1119 rte_errno = EPERM; 1120 return -rte_errno; 1121 } 1122 1123 private = (struct cuda_info *)dev->mpshared->dev_private; 1124 private->cu_dev = cu_dev_id; 1125 res = pfn_cuDeviceGetName(private->gpu_name, 1126 RTE_DEV_NAME_MAX_LEN, 1127 cu_dev_id); 1128 if (res != 0) { 1129 pfn_cuGetErrorString(res, &(err_string)); 1130 rte_cuda_log(ERR, "cuDeviceGetName failed with %s", 1131 err_string); 1132 rte_errno = EPERM; 1133 return -rte_errno; 1134 } 1135 1136 res = pfn_cuDeviceGetAttribute(&(private->gdr_supported), 1137 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, 1138 cu_dev_id); 1139 if (res != 0) { 1140 pfn_cuGetErrorString(res, &(err_string)); 1141 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s", 1142 err_string); 1143 rte_errno = EPERM; 1144 return -rte_errno; 1145 } 1146 1147 if (private->gdr_supported == 0) 1148 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA", 1149 pci_dev->device.name); 1150 1151 res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering), 1152 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, 1153 cu_dev_id); 1154 if (res != 0) { 1155 pfn_cuGetErrorString(res, &(err_string)); 1156 rte_cuda_log(ERR, 1157 "cuDeviceGetAttribute failed with %s", 1158 err_string); 1159 rte_errno = EPERM; 1160 return -rte_errno; 1161 } 1162 1163 if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) { 1164 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type), 1165 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, 1166 cu_dev_id); 1167 if (res != 0) { 1168 pfn_cuGetErrorString(res, &(err_string)); 1169 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s", 1170 err_string); 1171 rte_errno = EPERM; 1172 return -rte_errno; 1173 } 1174 1175 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) 1176 rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported"); 1177 } 1178 1179 dev->ops.dev_info_get = cuda_dev_info_get; 1180 dev->ops.dev_close = cuda_dev_close; 1181 dev->ops.mem_alloc = cuda_mem_alloc; 1182 dev->ops.mem_free = cuda_mem_free; 1183 dev->ops.mem_register = cuda_mem_register; 1184 dev->ops.mem_unregister = cuda_mem_unregister; 1185 dev->ops.mem_cpu_map = NULL; 1186 dev->ops.mem_cpu_unmap = NULL; 1187 dev->ops.wmb = cuda_wmb; 1188 1189 rte_gpu_complete_new(dev); 1190 1191 rte_cuda_debug("dev id = %u name = %s", 1192 dev->mpshared->info.dev_id, private->gpu_name); 1193 1194 return 0; 1195 } 1196 1197 static int 1198 cuda_gpu_remove(struct rte_pci_device *pci_dev) 1199 { 1200 struct rte_gpu *dev; 1201 int ret; 1202 uint8_t gpu_id; 1203 1204 if (pci_dev == NULL) { 1205 rte_errno = ENODEV; 1206 return -rte_errno; 1207 } 1208 1209 dev = rte_gpu_get_by_name(pci_dev->device.name); 1210 if (dev == NULL) { 1211 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it", 1212 pci_dev->device.name); 1213 rte_errno = ENODEV; 1214 return -rte_errno; 1215 } 1216 gpu_id = dev->mpshared->info.dev_id; 1217 1218 /* release dev from library */ 1219 ret = rte_gpu_release(dev); 1220 if (ret) 1221 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret); 1222 1223 rte_cuda_debug("Destroyed dev = %u", gpu_id); 1224 1225 return 0; 1226 } 1227 1228 static struct rte_pci_driver rte_cuda_driver = { 1229 .id_table = pci_id_cuda_map, 1230 .drv_flags = RTE_PCI_DRV_WC_ACTIVATE, 1231 .probe = cuda_gpu_probe, 1232 .remove = cuda_gpu_remove, 1233 }; 1234 1235 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver); 1236 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map); 1237 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)"); 1238