/*	$NetBSD: amdgpu_gfx.c,v 1.6 2021/12/19 12:31:45 riastradh Exp $	*/

/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_gfx.c,v 1.6 2021/12/19 12:31:45 riastradh Exp $");

#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "amdgpu_rlc.h"
#include "amdgpu_ras.h"

/* delay 0.1 second to enable gfx off feature */
#define GFX_OFF_DELAY_ENABLE	msecs_to_jiffies(100)

/*
 * GPU GFX IP block helper functions.
 */

int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec,
				int pipe, int queue)
{
	int bit = 0;

	bit += mec * adev->gfx.mec.num_pipe_per_mec
		* adev->gfx.mec.num_queue_per_pipe;
	bit += pipe * adev->gfx.mec.num_queue_per_pipe;
	bit += queue;

	return bit;
}

void amdgpu_gfx_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
				 int *mec, int *pipe, int *queue)
{
	*queue = bit % adev->gfx.mec.num_queue_per_pipe;
	*pipe = (bit / adev->gfx.mec.num_queue_per_pipe)
		% adev->gfx.mec.num_pipe_per_mec;
	*mec = (bit / adev->gfx.mec.num_queue_per_pipe)
	       / adev->gfx.mec.num_pipe_per_mec;
}

bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
				     int mec, int pipe, int queue)
{
	return test_bit(amdgpu_gfx_mec_queue_to_bit(adev, mec, pipe, queue),
			adev->gfx.mec.queue_bitmap);
}

int amdgpu_gfx_me_queue_to_bit(struct amdgpu_device *adev,
			       int me, int pipe, int queue)
{
	int bit = 0;

	bit += me * adev->gfx.me.num_pipe_per_me
		* adev->gfx.me.num_queue_per_pipe;
	bit += pipe * adev->gfx.me.num_queue_per_pipe;
	bit += queue;

	return bit;
}

void amdgpu_gfx_bit_to_me_queue(struct amdgpu_device *adev, int bit,
				int *me, int *pipe, int *queue)
{
	*queue = bit % adev->gfx.me.num_queue_per_pipe;
	*pipe = (bit / adev->gfx.me.num_queue_per_pipe)
		% adev->gfx.me.num_pipe_per_me;
	*me = (bit / adev->gfx.me.num_queue_per_pipe)
	      / adev->gfx.me.num_pipe_per_me;
}

bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev,
				    int me, int pipe, int queue)
{
	return test_bit(amdgpu_gfx_me_queue_to_bit(adev, me, pipe, queue),
			adev->gfx.me.queue_bitmap);
}

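/*
 * Worked example (illustrative values, not taken from real hardware): with
 * num_pipe_per_mec = 4 and num_queue_per_pipe = 8, the helpers above map
 * (mec, pipe, queue) = (1, 2, 3) to bit 1*4*8 + 2*8 + 3 = 51, and
 * amdgpu_gfx_bit_to_mec_queue(adev, 51, ...) recovers (1, 2, 3) again via
 * 51 % 8 = 3, (51 / 8) % 4 = 2 and (51 / 8) / 4 = 1.
 */
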
/**
 * amdgpu_gfx_scratch_get - Allocate a scratch register
 *
 * @adev: amdgpu_device pointer
 * @reg: scratch register mmio offset
 *
 * Allocate a CP scratch register for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_gfx_scratch_get(struct amdgpu_device *adev, uint32_t *reg)
{
	int i;

	i = ffs(adev->gfx.scratch.free_mask);
	if (i != 0 && i <= adev->gfx.scratch.num_reg) {
		i--;
		adev->gfx.scratch.free_mask &= ~(1u << i);
		*reg = adev->gfx.scratch.reg_base + i;
		return 0;
	}
	return -EINVAL;
}

/**
 * amdgpu_gfx_scratch_free - Free a scratch register
 *
 * @adev: amdgpu_device pointer
 * @reg: scratch register mmio offset
 *
 * Free a CP scratch register allocated for use by the driver (all asics).
 */
void amdgpu_gfx_scratch_free(struct amdgpu_device *adev, uint32_t reg)
{
	adev->gfx.scratch.free_mask |= 1u << (reg - adev->gfx.scratch.reg_base);
}

/**
 * amdgpu_gfx_parse_disable_cu - Parse the disable_cu module parameter
 *
 * @mask: array in which the per-shader array disable masks will be stored
 * @max_se: number of SEs
 * @max_sh: number of SHs
 *
 * The disable_cu module parameter is a comma-separated list of se.sh.cu
 * triples.  The bitmask of CUs to be disabled in the shader array determined
 * by se and sh is stored in mask[se * max_sh + sh].
 */
void amdgpu_gfx_parse_disable_cu(unsigned *mask, unsigned max_se, unsigned max_sh)
{
	unsigned se, sh, cu;
	const char *p;

	memset(mask, 0, sizeof(*mask) * max_se * max_sh);

	if (!amdgpu_disable_cu || !*amdgpu_disable_cu)
		return;

	p = amdgpu_disable_cu;
	for (;;) {
		char *next;
		int ret = sscanf(p, "%u.%u.%u", &se, &sh, &cu);
		if (ret < 3) {
			DRM_ERROR("amdgpu: could not parse disable_cu\n");
			return;
		}

		if (se < max_se && sh < max_sh && cu < 16) {
			DRM_INFO("amdgpu: disabling CU %u.%u.%u\n", se, sh, cu);
			mask[se * max_sh + sh] |= 1u << cu;
		} else {
			DRM_ERROR("amdgpu: disable_cu %u.%u.%u is out of range\n",
				  se, sh, cu);
		}

		next = strchr(p, ',');
		if (!next)
			break;
		p = next + 1;
	}
}

static bool amdgpu_gfx_is_multipipe_capable(struct amdgpu_device *adev)
{
	if (amdgpu_compute_multipipe != -1) {
		DRM_INFO("amdgpu: forcing compute pipe policy %d\n",
			 amdgpu_compute_multipipe);
		return amdgpu_compute_multipipe == 1;
	}

	/* FIXME: spreading the queues across pipes causes perf regressions
	 * on POLARIS11 compute workloads */
	if (adev->asic_type == CHIP_POLARIS11)
		return false;

	return adev->gfx.mec.num_mec > 1;
}

void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)
{
	int i, queue, pipe, mec;
	bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);

	/* policy for amdgpu compute queue ownership */
	for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
		queue = i % adev->gfx.mec.num_queue_per_pipe;
		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
			% adev->gfx.mec.num_pipe_per_mec;
		mec = (i / adev->gfx.mec.num_queue_per_pipe)
			/ adev->gfx.mec.num_pipe_per_mec;

		/* we've run out of HW */
		if (mec >= adev->gfx.mec.num_mec)
			break;

		if (multipipe_policy) {
			/* policy: amdgpu owns the first two queues of each pipe
			 * in the first MEC */
			if (mec == 0 && queue < 2)
				set_bit(i, adev->gfx.mec.queue_bitmap);
		} else {
			/* policy: amdgpu owns all queues in the first pipe */
			if (mec == 0 && pipe == 0)
				set_bit(i, adev->gfx.mec.queue_bitmap);
		}
	}

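	/*
	 * Illustrative outcome (assumed configuration, not queried from real
	 * hardware): with 2 MECs of 4 pipes x 8 queues each, the multipipe
	 * policy above sets bits 0, 1, 8, 9, 16, 17, 24 and 25 (the first two
	 * queues of each pipe in MEC0), while the single-pipe policy sets
	 * bits 0-7 (all queues of MEC0 pipe 0); either way eight compute
	 * rings result below.
	 */
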
	/* update the number of active compute rings */
	adev->gfx.num_compute_rings =
		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/* If you hit this case and edited the policy, you probably just
	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
	if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
}

void amdgpu_gfx_graphics_queue_acquire(struct amdgpu_device *adev)
{
	int i, queue, me;

	for (i = 0; i < AMDGPU_MAX_GFX_QUEUES; ++i) {
		queue = i % adev->gfx.me.num_queue_per_pipe;
		me = (i / adev->gfx.me.num_queue_per_pipe)
		      / adev->gfx.me.num_pipe_per_me;

		if (me >= adev->gfx.me.num_me)
			break;
		/* policy: amdgpu owns the first queue per pipe at this stage;
		 * will extend to multiple queues per pipe later */
		if (me == 0 && queue < 1)
			set_bit(i, adev->gfx.me.queue_bitmap);
	}

	/* update the number of active graphics rings */
	adev->gfx.num_gfx_rings =
		bitmap_weight(adev->gfx.me.queue_bitmap, AMDGPU_MAX_GFX_QUEUES);
}

static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
	int queue_bit;
	int mec, pipe, queue;

	queue_bit = adev->gfx.mec.num_mec
		    * adev->gfx.mec.num_pipe_per_mec
		    * adev->gfx.mec.num_queue_per_pipe;

	while (queue_bit-- >= 0) {
		if (test_bit(queue_bit, adev->gfx.mec.queue_bitmap))
			continue;

		amdgpu_gfx_bit_to_mec_queue(adev, queue_bit, &mec, &pipe, &queue);

		/*
		 * 1. Using pipes 2/3 from MEC 2 seems to cause problems.
		 * 2. It must use queue id 0, because CGPG_IDLE/SAVE/LOAD/RUN
		 *    can only be issued on queue 0.
		 */
		if ((mec == 1 && pipe > 1) || queue != 0)
			continue;

		ring->me = mec + 1;
		ring->pipe = pipe;
		ring->queue = queue;

		return 0;
	}

	dev_err(adev->dev, "Failed to find a queue for KIQ\n");
	return -EINVAL;
}

int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring,
			     struct amdgpu_irq_src *irq)
{
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
	int r = 0;

	spin_lock_init(&kiq->ring_lock);

	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
	if (r)
		return r;

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = true;
	ring->doorbell_index = adev->doorbell_index.kiq;

	r = amdgpu_gfx_kiq_acquire(adev, ring);
	if (r)
		return r;

	ring->eop_gpu_addr = kiq->eop_gpu_addr;
	snprintf(ring->name, sizeof(ring->name), "kiq_%d.%d.%d", ring->me, ring->pipe, ring->queue);
	r = amdgpu_ring_init(adev, ring, 1024,
			     irq, AMDGPU_CP_KIQ_IRQ_DRIVER0);
	if (r)
		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);

	return r;
}

void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
{
	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
	amdgpu_ring_fini(ring);
	spin_lock_destroy(&ring->adev->gfx.kiq.ring_lock);
}

void amdgpu_gfx_kiq_fini(struct amdgpu_device *adev)
{
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

	amdgpu_bo_free_kernel(&kiq->eop_obj, &kiq->eop_gpu_addr, NULL);
}

int amdgpu_gfx_kiq_init(struct amdgpu_device *adev,
			unsigned hpd_size)
{
	int r;
	u32 *hpd;
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

	r = amdgpu_bo_create_kernel(adev, hpd_size, PAGE_SIZE,
				    AMDGPU_GEM_DOMAIN_GTT, &kiq->eop_obj,
				    &kiq->eop_gpu_addr, (void **)&hpd);
	if (r) {
		dev_warn(adev->dev, "failed to create KIQ bo (%d).\n", r);
		return r;
	}

	memset(hpd, 0, hpd_size);

	r = amdgpu_bo_reserve(kiq->eop_obj, true);
	if (unlikely(r != 0))
		dev_warn(adev->dev, "(%d) reserve kiq eop bo failed\n", r);
	amdgpu_bo_kunmap(kiq->eop_obj);
	amdgpu_bo_unreserve(kiq->eop_obj);

	return 0;
}

/* create MQD for each compute/gfx queue */
int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
			   unsigned mqd_size)
{
	struct amdgpu_ring *ring = NULL;
	int r, i;

	/* create MQD for KIQ */
	ring = &adev->gfx.kiq.ring;
	if (!ring->mqd_obj) {
		/* Originally the KIQ MQD was placed in the GTT domain, but for
		 * SRIOV the VRAM domain is a must; otherwise the hypervisor
		 * triggers a SAVE_VF failure after the driver is unloaded,
		 * since the MQD has been deallocated and gart-unbound.  To
		 * avoid diverging code paths, use the VRAM domain for the KIQ
		 * MQD on both SRIOV and bare-metal.
		 */
		r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
					    AMDGPU_GEM_DOMAIN_VRAM, &ring->mqd_obj,
					    &ring->mqd_gpu_addr, &ring->mqd_ptr);
		if (r) {
			dev_warn(adev->dev, "failed to create ring mqd bo (%d)", r);
			return r;
		}

		/* prepare MQD backup */
		adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS] = kmalloc(mqd_size, GFP_KERNEL);
		if (!adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS])
			dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
	}

	if (adev->asic_type >= CHIP_NAVI10 && amdgpu_async_gfx_ring) {
		/* create MQD for each KGQ */
		for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
			ring = &adev->gfx.gfx_ring[i];
			if (!ring->mqd_obj) {
				r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
							    AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
							    &ring->mqd_gpu_addr, &ring->mqd_ptr);
				if (r) {
					dev_warn(adev->dev, "failed to create ring mqd bo (%d)", r);
					return r;
				}

				/* prepare MQD backup */
				adev->gfx.me.mqd_backup[i] = kmalloc(mqd_size, GFP_KERNEL);
				if (!adev->gfx.me.mqd_backup[i])
					dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
			}
		}
	}

	/* create MQD for each KCQ */
	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
		ring = &adev->gfx.compute_ring[i];
		if (!ring->mqd_obj) {
			r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
						    AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
						    &ring->mqd_gpu_addr, &ring->mqd_ptr);
			if (r) {
				dev_warn(adev->dev, "failed to create ring mqd bo (%d)", r);
				return r;
			}

			/* prepare MQD backup */
			adev->gfx.mec.mqd_backup[i] = kmalloc(mqd_size, GFP_KERNEL);
			if (!adev->gfx.mec.mqd_backup[i])
				dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
		}
	}

	return 0;
}

void amdgpu_gfx_mqd_sw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = NULL;
	int i;

	if (adev->asic_type >= CHIP_NAVI10 && amdgpu_async_gfx_ring) {
		for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
			ring = &adev->gfx.gfx_ring[i];
			kfree(adev->gfx.me.mqd_backup[i]);
			amdgpu_bo_free_kernel(&ring->mqd_obj,
					      &ring->mqd_gpu_addr,
					      &ring->mqd_ptr);
		}
	}

	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
		ring = &adev->gfx.compute_ring[i];
		kfree(adev->gfx.mec.mqd_backup[i]);
		amdgpu_bo_free_kernel(&ring->mqd_obj,
				      &ring->mqd_gpu_addr,
				      &ring->mqd_ptr);
	}

	ring = &adev->gfx.kiq.ring;
	kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]);
	amdgpu_bo_free_kernel(&ring->mqd_obj,
			      &ring->mqd_gpu_addr,
			      &ring->mqd_ptr);
}

int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
{
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
	struct amdgpu_ring *kiq_ring = &kiq->ring;
	int i;

	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
					adev->gfx.num_compute_rings))
		return -ENOMEM;

	for (i = 0; i < adev->gfx.num_compute_rings; i++)
		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
					   RESET_QUEUES, 0, 0);

	return amdgpu_ring_test_ring(kiq_ring);
}

int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev)
{
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
	uint64_t queue_mask = 0;
	int r, i;

	if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources)
		return -EINVAL;

	for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
		if (!test_bit(i, adev->gfx.mec.queue_bitmap))
			continue;

		/* This situation may be hit in the future if a new HW
		 * generation exposes more than 64 queues. If so, the
		 * definition of queue_mask needs updating */
		if (WARN_ON(i >= (sizeof(queue_mask) * 8))) {
			DRM_ERROR("Invalid KCQ enabled: %d\n", i);
			break;
		}

		queue_mask |= (1ull << i);
	}

	DRM_INFO("kiq ring mec %d pipe %d q %d\n", kiq_ring->me, kiq_ring->pipe,
		 kiq_ring->queue);

	r = amdgpu_ring_alloc(kiq_ring, kiq->pmf->map_queues_size *
					adev->gfx.num_compute_rings +
					kiq->pmf->set_resources_size);
	if (r) {
		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
		return r;
	}

	kiq->pmf->kiq_set_resources(kiq_ring, queue_mask);
	for (i = 0; i < adev->gfx.num_compute_rings; i++)
		kiq->pmf->kiq_map_queues(kiq_ring, &adev->gfx.compute_ring[i]);

	r = amdgpu_ring_test_helper(kiq_ring);
	if (r)
		DRM_ERROR("KCQ enable failed\n");

	return r;
}

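/*
 * Illustrative GFXOFF usage (a sketch of a caller, not code from this file):
 * a client that needs the GFX block to stay powered brackets its accesses
 * with a disable/enable pair, relying on the request counting implemented in
 * amdgpu_gfx_off_ctrl() below:
 *
 *	amdgpu_gfx_off_ctrl(adev, false);	// request GFX power held on
 *	...access GFX registers...
 *	amdgpu_gfx_off_ctrl(adev, true);	// cancel the request again
 */
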
/* amdgpu_gfx_off_ctrl - Handle the gfx off feature enable/disable
 *
 * @adev: amdgpu_device pointer
 * @enable: true to enable the gfx off feature, false to disable it
 *
 * 1. The gfx off feature will be enabled by the gfx ip after gfx cg pg is enabled.
 * 2. Other clients can send a request to disable the gfx off feature; the request should be honored.
 * 3. Other clients can cancel their request to disable the gfx off feature.
 * 4. Other clients should not send a request to enable the gfx off feature before disabling it.
 */

void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
{
	if (!(adev->pm.pp_feature & PP_GFXOFF_MASK))
		return;

	mutex_lock(&adev->gfx.gfx_off_mutex);

	if (!enable)
		adev->gfx.gfx_off_req_count++;
	else if (adev->gfx.gfx_off_req_count > 0)
		adev->gfx.gfx_off_req_count--;

	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
		schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
	} else if (!enable && adev->gfx.gfx_off_state) {
		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false))
			adev->gfx.gfx_off_state = false;
	}

	mutex_unlock(&adev->gfx.gfx_off_mutex);
}

int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "gfx_err_count",
		.debugfs_name = "gfx_err_inject",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_gfx_process_ras_data_cb,
	};

	if (!adev->gfx.ras_if) {
		adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gfx.ras_if)
			return -ENOMEM;
		adev->gfx.ras_if->block = AMDGPU_RAS_BLOCK__GFX;
		adev->gfx.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gfx.ras_if->sub_block_index = 0;
		strcpy(adev->gfx.ras_if->name, "gfx");
	}
	fs_info.head = ih_info.head = *adev->gfx.ras_if;

	r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* free gfx ras_if if ras is not supported */
		r = 0;
		goto free;
	}

	return 0;
late_fini:
	amdgpu_ras_late_fini(adev, adev->gfx.ras_if, &ih_info);
free:
	kfree(adev->gfx.ras_if);
	adev->gfx.ras_if = NULL;
	return r;
}

void amdgpu_gfx_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX) &&
			adev->gfx.ras_if) {
		struct ras_common_if *ras_if = adev->gfx.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_gfx_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
		void *err_data,
		struct amdgpu_iv_entry *entry)
{
	/* TODO: a UE (uncorrectable error) will trigger an interrupt.
	 *
	 * When "Full RAS" is enabled, the per-IP interrupt sources should
	 * be disabled and the driver should only look for the aggregated
	 * interrupt via sync flood.
	 */
	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, err_data);
		amdgpu_ras_reset_gpu(adev);
	}
	return AMDGPU_RAS_SUCCESS;
}

int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
				struct amdgpu_irq_src *source,
				struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->gfx.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	DRM_ERROR("CP ECC ERROR IRQ\n");
	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}

uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	signed long r, cnt = 0;
	unsigned long flags;
	uint32_t seq;
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
	struct amdgpu_ring *ring = &kiq->ring;

	BUG_ON(!ring->funcs->emit_rreg);

	spin_lock_irqsave(&kiq->ring_lock, flags);
	amdgpu_ring_alloc(ring, 32);
	amdgpu_ring_emit_rreg(ring, reg);
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock_irqrestore(&kiq->ring_lock, flags);

	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);

	/* Don't wait any longer in the GPU reset case, because doing so may
	 * block the gpu_recover() routine forever; e.g. this virt_kiq_rreg
	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() would
	 * never return if we kept waiting here, which would make
	 * gpu_recover() hang.
	 *
	 * Also don't wait any longer in IRQ context.
	 */
	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
		goto failed_kiq_read;

	might_sleep();
	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
		r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
	}

	if (cnt > MAX_KIQ_REG_TRY)
		goto failed_kiq_read;

	return adev->wb.wb[kiq->reg_val_offs];

failed_kiq_read:
	pr_err("failed to read reg:%x\n", reg);
	return ~0;
}

void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	signed long r, cnt = 0;
	unsigned long flags;
	uint32_t seq;
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
	struct amdgpu_ring *ring = &kiq->ring;

	BUG_ON(!ring->funcs->emit_wreg);

	spin_lock_irqsave(&kiq->ring_lock, flags);
	amdgpu_ring_alloc(ring, 32);
	amdgpu_ring_emit_wreg(ring, reg, v);
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock_irqrestore(&kiq->ring_lock, flags);

	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);

	/* Don't wait any longer in the GPU reset case, because doing so may
	 * block the gpu_recover() routine forever; e.g. this virt_kiq_wreg
	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() would
	 * never return if we kept waiting here, which would make
	 * gpu_recover() hang.
	 *
	 * Also don't wait any longer in IRQ context.
	 */
	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
		goto failed_kiq_write;

	might_sleep();
	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
		r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
	}

	if (cnt > MAX_KIQ_REG_TRY)
		goto failed_kiq_write;

	return;

failed_kiq_write:
	pr_err("failed to write reg:%x\n", reg);
}