1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 101 static const struct drm_driver amdgpu_kms_driver; 102 103 const char *amdgpu_asic_name[] = { 104 "TAHITI", 105 "PITCAIRN", 106 "VERDE", 107 
"OLAND", 108 "HAINAN", 109 "BONAIRE", 110 "KAVERI", 111 "KABINI", 112 "HAWAII", 113 "MULLINS", 114 "TOPAZ", 115 "TONGA", 116 "FIJI", 117 "CARRIZO", 118 "STONEY", 119 "POLARIS10", 120 "POLARIS11", 121 "POLARIS12", 122 "VEGAM", 123 "VEGA10", 124 "VEGA12", 125 "VEGA20", 126 "RAVEN", 127 "ARCTURUS", 128 "RENOIR", 129 "ALDEBARAN", 130 "NAVI10", 131 "CYAN_SKILLFISH", 132 "NAVI14", 133 "NAVI12", 134 "SIENNA_CICHLID", 135 "NAVY_FLOUNDER", 136 "VANGOGH", 137 "DIMGREY_CAVEFISH", 138 "BEIGE_GOBY", 139 "YELLOW_CARP", 140 "IP DISCOVERY", 141 "LAST", 142 }; 143 144 /** 145 * DOC: pcie_replay_count 146 * 147 * The amdgpu driver provides a sysfs API for reporting the total number 148 * of PCIe replays (NAKs) 149 * The file pcie_replay_count is used for this and returns the total 150 * number of replays as a sum of the NAKs generated and NAKs received 151 */ 152 153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 154 struct device_attribute *attr, char *buf) 155 { 156 struct drm_device *ddev = dev_get_drvdata(dev); 157 struct amdgpu_device *adev = drm_to_adev(ddev); 158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 159 160 return sysfs_emit(buf, "%llu\n", cnt); 161 } 162 163 static DEVICE_ATTR(pcie_replay_count, 0444, 164 amdgpu_device_get_pcie_replay_count, NULL); 165 166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 167 168 169 /** 170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 171 * 172 * @dev: drm_device pointer 173 * 174 * Returns true if the device is a dGPU with ATPX power control, 175 * otherwise return false. 176 */ 177 bool amdgpu_device_supports_px(struct drm_device *dev) 178 { 179 struct amdgpu_device *adev = drm_to_adev(dev); 180 181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 182 return true; 183 return false; 184 } 185 186 /** 187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 188 * 189 * @dev: drm_device pointer 190 * 191 * Returns true if the device is a dGPU with ACPI power control, 192 * otherwise return false. 193 */ 194 bool amdgpu_device_supports_boco(struct drm_device *dev) 195 { 196 struct amdgpu_device *adev = drm_to_adev(dev); 197 198 if (adev->has_pr3 || 199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 200 return true; 201 return false; 202 } 203 204 /** 205 * amdgpu_device_supports_baco - Does the device support BACO 206 * 207 * @dev: drm_device pointer 208 * 209 * Returns true if the device supporte BACO, 210 * otherwise return false. 211 */ 212 bool amdgpu_device_supports_baco(struct drm_device *dev) 213 { 214 struct amdgpu_device *adev = drm_to_adev(dev); 215 216 return amdgpu_asic_supports_baco(adev); 217 } 218 219 /** 220 * amdgpu_device_supports_smart_shift - Is the device dGPU with 221 * smart shift support 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with Smart Shift support, 226 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes actually transferred.
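 *
 * Note: this path is only available on 64-bit kernels and only when the
 * visible-VRAM aperture is CPU mapped (adev->mman.aper_base_kaddr is set);
 * callers such as amdgpu_device_vram_access() fall back to MM_INDEX/MM_DATA
 * access for whatever is not covered here.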
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA access for the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
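 *
 * Most code does not call this helper directly but goes through the RREG32()
 * and RREG32_NO_KIQ() wrapper macros, which expand to this function.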
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read helper function with byte granularity
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write helper function with byte granularity
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
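 *
 * As with reads, most code goes through the WREG32() and WREG32_NO_KIQ()
 * wrapper macros rather than calling this helper directly.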
481 */ 482 void amdgpu_device_wreg(struct amdgpu_device *adev, 483 uint32_t reg, uint32_t v, 484 uint32_t acc_flags) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if ((reg * 4) < adev->rmmio_size) { 490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 491 amdgpu_sriov_runtime(adev) && 492 down_read_trylock(&adev->reset_domain->sem)) { 493 amdgpu_kiq_wreg(adev, reg, v); 494 up_read(&adev->reset_domain->sem); 495 } else { 496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 497 } 498 } else { 499 adev->pcie_wreg(adev, reg * 4, v); 500 } 501 502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 503 } 504 505 /** 506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 507 * 508 * @adev: amdgpu_device pointer 509 * @reg: mmio/rlc register 510 * @v: value to write 511 * 512 * this function is invoked only for the debugfs register access 513 */ 514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 515 uint32_t reg, uint32_t v, 516 uint32_t xcc_id) 517 { 518 if (amdgpu_device_skip_hw_access(adev)) 519 return; 520 521 if (amdgpu_sriov_fullaccess(adev) && 522 adev->gfx.rlc.funcs && 523 adev->gfx.rlc.funcs->is_rlcg_access_range) { 524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 526 } else if ((reg * 4) >= adev->rmmio_size) { 527 adev->pcie_wreg(adev, reg * 4, v); 528 } else { 529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 530 } 531 } 532 533 /** 534 * amdgpu_device_indirect_rreg - read an indirect register 535 * 536 * @adev: amdgpu_device pointer 537 * @reg_addr: indirect register address to read from 538 * 539 * Returns the value of indirect register @reg_addr 540 */ 541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 542 u32 reg_addr) 543 { 544 unsigned long flags, pcie_index, pcie_data; 545 void __iomem *pcie_index_offset; 546 void __iomem *pcie_data_offset; 547 u32 r; 548 549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 551 552 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 555 556 writel(reg_addr, pcie_index_offset); 557 readl(pcie_index_offset); 558 r = readl(pcie_data_offset); 559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 560 561 return r; 562 } 563 564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 565 u64 reg_addr) 566 { 567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 568 u32 r; 569 void __iomem *pcie_index_offset; 570 void __iomem *pcie_index_hi_offset; 571 void __iomem *pcie_data_offset; 572 573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 575 if (adev->nbio.funcs->get_pcie_index_hi_offset) 576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 577 else 578 pcie_index_hi = 0; 579 580 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 583 if (pcie_index_hi != 0) 584 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 585 pcie_index_hi * 4; 586 587 writel(reg_addr, pcie_index_offset); 588 readl(pcie_index_offset); 589 if (pcie_index_hi != 0) { 590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 591 readl(pcie_index_hi_offset); 592 
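		/* the readl() above flushes the posted index writes so they
		 * reach the device before the data register is accessed
		 */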
} 593 r = readl(pcie_data_offset); 594 595 /* clear the high bits */ 596 if (pcie_index_hi != 0) { 597 writel(0, pcie_index_hi_offset); 598 readl(pcie_index_hi_offset); 599 } 600 601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 602 603 return r; 604 } 605 606 /** 607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 608 * 609 * @adev: amdgpu_device pointer 610 * @reg_addr: indirect register address to read from 611 * 612 * Returns the value of indirect register @reg_addr 613 */ 614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 615 u32 reg_addr) 616 { 617 unsigned long flags, pcie_index, pcie_data; 618 void __iomem *pcie_index_offset; 619 void __iomem *pcie_data_offset; 620 u64 r; 621 622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 624 625 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 628 629 /* read low 32 bits */ 630 writel(reg_addr, pcie_index_offset); 631 readl(pcie_index_offset); 632 r = readl(pcie_data_offset); 633 /* read high 32 bits */ 634 writel(reg_addr + 4, pcie_index_offset); 635 readl(pcie_index_offset); 636 r |= ((u64)readl(pcie_data_offset) << 32); 637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 638 639 return r; 640 } 641 642 /** 643 * amdgpu_device_indirect_wreg - write an indirect register address 644 * 645 * @adev: amdgpu_device pointer 646 * @reg_addr: indirect register offset 647 * @reg_data: indirect register data 648 * 649 */ 650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 651 u32 reg_addr, u32 reg_data) 652 { 653 unsigned long flags, pcie_index, pcie_data; 654 void __iomem *pcie_index_offset; 655 void __iomem *pcie_data_offset; 656 657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 659 660 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 663 664 writel(reg_addr, pcie_index_offset); 665 readl(pcie_index_offset); 666 writel(reg_data, pcie_data_offset); 667 readl(pcie_data_offset); 668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 669 } 670 671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 672 u64 reg_addr, u32 reg_data) 673 { 674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 675 void __iomem *pcie_index_offset; 676 void __iomem *pcie_index_hi_offset; 677 void __iomem *pcie_data_offset; 678 679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 681 if (adev->nbio.funcs->get_pcie_index_hi_offset) 682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 683 else 684 pcie_index_hi = 0; 685 686 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 689 if (pcie_index_hi != 0) 690 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 691 pcie_index_hi * 4; 692 693 writel(reg_addr, pcie_index_offset); 694 readl(pcie_index_offset); 695 if (pcie_index_hi != 0) { 696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 697 readl(pcie_index_hi_offset); 698 } 699 writel(reg_data, pcie_data_offset); 700 readl(pcie_data_offset); 701 702 /* clear 
the high bits */ 703 if (pcie_index_hi != 0) { 704 writel(0, pcie_index_hi_offset); 705 readl(pcie_index_hi_offset); 706 } 707 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @reg_addr: indirect register offset 716 * @reg_data: indirect register data 717 * 718 */ 719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 720 u32 reg_addr, u64 reg_data) 721 { 722 unsigned long flags, pcie_index, pcie_data; 723 void __iomem *pcie_index_offset; 724 void __iomem *pcie_data_offset; 725 726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 728 729 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 732 733 /* write low 32 bits */ 734 writel(reg_addr, pcie_index_offset); 735 readl(pcie_index_offset); 736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 737 readl(pcie_data_offset); 738 /* write high 32 bits */ 739 writel(reg_addr + 4, pcie_index_offset); 740 readl(pcie_index_offset); 741 writel((u32)(reg_data >> 32), pcie_data_offset); 742 readl(pcie_data_offset); 743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 744 } 745 746 /** 747 * amdgpu_device_get_rev_id - query device rev_id 748 * 749 * @adev: amdgpu_device pointer 750 * 751 * Return device rev_id 752 */ 753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 754 { 755 return adev->nbio.funcs->get_rev_id(adev); 756 } 757 758 /** 759 * amdgpu_invalid_rreg - dummy reg read function 760 * 761 * @adev: amdgpu_device pointer 762 * @reg: offset of register 763 * 764 * Dummy register read function. Used for register blocks 765 * that certain asics don't have (all asics). 766 * Returns the value in the register. 767 */ 768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 769 { 770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 771 BUG(); 772 return 0; 773 } 774 775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 776 { 777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 778 BUG(); 779 return 0; 780 } 781 782 /** 783 * amdgpu_invalid_wreg - dummy reg write function 784 * 785 * @adev: amdgpu_device pointer 786 * @reg: offset of register 787 * @v: value to write to the register 788 * 789 * Dummy register read function. Used for register blocks 790 * that certain asics don't have (all asics). 791 */ 792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 793 { 794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 795 reg, v); 796 BUG(); 797 } 798 799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 800 { 801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 802 reg, v); 803 BUG(); 804 } 805 806 /** 807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: offset of register 811 * 812 * Dummy register read function. Used for register blocks 813 * that certain asics don't have (all asics). 814 * Returns the value in the register. 
815 */ 816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 817 { 818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 819 BUG(); 820 return 0; 821 } 822 823 /** 824 * amdgpu_invalid_wreg64 - dummy reg write function 825 * 826 * @adev: amdgpu_device pointer 827 * @reg: offset of register 828 * @v: value to write to the register 829 * 830 * Dummy register read function. Used for register blocks 831 * that certain asics don't have (all asics). 832 */ 833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 834 { 835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 836 reg, v); 837 BUG(); 838 } 839 840 /** 841 * amdgpu_block_invalid_rreg - dummy reg read function 842 * 843 * @adev: amdgpu_device pointer 844 * @block: offset of instance 845 * @reg: offset of register 846 * 847 * Dummy register read function. Used for register blocks 848 * that certain asics don't have (all asics). 849 * Returns the value in the register. 850 */ 851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 852 uint32_t block, uint32_t reg) 853 { 854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 855 reg, block); 856 BUG(); 857 return 0; 858 } 859 860 /** 861 * amdgpu_block_invalid_wreg - dummy reg write function 862 * 863 * @adev: amdgpu_device pointer 864 * @block: offset of instance 865 * @reg: offset of register 866 * @v: value to write to the register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 */ 871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 872 uint32_t block, 873 uint32_t reg, uint32_t v) 874 { 875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 876 reg, block, v); 877 BUG(); 878 } 879 880 /** 881 * amdgpu_device_asic_init - Wrapper for atom asic_init 882 * 883 * @adev: amdgpu_device pointer 884 * 885 * Does any asic specific work and then calls atom asic init. 886 */ 887 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 888 { 889 int ret; 890 891 amdgpu_asic_pre_asic_init(adev); 892 893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) { 895 amdgpu_psp_wait_for_bootloader(adev); 896 ret = amdgpu_atomfirmware_asic_init(adev, true); 897 return ret; 898 } else { 899 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 900 } 901 902 return 0; 903 } 904 905 /** 906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Allocates a scratch page of VRAM for use by various things in the 911 * driver. 912 */ 913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 914 { 915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 916 AMDGPU_GEM_DOMAIN_VRAM | 917 AMDGPU_GEM_DOMAIN_GTT, 918 &adev->mem_scratch.robj, 919 &adev->mem_scratch.gpu_addr, 920 (void **)&adev->mem_scratch.ptr); 921 } 922 923 /** 924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 925 * 926 * @adev: amdgpu_device pointer 927 * 928 * Frees the VRAM scratch page. 929 */ 930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 931 { 932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 933 } 934 935 /** 936 * amdgpu_device_program_register_sequence - program an array of registers. 
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
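 *
 * Each writeback slot is 256 bits (8 dwords) wide, which is why the buffer is
 * sized as AMDGPU_MAX_WB * sizeof(uint32_t) * 8 and why amdgpu_device_wb_get()
 * converts the slot index to a dword offset with a shift by 3.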
1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1073 1074 if (offset < adev->wb.num_wb) { 1075 __set_bit(offset, adev->wb.used); 1076 *wb = offset << 3; /* convert to dw offset */ 1077 return 0; 1078 } else { 1079 return -EINVAL; 1080 } 1081 } 1082 1083 /** 1084 * amdgpu_device_wb_free - Free a wb entry 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @wb: wb index 1088 * 1089 * Free a wb slot allocated for use by the driver (all asics) 1090 */ 1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1092 { 1093 wb >>= 3; 1094 if (wb < adev->wb.num_wb) 1095 __clear_bit(wb, adev->wb.used); 1096 } 1097 1098 /** 1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1104 * to fail, but if any of the BARs is not accessible after the size we abort 1105 * driver loading by returning -ENODEV. 1106 */ 1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1108 { 1109 #ifdef __linux__ 1110 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1111 struct pci_bus *root; 1112 struct resource *res; 1113 unsigned int i; 1114 u16 cmd; 1115 int r; 1116 1117 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1118 return 0; 1119 1120 /* Bypass for VF */ 1121 if (amdgpu_sriov_vf(adev)) 1122 return 0; 1123 1124 /* skip if the bios has already enabled large BAR */ 1125 if (adev->gmc.real_vram_size && 1126 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1127 return 0; 1128 1129 /* Check if the root BUS has 64bit memory resources */ 1130 root = adev->pdev->bus; 1131 while (root->parent) 1132 root = root->parent; 1133 1134 pci_bus_for_each_resource(root, res, i) { 1135 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1136 res->start > 0x100000000ull) 1137 break; 1138 } 1139 1140 /* Trying to resize is pointless without a root hub window above 4GB */ 1141 if (!res) 1142 return 0; 1143 1144 /* Limit the BAR size to what is available */ 1145 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1146 rbar_size); 1147 1148 /* Disable memory decoding while we change the BAR addresses and size */ 1149 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1150 pci_write_config_word(adev->pdev, PCI_COMMAND, 1151 cmd & ~PCI_COMMAND_MEMORY); 1152 1153 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1154 amdgpu_doorbell_fini(adev); 1155 if (adev->asic_type >= CHIP_BONAIRE) 1156 pci_release_resource(adev->pdev, 2); 1157 1158 pci_release_resource(adev->pdev, 0); 1159 1160 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1161 if (r == -ENOSPC) 1162 DRM_INFO("Not enough PCI address space for a large BAR."); 1163 else if (r && r != -ENOTSUPP) 1164 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1165 1166 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1167 1168 /* When the doorbell or fb BAR isn't available we have no chance of 1169 * using the device. 1170 */ 1171 r = amdgpu_doorbell_init(adev); 1172 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1173 return -ENODEV; 1174 1175 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1176 #endif /* __linux__ */ 1177 1178 return 0; 1179 } 1180 1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1182 { 1183 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1184 return false; 1185 1186 return true; 1187 } 1188 1189 /* 1190 * GPU helpers function. 1191 */ 1192 /** 1193 * amdgpu_device_need_post - check if the hw need post or not 1194 * 1195 * @adev: amdgpu_device pointer 1196 * 1197 * Check if the asic has been initialized (all asics) at driver startup 1198 * or post is needed if hw reset is performed. 1199 * Returns true if need or false if not. 1200 */ 1201 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1202 { 1203 uint32_t reg; 1204 1205 if (amdgpu_sriov_vf(adev)) 1206 return false; 1207 1208 if (!amdgpu_device_read_bios(adev)) 1209 return false; 1210 1211 if (amdgpu_passthrough(adev)) { 1212 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1213 * some old smc fw still need driver do vPost otherwise gpu hang, while 1214 * those smc fw version above 22.15 doesn't have this flaw, so we force 1215 * vpost executed for smc version below 22.15 1216 */ 1217 if (adev->asic_type == CHIP_FIJI) { 1218 int err; 1219 uint32_t fw_ver; 1220 1221 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1222 /* force vPost if error occured */ 1223 if (err) 1224 return true; 1225 1226 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1227 if (fw_ver < 0x00160e00) 1228 return true; 1229 } 1230 } 1231 1232 /* Don't post if we need to reset whole hive on init */ 1233 if (adev->gmc.xgmi.pending_reset) 1234 return false; 1235 1236 if (adev->has_hw_reset) { 1237 adev->has_hw_reset = false; 1238 return true; 1239 } 1240 1241 /* bios scratch used on CIK+ */ 1242 if (adev->asic_type >= CHIP_BONAIRE) 1243 return amdgpu_atombios_scratch_need_asic_init(adev); 1244 1245 /* check MEM_SIZE for older asics */ 1246 reg = amdgpu_asic_get_config_memsize(adev); 1247 1248 if ((reg != 0) && (reg != 0xffffffff)) 1249 return false; 1250 1251 return true; 1252 } 1253 1254 /* 1255 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1256 * speed switching. Until we have confirmation from Intel that a specific host 1257 * supports it, it's safer that we keep it disabled for all. 
1258 * 1259 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1260 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1261 */ 1262 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1263 { 1264 #if IS_ENABLED(CONFIG_X86) 1265 #ifdef __linux__ 1266 struct cpuinfo_x86 *c = &cpu_data(0); 1267 1268 if (c->x86_vendor == X86_VENDOR_INTEL) 1269 #else 1270 if (strcmp(cpu_vendor, "GenuineIntel") == 0) 1271 #endif 1272 return false; 1273 #endif 1274 return true; 1275 } 1276 1277 /** 1278 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1279 * 1280 * @adev: amdgpu_device pointer 1281 * 1282 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1283 * be set for this device. 1284 * 1285 * Returns true if it should be used or false if not. 1286 */ 1287 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1288 { 1289 switch (amdgpu_aspm) { 1290 case -1: 1291 break; 1292 case 0: 1293 return false; 1294 case 1: 1295 return true; 1296 default: 1297 return false; 1298 } 1299 return pcie_aspm_enabled(adev->pdev); 1300 } 1301 1302 bool amdgpu_device_aspm_support_quirk(void) 1303 { 1304 #if IS_ENABLED(CONFIG_X86) 1305 struct cpu_info *ci = curcpu(); 1306 1307 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1308 #else 1309 return true; 1310 #endif 1311 } 1312 1313 /* if we get transitioned to only one device, take VGA back */ 1314 /** 1315 * amdgpu_device_vga_set_decode - enable/disable vga decode 1316 * 1317 * @pdev: PCI device pointer 1318 * @state: enable/disable vga decode 1319 * 1320 * Enable/disable vga decode (all asics). 1321 * Returns VGA resource flags. 1322 */ 1323 #ifdef notyet 1324 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1325 bool state) 1326 { 1327 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1328 1329 amdgpu_asic_set_vga_state(adev, state); 1330 if (state) 1331 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1332 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1333 else 1334 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1335 } 1336 #endif 1337 1338 /** 1339 * amdgpu_device_check_block_size - validate the vm block size 1340 * 1341 * @adev: amdgpu_device pointer 1342 * 1343 * Validates the vm block size specified via module parameter. 1344 * The vm block size defines number of bits in page table versus page directory, 1345 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1346 * page table and the remaining bits are in the page directory. 1347 */ 1348 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1349 { 1350 /* defines number of bits in page table versus page directory, 1351 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1352 * page table and the remaining bits are in the page directory 1353 */ 1354 if (amdgpu_vm_block_size == -1) 1355 return; 1356 1357 if (amdgpu_vm_block_size < 9) { 1358 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1359 amdgpu_vm_block_size); 1360 amdgpu_vm_block_size = -1; 1361 } 1362 } 1363 1364 /** 1365 * amdgpu_device_check_vm_size - validate the vm size 1366 * 1367 * @adev: amdgpu_device pointer 1368 * 1369 * Validates the vm size in GB specified via module parameter. 1370 * The VM size is the size of the GPU virtual memory space in GB. 
1371 */ 1372 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1373 { 1374 /* no need to check the default value */ 1375 if (amdgpu_vm_size == -1) 1376 return; 1377 1378 if (amdgpu_vm_size < 1) { 1379 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1380 amdgpu_vm_size); 1381 amdgpu_vm_size = -1; 1382 } 1383 } 1384 1385 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1386 { 1387 #ifdef __linux__ 1388 struct sysinfo si; 1389 #endif 1390 bool is_os_64 = (sizeof(void *) == 8); 1391 uint64_t total_memory; 1392 uint64_t dram_size_seven_GB = 0x1B8000000; 1393 uint64_t dram_size_three_GB = 0xB8000000; 1394 1395 if (amdgpu_smu_memory_pool_size == 0) 1396 return; 1397 1398 if (!is_os_64) { 1399 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1400 goto def_value; 1401 } 1402 #ifdef __linux__ 1403 si_meminfo(&si); 1404 total_memory = (uint64_t)si.totalram * si.mem_unit; 1405 #else 1406 total_memory = ptoa(physmem); 1407 #endif 1408 1409 if ((amdgpu_smu_memory_pool_size == 1) || 1410 (amdgpu_smu_memory_pool_size == 2)) { 1411 if (total_memory < dram_size_three_GB) 1412 goto def_value1; 1413 } else if ((amdgpu_smu_memory_pool_size == 4) || 1414 (amdgpu_smu_memory_pool_size == 8)) { 1415 if (total_memory < dram_size_seven_GB) 1416 goto def_value1; 1417 } else { 1418 DRM_WARN("Smu memory pool size not supported\n"); 1419 goto def_value; 1420 } 1421 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1422 1423 return; 1424 1425 def_value1: 1426 DRM_WARN("No enough system memory\n"); 1427 def_value: 1428 adev->pm.smu_prv_buffer_size = 0; 1429 } 1430 1431 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1432 { 1433 if (!(adev->flags & AMD_IS_APU) || 1434 adev->asic_type < CHIP_RAVEN) 1435 return 0; 1436 1437 switch (adev->asic_type) { 1438 case CHIP_RAVEN: 1439 if (adev->pdev->device == 0x15dd) 1440 adev->apu_flags |= AMD_APU_IS_RAVEN; 1441 if (adev->pdev->device == 0x15d8) 1442 adev->apu_flags |= AMD_APU_IS_PICASSO; 1443 break; 1444 case CHIP_RENOIR: 1445 if ((adev->pdev->device == 0x1636) || 1446 (adev->pdev->device == 0x164c)) 1447 adev->apu_flags |= AMD_APU_IS_RENOIR; 1448 else 1449 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1450 break; 1451 case CHIP_VANGOGH: 1452 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1453 break; 1454 case CHIP_YELLOW_CARP: 1455 break; 1456 case CHIP_CYAN_SKILLFISH: 1457 if ((adev->pdev->device == 0x13FE) || 1458 (adev->pdev->device == 0x143F)) 1459 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1460 break; 1461 default: 1462 break; 1463 } 1464 1465 return 0; 1466 } 1467 1468 /** 1469 * amdgpu_device_check_arguments - validate module params 1470 * 1471 * @adev: amdgpu_device pointer 1472 * 1473 * Validates certain module parameters and updates 1474 * the associated values used by the driver (all asics). 
1475 */ 1476 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1477 { 1478 if (amdgpu_sched_jobs < 4) { 1479 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1480 amdgpu_sched_jobs); 1481 amdgpu_sched_jobs = 4; 1482 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1483 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1484 amdgpu_sched_jobs); 1485 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1486 } 1487 1488 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1489 /* gart size must be greater or equal to 32M */ 1490 dev_warn(adev->dev, "gart size (%d) too small\n", 1491 amdgpu_gart_size); 1492 amdgpu_gart_size = -1; 1493 } 1494 1495 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1496 /* gtt size must be greater or equal to 32M */ 1497 dev_warn(adev->dev, "gtt size (%d) too small\n", 1498 amdgpu_gtt_size); 1499 amdgpu_gtt_size = -1; 1500 } 1501 1502 /* valid range is between 4 and 9 inclusive */ 1503 if (amdgpu_vm_fragment_size != -1 && 1504 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1505 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1506 amdgpu_vm_fragment_size = -1; 1507 } 1508 1509 if (amdgpu_sched_hw_submission < 2) { 1510 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1511 amdgpu_sched_hw_submission); 1512 amdgpu_sched_hw_submission = 2; 1513 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1514 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1515 amdgpu_sched_hw_submission); 1516 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1517 } 1518 1519 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1520 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1521 amdgpu_reset_method = -1; 1522 } 1523 1524 amdgpu_device_check_smu_prv_buffer_size(adev); 1525 1526 amdgpu_device_check_vm_size(adev); 1527 1528 amdgpu_device_check_block_size(adev); 1529 1530 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1531 1532 return 0; 1533 } 1534 1535 #ifdef __linux__ 1536 /** 1537 * amdgpu_switcheroo_set_state - set switcheroo state 1538 * 1539 * @pdev: pci dev pointer 1540 * @state: vga_switcheroo state 1541 * 1542 * Callback for the switcheroo driver. Suspends or resumes 1543 * the asics before or after it is powered up using ACPI methods. 
1544 */ 1545 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1546 enum vga_switcheroo_state state) 1547 { 1548 struct drm_device *dev = pci_get_drvdata(pdev); 1549 int r; 1550 1551 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1552 return; 1553 1554 if (state == VGA_SWITCHEROO_ON) { 1555 pr_info("switched on\n"); 1556 /* don't suspend or resume card normally */ 1557 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1558 1559 pci_set_power_state(pdev, PCI_D0); 1560 amdgpu_device_load_pci_state(pdev); 1561 r = pci_enable_device(pdev); 1562 if (r) 1563 DRM_WARN("pci_enable_device failed (%d)\n", r); 1564 amdgpu_device_resume(dev, true); 1565 1566 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1567 } else { 1568 pr_info("switched off\n"); 1569 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1570 amdgpu_device_suspend(dev, true); 1571 amdgpu_device_cache_pci_state(pdev); 1572 /* Shut down the device */ 1573 pci_disable_device(pdev); 1574 pci_set_power_state(pdev, PCI_D3cold); 1575 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1576 } 1577 } 1578 1579 /** 1580 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1581 * 1582 * @pdev: pci dev pointer 1583 * 1584 * Callback for the switcheroo driver. Check of the switcheroo 1585 * state can be changed. 1586 * Returns true if the state can be changed, false if not. 1587 */ 1588 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1589 { 1590 struct drm_device *dev = pci_get_drvdata(pdev); 1591 1592 /* 1593 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1594 * locking inversion with the driver load path. And the access here is 1595 * completely racy anyway. So don't bother with locking for now. 1596 */ 1597 return atomic_read(&dev->open_count) == 0; 1598 } 1599 #endif /* __linux__ */ 1600 1601 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1602 #ifdef notyet 1603 .set_gpu_state = amdgpu_switcheroo_set_state, 1604 .reprobe = NULL, 1605 .can_switch = amdgpu_switcheroo_can_switch, 1606 #endif 1607 }; 1608 1609 /** 1610 * amdgpu_device_ip_set_clockgating_state - set the CG state 1611 * 1612 * @dev: amdgpu_device pointer 1613 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1614 * @state: clockgating state (gate or ungate) 1615 * 1616 * Sets the requested clockgating state for all instances of 1617 * the hardware IP specified. 1618 * Returns the error code from the last instance. 1619 */ 1620 int amdgpu_device_ip_set_clockgating_state(void *dev, 1621 enum amd_ip_block_type block_type, 1622 enum amd_clockgating_state state) 1623 { 1624 struct amdgpu_device *adev = dev; 1625 int i, r = 0; 1626 1627 for (i = 0; i < adev->num_ip_blocks; i++) { 1628 if (!adev->ip_blocks[i].status.valid) 1629 continue; 1630 if (adev->ip_blocks[i].version->type != block_type) 1631 continue; 1632 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1633 continue; 1634 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1635 (void *)adev, state); 1636 if (r) 1637 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1638 adev->ip_blocks[i].version->funcs->name, r); 1639 } 1640 return r; 1641 } 1642 1643 /** 1644 * amdgpu_device_ip_set_powergating_state - set the PG state 1645 * 1646 * @dev: amdgpu_device pointer 1647 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1648 * @state: powergating state (gate or ungate) 1649 * 1650 * Sets the requested powergating state for all instances of 1651 * the hardware IP specified. 1652 * Returns the error code from the last instance. 1653 */ 1654 int amdgpu_device_ip_set_powergating_state(void *dev, 1655 enum amd_ip_block_type block_type, 1656 enum amd_powergating_state state) 1657 { 1658 struct amdgpu_device *adev = dev; 1659 int i, r = 0; 1660 1661 for (i = 0; i < adev->num_ip_blocks; i++) { 1662 if (!adev->ip_blocks[i].status.valid) 1663 continue; 1664 if (adev->ip_blocks[i].version->type != block_type) 1665 continue; 1666 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1667 continue; 1668 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1669 (void *)adev, state); 1670 if (r) 1671 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1672 adev->ip_blocks[i].version->funcs->name, r); 1673 } 1674 return r; 1675 } 1676 1677 /** 1678 * amdgpu_device_ip_get_clockgating_state - get the CG state 1679 * 1680 * @adev: amdgpu_device pointer 1681 * @flags: clockgating feature flags 1682 * 1683 * Walks the list of IPs on the device and updates the clockgating 1684 * flags for each IP. 1685 * Updates @flags with the feature flags for each hardware IP where 1686 * clockgating is enabled. 1687 */ 1688 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1689 u64 *flags) 1690 { 1691 int i; 1692 1693 for (i = 0; i < adev->num_ip_blocks; i++) { 1694 if (!adev->ip_blocks[i].status.valid) 1695 continue; 1696 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1697 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1698 } 1699 } 1700 1701 /** 1702 * amdgpu_device_ip_wait_for_idle - wait for idle 1703 * 1704 * @adev: amdgpu_device pointer 1705 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1706 * 1707 * Waits for the request hardware IP to be idle. 1708 * Returns 0 for success or a negative error code on failure. 1709 */ 1710 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1711 enum amd_ip_block_type block_type) 1712 { 1713 int i, r; 1714 1715 for (i = 0; i < adev->num_ip_blocks; i++) { 1716 if (!adev->ip_blocks[i].status.valid) 1717 continue; 1718 if (adev->ip_blocks[i].version->type == block_type) { 1719 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1720 if (r) 1721 return r; 1722 break; 1723 } 1724 } 1725 return 0; 1726 1727 } 1728 1729 /** 1730 * amdgpu_device_ip_is_idle - is the hardware IP idle 1731 * 1732 * @adev: amdgpu_device pointer 1733 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1734 * 1735 * Check if the hardware IP is idle or not. 1736 * Returns true if it the IP is idle, false if not. 1737 */ 1738 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1739 enum amd_ip_block_type block_type) 1740 { 1741 int i; 1742 1743 for (i = 0; i < adev->num_ip_blocks; i++) { 1744 if (!adev->ip_blocks[i].status.valid) 1745 continue; 1746 if (adev->ip_blocks[i].version->type == block_type) 1747 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1748 } 1749 return true; 1750 1751 } 1752 1753 /** 1754 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1755 * 1756 * @adev: amdgpu_device pointer 1757 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1758 * 1759 * Returns a pointer to the hardware IP block structure 1760 * if it exists for the asic, otherwise NULL. 
1761 */ 1762 struct amdgpu_ip_block * 1763 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1764 enum amd_ip_block_type type) 1765 { 1766 int i; 1767 1768 for (i = 0; i < adev->num_ip_blocks; i++) 1769 if (adev->ip_blocks[i].version->type == type) 1770 return &adev->ip_blocks[i]; 1771 1772 return NULL; 1773 } 1774 1775 /** 1776 * amdgpu_device_ip_block_version_cmp 1777 * 1778 * @adev: amdgpu_device pointer 1779 * @type: enum amd_ip_block_type 1780 * @major: major version 1781 * @minor: minor version 1782 * 1783 * return 0 if equal or greater 1784 * return 1 if smaller or the ip_block doesn't exist 1785 */ 1786 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1787 enum amd_ip_block_type type, 1788 u32 major, u32 minor) 1789 { 1790 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1791 1792 if (ip_block && ((ip_block->version->major > major) || 1793 ((ip_block->version->major == major) && 1794 (ip_block->version->minor >= minor)))) 1795 return 0; 1796 1797 return 1; 1798 } 1799 1800 /** 1801 * amdgpu_device_ip_block_add 1802 * 1803 * @adev: amdgpu_device pointer 1804 * @ip_block_version: pointer to the IP to add 1805 * 1806 * Adds the IP block driver information to the collection of IPs 1807 * on the asic. 1808 */ 1809 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1810 const struct amdgpu_ip_block_version *ip_block_version) 1811 { 1812 if (!ip_block_version) 1813 return -EINVAL; 1814 1815 switch (ip_block_version->type) { 1816 case AMD_IP_BLOCK_TYPE_VCN: 1817 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1818 return 0; 1819 break; 1820 case AMD_IP_BLOCK_TYPE_JPEG: 1821 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1822 return 0; 1823 break; 1824 default: 1825 break; 1826 } 1827 1828 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1829 ip_block_version->funcs->name); 1830 1831 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1832 1833 return 0; 1834 } 1835 1836 /** 1837 * amdgpu_device_enable_virtual_display - enable virtual display feature 1838 * 1839 * @adev: amdgpu_device pointer 1840 * 1841 * Enabled the virtual display feature if the user has enabled it via 1842 * the module parameter virtual_display. This feature provides a virtual 1843 * display hardware on headless boards or in virtualized environments. 1844 * This function parses and validates the configuration string specified by 1845 * the user and configues the virtual display configuration (number of 1846 * virtual connectors, crtcs, etc.) specified. 
1847 */ 1848 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1849 { 1850 adev->enable_virtual_display = false; 1851 1852 #ifdef notyet 1853 if (amdgpu_virtual_display) { 1854 const char *pci_address_name = pci_name(adev->pdev); 1855 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1856 1857 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1858 pciaddstr_tmp = pciaddstr; 1859 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1860 pciaddname = strsep(&pciaddname_tmp, ","); 1861 if (!strcmp("all", pciaddname) 1862 || !strcmp(pci_address_name, pciaddname)) { 1863 long num_crtc; 1864 int res = -1; 1865 1866 adev->enable_virtual_display = true; 1867 1868 if (pciaddname_tmp) 1869 res = kstrtol(pciaddname_tmp, 10, 1870 &num_crtc); 1871 1872 if (!res) { 1873 if (num_crtc < 1) 1874 num_crtc = 1; 1875 if (num_crtc > 6) 1876 num_crtc = 6; 1877 adev->mode_info.num_crtc = num_crtc; 1878 } else { 1879 adev->mode_info.num_crtc = 1; 1880 } 1881 break; 1882 } 1883 } 1884 1885 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1886 amdgpu_virtual_display, pci_address_name, 1887 adev->enable_virtual_display, adev->mode_info.num_crtc); 1888 1889 kfree(pciaddstr); 1890 } 1891 #endif 1892 } 1893 1894 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1895 { 1896 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1897 adev->mode_info.num_crtc = 1; 1898 adev->enable_virtual_display = true; 1899 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1900 adev->enable_virtual_display, adev->mode_info.num_crtc); 1901 } 1902 } 1903 1904 /** 1905 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1906 * 1907 * @adev: amdgpu_device pointer 1908 * 1909 * Parses the asic configuration parameters specified in the gpu info 1910 * firmware and makes them availale to the driver for use in configuring 1911 * the asic. 1912 * Returns 0 on success, -EINVAL on failure. 
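 *
 * Only used on ASICs that lack an IP discovery table; when
 * adev->mman.discovery_bin is present this function returns early without
 * loading any gpu_info firmware.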
1913 */ 1914 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1915 { 1916 const char *chip_name; 1917 char fw_name[40]; 1918 int err; 1919 const struct gpu_info_firmware_header_v1_0 *hdr; 1920 1921 adev->firmware.gpu_info_fw = NULL; 1922 1923 if (adev->mman.discovery_bin) 1924 return 0; 1925 1926 switch (adev->asic_type) { 1927 default: 1928 return 0; 1929 case CHIP_VEGA10: 1930 chip_name = "vega10"; 1931 break; 1932 case CHIP_VEGA12: 1933 chip_name = "vega12"; 1934 break; 1935 case CHIP_RAVEN: 1936 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1937 chip_name = "raven2"; 1938 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1939 chip_name = "picasso"; 1940 else 1941 chip_name = "raven"; 1942 break; 1943 case CHIP_ARCTURUS: 1944 chip_name = "arcturus"; 1945 break; 1946 case CHIP_NAVI12: 1947 chip_name = "navi12"; 1948 break; 1949 } 1950 1951 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1952 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1953 if (err) { 1954 dev_err(adev->dev, 1955 "Failed to get gpu_info firmware \"%s\"\n", 1956 fw_name); 1957 goto out; 1958 } 1959 1960 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1961 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1962 1963 switch (hdr->version_major) { 1964 case 1: 1965 { 1966 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1967 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1968 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1969 1970 /* 1971 * Should be droped when DAL no longer needs it. 1972 */ 1973 if (adev->asic_type == CHIP_NAVI12) 1974 goto parse_soc_bounding_box; 1975 1976 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1977 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1978 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1979 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1980 adev->gfx.config.max_texture_channel_caches = 1981 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1982 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1983 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1984 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1985 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1986 adev->gfx.config.double_offchip_lds_buf = 1987 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1988 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1989 adev->gfx.cu_info.max_waves_per_simd = 1990 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1991 adev->gfx.cu_info.max_scratch_slots_per_cu = 1992 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1993 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1994 if (hdr->version_minor >= 1) { 1995 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1996 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1997 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1998 adev->gfx.config.num_sc_per_sh = 1999 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2000 adev->gfx.config.num_packer_per_sc = 2001 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2002 } 2003 2004 parse_soc_bounding_box: 2005 /* 2006 * soc bounding box info is not integrated in disocovery table, 2007 * we always need to parse it from gpu info firmware if needed. 
2008 */ 2009 if (hdr->version_minor == 2) { 2010 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2011 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2012 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2013 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2014 } 2015 break; 2016 } 2017 default: 2018 dev_err(adev->dev, 2019 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2020 err = -EINVAL; 2021 goto out; 2022 } 2023 out: 2024 return err; 2025 } 2026 2027 /** 2028 * amdgpu_device_ip_early_init - run early init for hardware IPs 2029 * 2030 * @adev: amdgpu_device pointer 2031 * 2032 * Early initialization pass for hardware IPs. The hardware IPs that make 2033 * up each asic are discovered each IP's early_init callback is run. This 2034 * is the first stage in initializing the asic. 2035 * Returns 0 on success, negative error code on failure. 2036 */ 2037 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2038 { 2039 struct pci_dev *parent; 2040 int i, r; 2041 bool total; 2042 2043 amdgpu_device_enable_virtual_display(adev); 2044 2045 if (amdgpu_sriov_vf(adev)) { 2046 r = amdgpu_virt_request_full_gpu(adev, true); 2047 if (r) 2048 return r; 2049 } 2050 2051 switch (adev->asic_type) { 2052 #ifdef CONFIG_DRM_AMDGPU_SI 2053 case CHIP_VERDE: 2054 case CHIP_TAHITI: 2055 case CHIP_PITCAIRN: 2056 case CHIP_OLAND: 2057 case CHIP_HAINAN: 2058 adev->family = AMDGPU_FAMILY_SI; 2059 r = si_set_ip_blocks(adev); 2060 if (r) 2061 return r; 2062 break; 2063 #endif 2064 #ifdef CONFIG_DRM_AMDGPU_CIK 2065 case CHIP_BONAIRE: 2066 case CHIP_HAWAII: 2067 case CHIP_KAVERI: 2068 case CHIP_KABINI: 2069 case CHIP_MULLINS: 2070 if (adev->flags & AMD_IS_APU) 2071 adev->family = AMDGPU_FAMILY_KV; 2072 else 2073 adev->family = AMDGPU_FAMILY_CI; 2074 2075 r = cik_set_ip_blocks(adev); 2076 if (r) 2077 return r; 2078 break; 2079 #endif 2080 case CHIP_TOPAZ: 2081 case CHIP_TONGA: 2082 case CHIP_FIJI: 2083 case CHIP_POLARIS10: 2084 case CHIP_POLARIS11: 2085 case CHIP_POLARIS12: 2086 case CHIP_VEGAM: 2087 case CHIP_CARRIZO: 2088 case CHIP_STONEY: 2089 if (adev->flags & AMD_IS_APU) 2090 adev->family = AMDGPU_FAMILY_CZ; 2091 else 2092 adev->family = AMDGPU_FAMILY_VI; 2093 2094 r = vi_set_ip_blocks(adev); 2095 if (r) 2096 return r; 2097 break; 2098 default: 2099 r = amdgpu_discovery_set_ip_blocks(adev); 2100 if (r) 2101 return r; 2102 break; 2103 } 2104 2105 if (amdgpu_has_atpx() && 2106 (amdgpu_is_atpx_hybrid() || 2107 amdgpu_has_atpx_dgpu_power_cntl()) && 2108 ((adev->flags & AMD_IS_APU) == 0) && 2109 !dev_is_removable(&adev->pdev->dev)) 2110 adev->flags |= AMD_IS_PX; 2111 2112 if (!(adev->flags & AMD_IS_APU)) { 2113 #ifdef notyet 2114 parent = pcie_find_root_port(adev->pdev); 2115 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2116 #else 2117 adev->has_pr3 = false; 2118 #endif 2119 } 2120 2121 2122 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2123 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2124 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2125 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2126 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2127 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2128 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2129 2130 total = true; 2131 for (i = 0; i < adev->num_ip_blocks; i++) { 2132 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2133 DRM_WARN("disabled ip block: %d <%s>\n", 2134 i, adev->ip_blocks[i].version->funcs->name); 2135 adev->ip_blocks[i].status.valid = false; 2136 } else { 2137 if (adev->ip_blocks[i].version->funcs->early_init) { 2138 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2139 if (r == -ENOENT) { 2140 adev->ip_blocks[i].status.valid = false; 2141 } else if (r) { 2142 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2143 adev->ip_blocks[i].version->funcs->name, r); 2144 total = false; 2145 } else { 2146 adev->ip_blocks[i].status.valid = true; 2147 } 2148 } else { 2149 adev->ip_blocks[i].status.valid = true; 2150 } 2151 } 2152 /* get the vbios after the asic_funcs are set up */ 2153 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2154 r = amdgpu_device_parse_gpu_info_fw(adev); 2155 if (r) 2156 return r; 2157 2158 /* Read BIOS */ 2159 if (amdgpu_device_read_bios(adev)) { 2160 if (!amdgpu_get_bios(adev)) 2161 return -EINVAL; 2162 2163 r = amdgpu_atombios_init(adev); 2164 if (r) { 2165 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2166 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2167 return r; 2168 } 2169 } 2170 2171 /*get pf2vf msg info at it's earliest time*/ 2172 if (amdgpu_sriov_vf(adev)) 2173 amdgpu_virt_init_data_exchange(adev); 2174 2175 } 2176 } 2177 if (!total) 2178 return -ENODEV; 2179 2180 amdgpu_amdkfd_device_probe(adev); 2181 adev->cg_flags &= amdgpu_cg_mask; 2182 adev->pg_flags &= amdgpu_pg_mask; 2183 2184 return 0; 2185 } 2186 2187 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2188 { 2189 int i, r; 2190 2191 for (i = 0; i < adev->num_ip_blocks; i++) { 2192 if (!adev->ip_blocks[i].status.sw) 2193 continue; 2194 if (adev->ip_blocks[i].status.hw) 2195 continue; 2196 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2197 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2198 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2199 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2200 if (r) { 2201 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2202 adev->ip_blocks[i].version->funcs->name, r); 2203 return r; 2204 } 2205 adev->ip_blocks[i].status.hw = true; 2206 } 2207 } 2208 2209 return 0; 2210 } 2211 2212 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2213 { 2214 int i, r; 2215 2216 for (i = 0; i < adev->num_ip_blocks; i++) { 2217 if (!adev->ip_blocks[i].status.sw) 2218 continue; 2219 if (adev->ip_blocks[i].status.hw) 2220 continue; 2221 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2222 if (r) { 2223 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2224 adev->ip_blocks[i].version->funcs->name, r); 2225 return r; 2226 } 2227 adev->ip_blocks[i].status.hw = true; 2228 } 2229 2230 return 0; 2231 } 2232 2233 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2234 { 
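/*
 * Summary of the flow below (a descriptive note, not new behaviour): on
 * CHIP_VEGA10 and later the PSP block is either resumed (reset/suspend
 * path) or hw_init'ed (cold path) so that firmware can be loaded through
 * PSP; afterwards the SMU firmware is loaded unless running as an
 * SR-IOV VF, with Tonga as the exception.
 */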
2235 int r = 0; 2236 int i; 2237 uint32_t smu_version; 2238 2239 if (adev->asic_type >= CHIP_VEGA10) { 2240 for (i = 0; i < adev->num_ip_blocks; i++) { 2241 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2242 continue; 2243 2244 if (!adev->ip_blocks[i].status.sw) 2245 continue; 2246 2247 /* no need to do the fw loading again if already done*/ 2248 if (adev->ip_blocks[i].status.hw == true) 2249 break; 2250 2251 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2252 r = adev->ip_blocks[i].version->funcs->resume(adev); 2253 if (r) { 2254 DRM_ERROR("resume of IP block <%s> failed %d\n", 2255 adev->ip_blocks[i].version->funcs->name, r); 2256 return r; 2257 } 2258 } else { 2259 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2260 if (r) { 2261 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2262 adev->ip_blocks[i].version->funcs->name, r); 2263 return r; 2264 } 2265 } 2266 2267 adev->ip_blocks[i].status.hw = true; 2268 break; 2269 } 2270 } 2271 2272 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2273 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2274 2275 return r; 2276 } 2277 2278 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2279 { 2280 long timeout; 2281 int r, i; 2282 2283 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2284 struct amdgpu_ring *ring = adev->rings[i]; 2285 2286 /* No need to setup the GPU scheduler for rings that don't need it */ 2287 if (!ring || ring->no_scheduler) 2288 continue; 2289 2290 switch (ring->funcs->type) { 2291 case AMDGPU_RING_TYPE_GFX: 2292 timeout = adev->gfx_timeout; 2293 break; 2294 case AMDGPU_RING_TYPE_COMPUTE: 2295 timeout = adev->compute_timeout; 2296 break; 2297 case AMDGPU_RING_TYPE_SDMA: 2298 timeout = adev->sdma_timeout; 2299 break; 2300 default: 2301 timeout = adev->video_timeout; 2302 break; 2303 } 2304 2305 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2306 ring->num_hw_submission, 0, 2307 timeout, adev->reset_domain->wq, 2308 ring->sched_score, ring->name, 2309 adev->dev); 2310 if (r) { 2311 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2312 ring->name); 2313 return r; 2314 } 2315 } 2316 2317 amdgpu_xcp_update_partition_sched_list(adev); 2318 2319 return 0; 2320 } 2321 2322 2323 /** 2324 * amdgpu_device_ip_init - run init for hardware IPs 2325 * 2326 * @adev: amdgpu_device pointer 2327 * 2328 * Main initialization pass for hardware IPs. The list of all the hardware 2329 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2330 * are run. sw_init initializes the software state associated with each IP 2331 * and hw_init initializes the hardware associated with each IP. 2332 * Returns 0 on success, negative error code on failure. 
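 *
 * Rough ordering sketch (derived from the code below, offered only as a
 * reader aid): sw_init runs for every valid block; COMMON and GMC get their
 * hw_init early (the GMC step also sets up scratch memory, writeback and the
 * optional CSA); the remaining blocks are then brought up via
 * amdgpu_device_ip_hw_init_phase1(), amdgpu_device_fw_loading() and
 * amdgpu_device_ip_hw_init_phase2().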
2333 */ 2334 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2335 { 2336 int i, r; 2337 2338 r = amdgpu_ras_init(adev); 2339 if (r) 2340 return r; 2341 2342 for (i = 0; i < adev->num_ip_blocks; i++) { 2343 if (!adev->ip_blocks[i].status.valid) 2344 continue; 2345 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2346 if (r) { 2347 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2348 adev->ip_blocks[i].version->funcs->name, r); 2349 goto init_failed; 2350 } 2351 adev->ip_blocks[i].status.sw = true; 2352 2353 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2354 /* need to do common hw init early so everything is set up for gmc */ 2355 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2356 if (r) { 2357 DRM_ERROR("hw_init %d failed %d\n", i, r); 2358 goto init_failed; 2359 } 2360 adev->ip_blocks[i].status.hw = true; 2361 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2362 /* need to do gmc hw init early so we can allocate gpu mem */ 2363 /* Try to reserve bad pages early */ 2364 if (amdgpu_sriov_vf(adev)) 2365 amdgpu_virt_exchange_data(adev); 2366 2367 r = amdgpu_device_mem_scratch_init(adev); 2368 if (r) { 2369 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2370 goto init_failed; 2371 } 2372 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2373 if (r) { 2374 DRM_ERROR("hw_init %d failed %d\n", i, r); 2375 goto init_failed; 2376 } 2377 r = amdgpu_device_wb_init(adev); 2378 if (r) { 2379 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2380 goto init_failed; 2381 } 2382 adev->ip_blocks[i].status.hw = true; 2383 2384 /* right after GMC hw init, we create CSA */ 2385 if (adev->gfx.mcbp) { 2386 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2387 AMDGPU_GEM_DOMAIN_VRAM | 2388 AMDGPU_GEM_DOMAIN_GTT, 2389 AMDGPU_CSA_SIZE); 2390 if (r) { 2391 DRM_ERROR("allocate CSA failed %d\n", r); 2392 goto init_failed; 2393 } 2394 } 2395 } 2396 } 2397 2398 if (amdgpu_sriov_vf(adev)) 2399 amdgpu_virt_init_data_exchange(adev); 2400 2401 r = amdgpu_ib_pool_init(adev); 2402 if (r) { 2403 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2404 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2405 goto init_failed; 2406 } 2407 2408 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2409 if (r) 2410 goto init_failed; 2411 2412 r = amdgpu_device_ip_hw_init_phase1(adev); 2413 if (r) 2414 goto init_failed; 2415 2416 r = amdgpu_device_fw_loading(adev); 2417 if (r) 2418 goto init_failed; 2419 2420 r = amdgpu_device_ip_hw_init_phase2(adev); 2421 if (r) 2422 goto init_failed; 2423 2424 /* 2425 * retired pages will be loaded from eeprom and reserved here, 2426 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2427 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2428 * for I2C communication which only true at this point. 2429 * 2430 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2431 * failure from bad gpu situation and stop amdgpu init process 2432 * accordingly. For other failed cases, it will still release all 2433 * the resource and print error message, rather than returning one 2434 * negative value to upper level. 
2435 * 2436 * Note: theoretically, this should be called before all vram allocations 2437 * to protect retired page from abusing 2438 */ 2439 r = amdgpu_ras_recovery_init(adev); 2440 if (r) 2441 goto init_failed; 2442 2443 /** 2444 * In case of XGMI grab extra reference for reset domain for this device 2445 */ 2446 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2447 if (amdgpu_xgmi_add_device(adev) == 0) { 2448 if (!amdgpu_sriov_vf(adev)) { 2449 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2450 2451 if (WARN_ON(!hive)) { 2452 r = -ENOENT; 2453 goto init_failed; 2454 } 2455 2456 if (!hive->reset_domain || 2457 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2458 r = -ENOENT; 2459 amdgpu_put_xgmi_hive(hive); 2460 goto init_failed; 2461 } 2462 2463 /* Drop the early temporary reset domain we created for device */ 2464 amdgpu_reset_put_reset_domain(adev->reset_domain); 2465 adev->reset_domain = hive->reset_domain; 2466 amdgpu_put_xgmi_hive(hive); 2467 } 2468 } 2469 } 2470 2471 r = amdgpu_device_init_schedulers(adev); 2472 if (r) 2473 goto init_failed; 2474 2475 /* Don't init kfd if whole hive need to be reset during init */ 2476 if (!adev->gmc.xgmi.pending_reset) { 2477 kgd2kfd_init_zone_device(adev); 2478 amdgpu_amdkfd_device_init(adev); 2479 } 2480 2481 amdgpu_fru_get_product_info(adev); 2482 2483 init_failed: 2484 2485 return r; 2486 } 2487 2488 /** 2489 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2490 * 2491 * @adev: amdgpu_device pointer 2492 * 2493 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2494 * this function before a GPU reset. If the value is retained after a 2495 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2496 */ 2497 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2498 { 2499 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2500 } 2501 2502 /** 2503 * amdgpu_device_check_vram_lost - check if vram is valid 2504 * 2505 * @adev: amdgpu_device pointer 2506 * 2507 * Checks the reset magic value written to the gart pointer in VRAM. 2508 * The driver calls this after a GPU reset to see if the contents of 2509 * VRAM is lost or now. 2510 * returns true if vram is lost, false if not. 2511 */ 2512 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2513 { 2514 if (memcmp(adev->gart.ptr, adev->reset_magic, 2515 AMDGPU_RESET_MAGIC_NUM)) 2516 return true; 2517 2518 if (!amdgpu_in_reset(adev)) 2519 return false; 2520 2521 /* 2522 * For all ASICs with baco/mode1 reset, the VRAM is 2523 * always assumed to be lost. 2524 */ 2525 switch (amdgpu_asic_reset_method(adev)) { 2526 case AMD_RESET_METHOD_BACO: 2527 case AMD_RESET_METHOD_MODE1: 2528 return true; 2529 default: 2530 return false; 2531 } 2532 } 2533 2534 /** 2535 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2536 * 2537 * @adev: amdgpu_device pointer 2538 * @state: clockgating state (gate or ungate) 2539 * 2540 * The list of all the hardware IPs that make up the asic is walked and the 2541 * set_clockgating_state callbacks are run. 2542 * Late initialization pass enabling clockgating for hardware IPs. 2543 * Fini or suspend, pass disabling clockgating for hardware IPs. 2544 * Returns 0 on success, negative error code on failure. 
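 *
 * Typical call sites, as seen later in this file: late init gates clocks with
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE), while fini/suspend
 * ungate them with AMD_CG_STATE_UNGATE. Note the loop below walks the IP list
 * forward when gating and in reverse when ungating.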
2545 */ 2546 2547 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2548 enum amd_clockgating_state state) 2549 { 2550 int i, j, r; 2551 2552 if (amdgpu_emu_mode == 1) 2553 return 0; 2554 2555 for (j = 0; j < adev->num_ip_blocks; j++) { 2556 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2557 if (!adev->ip_blocks[i].status.late_initialized) 2558 continue; 2559 /* skip CG for GFX, SDMA on S0ix */ 2560 if (adev->in_s0ix && 2561 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2562 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2563 continue; 2564 /* skip CG for VCE/UVD, it's handled specially */ 2565 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2566 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2567 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2568 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2569 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2570 /* enable clockgating to save power */ 2571 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2572 state); 2573 if (r) { 2574 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2575 adev->ip_blocks[i].version->funcs->name, r); 2576 return r; 2577 } 2578 } 2579 } 2580 2581 return 0; 2582 } 2583 2584 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2585 enum amd_powergating_state state) 2586 { 2587 int i, j, r; 2588 2589 if (amdgpu_emu_mode == 1) 2590 return 0; 2591 2592 for (j = 0; j < adev->num_ip_blocks; j++) { 2593 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2594 if (!adev->ip_blocks[i].status.late_initialized) 2595 continue; 2596 /* skip PG for GFX, SDMA on S0ix */ 2597 if (adev->in_s0ix && 2598 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2599 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2600 continue; 2601 /* skip CG for VCE/UVD, it's handled specially */ 2602 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2603 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2604 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2605 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2606 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2607 /* enable powergating to save power */ 2608 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2609 state); 2610 if (r) { 2611 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2612 adev->ip_blocks[i].version->funcs->name, r); 2613 return r; 2614 } 2615 } 2616 } 2617 return 0; 2618 } 2619 2620 static int amdgpu_device_enable_mgpu_fan_boost(void) 2621 { 2622 struct amdgpu_gpu_instance *gpu_ins; 2623 struct amdgpu_device *adev; 2624 int i, ret = 0; 2625 2626 mutex_lock(&mgpu_info.mutex); 2627 2628 /* 2629 * MGPU fan boost feature should be enabled 2630 * only when there are two or more dGPUs in 2631 * the system 2632 */ 2633 if (mgpu_info.num_dgpu < 2) 2634 goto out; 2635 2636 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2637 gpu_ins = &(mgpu_info.gpu_ins[i]); 2638 adev = gpu_ins->adev; 2639 if (!(adev->flags & AMD_IS_APU) && 2640 !gpu_ins->mgpu_fan_enabled) { 2641 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2642 if (ret) 2643 break; 2644 2645 gpu_ins->mgpu_fan_enabled = 1; 2646 } 2647 } 2648 2649 out: 2650 mutex_unlock(&mgpu_info.mutex); 2651 2652 return ret; 2653 } 2654 2655 /** 2656 * amdgpu_device_ip_late_init - run late init for hardware IPs 2657 * 2658 * @adev: 
amdgpu_device pointer 2659 * 2660 * Late initialization pass for hardware IPs. The list of all the hardware 2661 * IPs that make up the asic is walked and the late_init callbacks are run. 2662 * late_init covers any special initialization that an IP requires 2663 * after all of the IPs have been initialized or something that needs to happen 2664 * late in the init process. 2665 * Returns 0 on success, negative error code on failure. 2666 */ 2667 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2668 { 2669 struct amdgpu_gpu_instance *gpu_instance; 2670 int i = 0, r; 2671 2672 for (i = 0; i < adev->num_ip_blocks; i++) { 2673 if (!adev->ip_blocks[i].status.hw) 2674 continue; 2675 if (adev->ip_blocks[i].version->funcs->late_init) { 2676 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2677 if (r) { 2678 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2679 adev->ip_blocks[i].version->funcs->name, r); 2680 return r; 2681 } 2682 } 2683 adev->ip_blocks[i].status.late_initialized = true; 2684 } 2685 2686 r = amdgpu_ras_late_init(adev); 2687 if (r) { 2688 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2689 return r; 2690 } 2691 2692 amdgpu_ras_set_error_query_ready(adev, true); 2693 2694 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2695 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2696 2697 amdgpu_device_fill_reset_magic(adev); 2698 2699 r = amdgpu_device_enable_mgpu_fan_boost(); 2700 if (r) 2701 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2702 2703 /* For passthrough configurations on Arcturus and Aldebaran, enable special handling of SBR */ 2704 if (amdgpu_passthrough(adev) && 2705 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2706 adev->asic_type == CHIP_ALDEBARAN)) 2707 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2708 2709 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2710 mutex_lock(&mgpu_info.mutex); 2711 2712 /* 2713 * Reset the device p-state to low, as it was booted with high. 2714 * 2715 * This should be performed only after all devices from the same 2716 * hive get initialized. 2717 * 2718 * However, the number of devices in the hive is not known in advance; 2719 * it is counted one by one as the devices initialize. 2720 * 2721 * So we wait until all XGMI-interlinked devices have initialized. 2722 * This may introduce some delay, as those devices may come from 2723 * different hives. But that should be OK.
2724 */ 2725 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2726 for (i = 0; i < mgpu_info.num_gpu; i++) { 2727 gpu_instance = &(mgpu_info.gpu_ins[i]); 2728 if (gpu_instance->adev->flags & AMD_IS_APU) 2729 continue; 2730 2731 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2732 AMDGPU_XGMI_PSTATE_MIN); 2733 if (r) { 2734 DRM_ERROR("pstate setting failed (%d).\n", r); 2735 break; 2736 } 2737 } 2738 } 2739 2740 mutex_unlock(&mgpu_info.mutex); 2741 } 2742 2743 return 0; 2744 } 2745 2746 /** 2747 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2748 * 2749 * @adev: amdgpu_device pointer 2750 * 2751 * For ASICs need to disable SMC first 2752 */ 2753 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2754 { 2755 int i, r; 2756 2757 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2758 return; 2759 2760 for (i = 0; i < adev->num_ip_blocks; i++) { 2761 if (!adev->ip_blocks[i].status.hw) 2762 continue; 2763 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2764 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2765 /* XXX handle errors */ 2766 if (r) { 2767 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2768 adev->ip_blocks[i].version->funcs->name, r); 2769 } 2770 adev->ip_blocks[i].status.hw = false; 2771 break; 2772 } 2773 } 2774 } 2775 2776 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2777 { 2778 int i, r; 2779 2780 for (i = 0; i < adev->num_ip_blocks; i++) { 2781 if (!adev->ip_blocks[i].version->funcs->early_fini) 2782 continue; 2783 2784 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2785 if (r) { 2786 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2787 adev->ip_blocks[i].version->funcs->name, r); 2788 } 2789 } 2790 2791 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2792 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2793 2794 amdgpu_amdkfd_suspend(adev, false); 2795 2796 /* Workaroud for ASICs need to disable SMC first */ 2797 amdgpu_device_smu_fini_early(adev); 2798 2799 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2800 if (!adev->ip_blocks[i].status.hw) 2801 continue; 2802 2803 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2804 /* XXX handle errors */ 2805 if (r) { 2806 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2807 adev->ip_blocks[i].version->funcs->name, r); 2808 } 2809 2810 adev->ip_blocks[i].status.hw = false; 2811 } 2812 2813 if (amdgpu_sriov_vf(adev)) { 2814 if (amdgpu_virt_release_full_gpu(adev, false)) 2815 DRM_ERROR("failed to release exclusive mode on fini\n"); 2816 } 2817 2818 return 0; 2819 } 2820 2821 /** 2822 * amdgpu_device_ip_fini - run fini for hardware IPs 2823 * 2824 * @adev: amdgpu_device pointer 2825 * 2826 * Main teardown pass for hardware IPs. The list of all the hardware 2827 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2828 * are run. hw_fini tears down the hardware associated with each IP 2829 * and sw_fini tears down any software state associated with each IP. 2830 * Returns 0 on success, negative error code on failure. 
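 *
 * As a reading aid (not additional behaviour): the loops below walk the IP
 * list in reverse creation order, and the GMC block's sw_fini pass is also
 * where the ucode BO, static CSA, writeback, scratch memory and IB pool are
 * released, mirroring what amdgpu_device_ip_init() allocated.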
2831 */ 2832 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2833 { 2834 int i, r; 2835 2836 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2837 amdgpu_virt_release_ras_err_handler_data(adev); 2838 2839 if (adev->gmc.xgmi.num_physical_nodes > 1) 2840 amdgpu_xgmi_remove_device(adev); 2841 2842 amdgpu_amdkfd_device_fini_sw(adev); 2843 2844 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2845 if (!adev->ip_blocks[i].status.sw) 2846 continue; 2847 2848 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2849 amdgpu_ucode_free_bo(adev); 2850 amdgpu_free_static_csa(&adev->virt.csa_obj); 2851 amdgpu_device_wb_fini(adev); 2852 amdgpu_device_mem_scratch_fini(adev); 2853 amdgpu_ib_pool_fini(adev); 2854 } 2855 2856 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2857 /* XXX handle errors */ 2858 if (r) { 2859 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2860 adev->ip_blocks[i].version->funcs->name, r); 2861 } 2862 adev->ip_blocks[i].status.sw = false; 2863 adev->ip_blocks[i].status.valid = false; 2864 } 2865 2866 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2867 if (!adev->ip_blocks[i].status.late_initialized) 2868 continue; 2869 if (adev->ip_blocks[i].version->funcs->late_fini) 2870 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2871 adev->ip_blocks[i].status.late_initialized = false; 2872 } 2873 2874 amdgpu_ras_fini(adev); 2875 2876 return 0; 2877 } 2878 2879 /** 2880 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2881 * 2882 * @work: work_struct. 2883 */ 2884 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2885 { 2886 struct amdgpu_device *adev = 2887 container_of(work, struct amdgpu_device, delayed_init_work.work); 2888 int r; 2889 2890 r = amdgpu_ib_ring_tests(adev); 2891 if (r) 2892 DRM_ERROR("ib ring test failed (%d).\n", r); 2893 } 2894 2895 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2896 { 2897 struct amdgpu_device *adev = 2898 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2899 2900 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2901 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2902 2903 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2904 adev->gfx.gfx_off_state = true; 2905 } 2906 2907 /** 2908 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2909 * 2910 * @adev: amdgpu_device pointer 2911 * 2912 * Main suspend function for hardware IPs. The list of all the hardware 2913 * IPs that make up the asic is walked, clockgating is disabled and the 2914 * suspend callbacks are run. suspend puts the hardware and software state 2915 * in each IP into a state suitable for suspend. 2916 * Returns 0 on success, negative error code on failure. 2917 */ 2918 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2919 { 2920 int i, r; 2921 2922 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2923 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2924 2925 /* 2926 * Per PMFW team's suggestion, driver needs to handle gfxoff 2927 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2928 * scenario. Add the missing df cstate disablement here. 
2929 */ 2930 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2931 dev_warn(adev->dev, "Failed to disallow df cstate"); 2932 2933 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2934 if (!adev->ip_blocks[i].status.valid) 2935 continue; 2936 2937 /* displays are handled separately */ 2938 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2939 continue; 2940 2941 /* XXX handle errors */ 2942 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2943 /* XXX handle errors */ 2944 if (r) { 2945 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2946 adev->ip_blocks[i].version->funcs->name, r); 2947 return r; 2948 } 2949 2950 adev->ip_blocks[i].status.hw = false; 2951 } 2952 2953 return 0; 2954 } 2955 2956 /** 2957 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2958 * 2959 * @adev: amdgpu_device pointer 2960 * 2961 * Main suspend function for hardware IPs. The list of all the hardware 2962 * IPs that make up the asic is walked, clockgating is disabled and the 2963 * suspend callbacks are run. suspend puts the hardware and software state 2964 * in each IP into a state suitable for suspend. 2965 * Returns 0 on success, negative error code on failure. 2966 */ 2967 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2968 { 2969 int i, r; 2970 2971 if (adev->in_s0ix) 2972 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2973 2974 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2975 if (!adev->ip_blocks[i].status.valid) 2976 continue; 2977 /* displays are handled in phase1 */ 2978 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2979 continue; 2980 /* PSP lost connection when err_event_athub occurs */ 2981 if (amdgpu_ras_intr_triggered() && 2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2983 adev->ip_blocks[i].status.hw = false; 2984 continue; 2985 } 2986 2987 /* skip unnecessary suspend if we do not initialize them yet */ 2988 if (adev->gmc.xgmi.pending_reset && 2989 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2990 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2991 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2993 adev->ip_blocks[i].status.hw = false; 2994 continue; 2995 } 2996 2997 /* skip suspend of gfx/mes and psp for S0ix 2998 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2999 * like at runtime. PSP is also part of the always on hardware 3000 * so no need to suspend it. 3001 */ 3002 if (adev->in_s0ix && 3003 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3006 continue; 3007 3008 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3009 if (adev->in_s0ix && 3010 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3011 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3012 continue; 3013 3014 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3015 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3016 * from this location and RLC Autoload automatically also gets loaded 3017 * from here based on PMFW -> PSP message during re-init sequence. 3018 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3019 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3020 */ 3021 if (amdgpu_in_reset(adev) && 3022 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3023 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3024 continue; 3025 3026 /* XXX handle errors */ 3027 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3028 /* XXX handle errors */ 3029 if (r) { 3030 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3031 adev->ip_blocks[i].version->funcs->name, r); 3032 } 3033 adev->ip_blocks[i].status.hw = false; 3034 /* handle putting the SMC in the appropriate state */ 3035 if (!amdgpu_sriov_vf(adev)) { 3036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3037 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3038 if (r) { 3039 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3040 adev->mp1_state, r); 3041 return r; 3042 } 3043 } 3044 } 3045 } 3046 3047 return 0; 3048 } 3049 3050 /** 3051 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3052 * 3053 * @adev: amdgpu_device pointer 3054 * 3055 * Main suspend function for hardware IPs. The list of all the hardware 3056 * IPs that make up the asic is walked, clockgating is disabled and the 3057 * suspend callbacks are run. suspend puts the hardware and software state 3058 * in each IP into a state suitable for suspend. 3059 * Returns 0 on success, negative error code on failure. 3060 */ 3061 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3062 { 3063 int r; 3064 3065 if (amdgpu_sriov_vf(adev)) { 3066 amdgpu_virt_fini_data_exchange(adev); 3067 amdgpu_virt_request_full_gpu(adev, false); 3068 } 3069 3070 r = amdgpu_device_ip_suspend_phase1(adev); 3071 if (r) 3072 return r; 3073 r = amdgpu_device_ip_suspend_phase2(adev); 3074 3075 if (amdgpu_sriov_vf(adev)) 3076 amdgpu_virt_release_full_gpu(adev, false); 3077 3078 return r; 3079 } 3080 3081 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3082 { 3083 int i, r; 3084 3085 static enum amd_ip_block_type ip_order[] = { 3086 AMD_IP_BLOCK_TYPE_COMMON, 3087 AMD_IP_BLOCK_TYPE_GMC, 3088 AMD_IP_BLOCK_TYPE_PSP, 3089 AMD_IP_BLOCK_TYPE_IH, 3090 }; 3091 3092 for (i = 0; i < adev->num_ip_blocks; i++) { 3093 int j; 3094 struct amdgpu_ip_block *block; 3095 3096 block = &adev->ip_blocks[i]; 3097 block->status.hw = false; 3098 3099 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3100 3101 if (block->version->type != ip_order[j] || 3102 !block->status.valid) 3103 continue; 3104 3105 r = block->version->funcs->hw_init(adev); 3106 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3107 if (r) 3108 return r; 3109 block->status.hw = true; 3110 } 3111 } 3112 3113 return 0; 3114 } 3115 3116 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3117 { 3118 int i, r; 3119 3120 static enum amd_ip_block_type ip_order[] = { 3121 AMD_IP_BLOCK_TYPE_SMC, 3122 AMD_IP_BLOCK_TYPE_DCE, 3123 AMD_IP_BLOCK_TYPE_GFX, 3124 AMD_IP_BLOCK_TYPE_SDMA, 3125 AMD_IP_BLOCK_TYPE_MES, 3126 AMD_IP_BLOCK_TYPE_UVD, 3127 AMD_IP_BLOCK_TYPE_VCE, 3128 AMD_IP_BLOCK_TYPE_VCN, 3129 AMD_IP_BLOCK_TYPE_JPEG 3130 }; 3131 3132 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3133 int j; 3134 struct amdgpu_ip_block *block; 3135 3136 for (j = 0; j < adev->num_ip_blocks; j++) { 3137 block = &adev->ip_blocks[j]; 3138 3139 if (block->version->type != ip_order[i] || 3140 !block->status.valid || 3141 block->status.hw) 3142 continue; 3143 3144 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3145 r = block->version->funcs->resume(adev); 3146 else 3147 r = block->version->funcs->hw_init(adev); 3148 3149 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3150 if (r) 3151 return r; 3152 block->status.hw = true; 3153 } 3154 } 3155 3156 return 0; 3157 } 3158 3159 /** 3160 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3161 * 3162 * @adev: amdgpu_device pointer 3163 * 3164 * First resume function for hardware IPs. The list of all the hardware 3165 * IPs that make up the asic is walked and the resume callbacks are run for 3166 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3167 * after a suspend and updates the software state as necessary. This 3168 * function is also used for restoring the GPU after a GPU reset. 3169 * Returns 0 on success, negative error code on failure. 3170 */ 3171 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3172 { 3173 int i, r; 3174 3175 for (i = 0; i < adev->num_ip_blocks; i++) { 3176 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3177 continue; 3178 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3179 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3180 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3181 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3182 3183 r = adev->ip_blocks[i].version->funcs->resume(adev); 3184 if (r) { 3185 DRM_ERROR("resume of IP block <%s> failed %d\n", 3186 adev->ip_blocks[i].version->funcs->name, r); 3187 return r; 3188 } 3189 adev->ip_blocks[i].status.hw = true; 3190 } 3191 } 3192 3193 return 0; 3194 } 3195 3196 /** 3197 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3198 * 3199 * @adev: amdgpu_device pointer 3200 * 3201 * First resume function for hardware IPs. The list of all the hardware 3202 * IPs that make up the asic is walked and the resume callbacks are run for 3203 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3204 * functional state after a suspend and updates the software state as 3205 * necessary. This function is also used for restoring the GPU after a GPU 3206 * reset. 3207 * Returns 0 on success, negative error code on failure. 3208 */ 3209 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3210 { 3211 int i, r; 3212 3213 for (i = 0; i < adev->num_ip_blocks; i++) { 3214 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3215 continue; 3216 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3217 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3218 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3220 continue; 3221 r = adev->ip_blocks[i].version->funcs->resume(adev); 3222 if (r) { 3223 DRM_ERROR("resume of IP block <%s> failed %d\n", 3224 adev->ip_blocks[i].version->funcs->name, r); 3225 return r; 3226 } 3227 adev->ip_blocks[i].status.hw = true; 3228 } 3229 3230 return 0; 3231 } 3232 3233 /** 3234 * amdgpu_device_ip_resume - run resume for hardware IPs 3235 * 3236 * @adev: amdgpu_device pointer 3237 * 3238 * Main resume function for hardware IPs. The hardware IPs 3239 * are split into two resume functions because they are 3240 * also used in recovering from a GPU reset and some additional 3241 * steps need to be take between them. In this case (S3/S4) they are 3242 * run sequentially. 3243 * Returns 0 on success, negative error code on failure. 
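 *
 * The sequence below is: amdgpu_device_ip_resume_phase1() (COMMON, GMC, IH,
 * plus PSP when running as an SR-IOV VF), then amdgpu_device_fw_loading(),
 * then amdgpu_device_ip_resume_phase2() for everything else. This is only a
 * summary of the calls that follow, not a separate contract.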
3244 */ 3245 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3246 { 3247 int r; 3248 3249 r = amdgpu_device_ip_resume_phase1(adev); 3250 if (r) 3251 return r; 3252 3253 r = amdgpu_device_fw_loading(adev); 3254 if (r) 3255 return r; 3256 3257 r = amdgpu_device_ip_resume_phase2(adev); 3258 3259 return r; 3260 } 3261 3262 /** 3263 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3264 * 3265 * @adev: amdgpu_device pointer 3266 * 3267 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3268 */ 3269 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3270 { 3271 if (amdgpu_sriov_vf(adev)) { 3272 if (adev->is_atom_fw) { 3273 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3274 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3275 } else { 3276 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3277 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3278 } 3279 3280 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3281 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3282 } 3283 } 3284 3285 /** 3286 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3287 * 3288 * @asic_type: AMD asic type 3289 * 3290 * Check if there is DC (new modesetting infrastructre) support for an asic. 3291 * returns true if DC has support, false if not. 3292 */ 3293 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3294 { 3295 switch (asic_type) { 3296 #ifdef CONFIG_DRM_AMDGPU_SI 3297 case CHIP_HAINAN: 3298 #endif 3299 case CHIP_TOPAZ: 3300 /* chips with no display hardware */ 3301 return false; 3302 #if defined(CONFIG_DRM_AMD_DC) 3303 case CHIP_TAHITI: 3304 case CHIP_PITCAIRN: 3305 case CHIP_VERDE: 3306 case CHIP_OLAND: 3307 /* 3308 * We have systems in the wild with these ASICs that require 3309 * LVDS and VGA support which is not supported with DC. 3310 * 3311 * Fallback to the non-DC driver here by default so as not to 3312 * cause regressions. 3313 */ 3314 #if defined(CONFIG_DRM_AMD_DC_SI) 3315 return amdgpu_dc > 0; 3316 #else 3317 return false; 3318 #endif 3319 case CHIP_BONAIRE: 3320 case CHIP_KAVERI: 3321 case CHIP_KABINI: 3322 case CHIP_MULLINS: 3323 /* 3324 * We have systems in the wild with these ASICs that require 3325 * VGA support which is not supported with DC. 3326 * 3327 * Fallback to the non-DC driver here by default so as not to 3328 * cause regressions. 
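 *
 * (Illustrative note, based on the "amdgpu_dc > 0" check just below: a user
 * can still opt these ASICs into DC explicitly, e.g. by booting with
 * amdgpu.dc=1.)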
3329 */ 3330 return amdgpu_dc > 0; 3331 default: 3332 return amdgpu_dc != 0; 3333 #else 3334 default: 3335 if (amdgpu_dc > 0) 3336 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3337 return false; 3338 #endif 3339 } 3340 } 3341 3342 /** 3343 * amdgpu_device_has_dc_support - check if dc is supported 3344 * 3345 * @adev: amdgpu_device pointer 3346 * 3347 * Returns true for supported, false for not supported 3348 */ 3349 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3350 { 3351 if (adev->enable_virtual_display || 3352 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3353 return false; 3354 3355 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3356 } 3357 3358 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3359 { 3360 struct amdgpu_device *adev = 3361 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3362 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3363 3364 /* It's a bug to not have a hive within this function */ 3365 if (WARN_ON(!hive)) 3366 return; 3367 3368 /* 3369 * Use task barrier to synchronize all xgmi reset works across the 3370 * hive. task_barrier_enter and task_barrier_exit will block 3371 * until all the threads running the xgmi reset works reach 3372 * those points. task_barrier_full will do both blocks. 3373 */ 3374 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3375 3376 task_barrier_enter(&hive->tb); 3377 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3378 3379 if (adev->asic_reset_res) 3380 goto fail; 3381 3382 task_barrier_exit(&hive->tb); 3383 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3384 3385 if (adev->asic_reset_res) 3386 goto fail; 3387 3388 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3389 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3390 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3391 } else { 3392 3393 task_barrier_full(&hive->tb); 3394 adev->asic_reset_res = amdgpu_asic_reset(adev); 3395 } 3396 3397 fail: 3398 if (adev->asic_reset_res) 3399 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3400 adev->asic_reset_res, adev_to_drm(adev)->unique); 3401 amdgpu_put_xgmi_hive(hive); 3402 } 3403 3404 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3405 { 3406 char *input = amdgpu_lockup_timeout; 3407 char *timeout_setting = NULL; 3408 int index = 0; 3409 long timeout; 3410 int ret = 0; 3411 3412 /* 3413 * By default timeout for non compute jobs is 10000 3414 * and 60000 for compute jobs. 3415 * In SR-IOV or passthrough mode, timeout for compute 3416 * jobs are 60000 by default. 3417 */ 3418 adev->gfx_timeout = msecs_to_jiffies(10000); 3419 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3420 if (amdgpu_sriov_vf(adev)) 3421 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3422 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3423 else 3424 adev->compute_timeout = msecs_to_jiffies(60000); 3425 3426 #ifdef notyet 3427 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3428 while ((timeout_setting = strsep(&input, ",")) && 3429 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3430 ret = kstrtol(timeout_setting, 0, &timeout); 3431 if (ret) 3432 return ret; 3433 3434 if (timeout == 0) { 3435 index++; 3436 continue; 3437 } else if (timeout < 0) { 3438 timeout = MAX_SCHEDULE_TIMEOUT; 3439 dev_warn(adev->dev, "lockup timeout disabled"); 3440 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3441 } else { 3442 timeout = msecs_to_jiffies(timeout); 3443 } 3444 3445 switch (index++) { 3446 case 0: 3447 adev->gfx_timeout = timeout; 3448 break; 3449 case 1: 3450 adev->compute_timeout = timeout; 3451 break; 3452 case 2: 3453 adev->sdma_timeout = timeout; 3454 break; 3455 case 3: 3456 adev->video_timeout = timeout; 3457 break; 3458 default: 3459 break; 3460 } 3461 } 3462 /* 3463 * There is only one value specified and 3464 * it should apply to all non-compute jobs. 3465 */ 3466 if (index == 1) { 3467 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3468 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3469 adev->compute_timeout = adev->gfx_timeout; 3470 } 3471 } 3472 #endif 3473 3474 return ret; 3475 } 3476 3477 /** 3478 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3479 * 3480 * @adev: amdgpu_device pointer 3481 * 3482 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3483 */ 3484 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3485 { 3486 #ifdef notyet 3487 struct iommu_domain *domain; 3488 3489 domain = iommu_get_domain_for_dev(adev->dev); 3490 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3491 #endif 3492 adev->ram_is_direct_mapped = true; 3493 } 3494 3495 static const struct attribute *amdgpu_dev_attributes[] = { 3496 &dev_attr_pcie_replay_count.attr, 3497 NULL 3498 }; 3499 3500 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3501 { 3502 if (amdgpu_mcbp == 1) 3503 adev->gfx.mcbp = true; 3504 else if (amdgpu_mcbp == 0) 3505 adev->gfx.mcbp = false; 3506 3507 if (amdgpu_sriov_vf(adev)) 3508 adev->gfx.mcbp = true; 3509 3510 if (adev->gfx.mcbp) 3511 DRM_INFO("MCBP is enabled\n"); 3512 } 3513 3514 /** 3515 * amdgpu_device_init - initialize the driver 3516 * 3517 * @adev: amdgpu_device pointer 3518 * @flags: driver flags 3519 * 3520 * Initializes the driver info and hw (all asics). 3521 * Returns 0 for success or an error on failure. 3522 * Called at driver startup. 
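 *
 * High-level order of the function below, offered only as a map for the
 * reader: argument checks and lock/work-queue setup, MMIO mapping, reset
 * domain creation, virtualization detection, IP early init, optional ASIC
 * reset and vBIOS posting, ATOM BIOS clock init, fence driver sw init, mode
 * config init, amdgpu_device_ip_init(), fence driver hw init, sysfs
 * registration and IP late init.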
3523 */ 3524 int amdgpu_device_init(struct amdgpu_device *adev, 3525 uint32_t flags) 3526 { 3527 struct drm_device *ddev = adev_to_drm(adev); 3528 struct pci_dev *pdev = adev->pdev; 3529 int r, i; 3530 bool px = false; 3531 u32 max_MBps; 3532 int tmp; 3533 3534 adev->shutdown = false; 3535 adev->flags = flags; 3536 3537 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3538 adev->asic_type = amdgpu_force_asic_type; 3539 else 3540 adev->asic_type = flags & AMD_ASIC_MASK; 3541 3542 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3543 if (amdgpu_emu_mode == 1) 3544 adev->usec_timeout *= 10; 3545 adev->gmc.gart_size = 512 * 1024 * 1024; 3546 adev->accel_working = false; 3547 adev->num_rings = 0; 3548 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3549 adev->mman.buffer_funcs = NULL; 3550 adev->mman.buffer_funcs_ring = NULL; 3551 adev->vm_manager.vm_pte_funcs = NULL; 3552 adev->vm_manager.vm_pte_num_scheds = 0; 3553 adev->gmc.gmc_funcs = NULL; 3554 adev->harvest_ip_mask = 0x0; 3555 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3556 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3557 3558 adev->smc_rreg = &amdgpu_invalid_rreg; 3559 adev->smc_wreg = &amdgpu_invalid_wreg; 3560 adev->pcie_rreg = &amdgpu_invalid_rreg; 3561 adev->pcie_wreg = &amdgpu_invalid_wreg; 3562 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3563 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3564 adev->pciep_rreg = &amdgpu_invalid_rreg; 3565 adev->pciep_wreg = &amdgpu_invalid_wreg; 3566 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3567 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3568 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3569 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3570 adev->didt_rreg = &amdgpu_invalid_rreg; 3571 adev->didt_wreg = &amdgpu_invalid_wreg; 3572 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3573 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3574 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3575 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3576 3577 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3578 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3579 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3580 3581 /* mutex initialization are all done here so we 3582 * can recall function without having locking issues 3583 */ 3584 rw_init(&adev->firmware.mutex, "agfw"); 3585 rw_init(&adev->pm.mutex, "agpm"); 3586 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3587 rw_init(&adev->srbm_mutex, "srbm"); 3588 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3589 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3590 rw_init(&adev->gfx.partition_mutex, "gfxpar"); 3591 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3592 rw_init(&adev->mn_lock, "agpumn"); 3593 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3594 hash_init(adev->mn_hash); 3595 rw_init(&adev->psp.mutex, "agpsp"); 3596 rw_init(&adev->notifier_lock, "agnf"); 3597 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3598 rw_init(&adev->benchmark_mutex, "agbm"); 3599 3600 amdgpu_device_init_apu_flags(adev); 3601 3602 r = amdgpu_device_check_arguments(adev); 3603 if (r) 3604 return r; 3605 3606 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3607 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3608 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3609 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3610 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3611 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3612 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3613 
mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3614 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3615 3616 INIT_LIST_HEAD(&adev->shadow_list); 3617 rw_init(&adev->shadow_list_lock, "sdwlst"); 3618 3619 INIT_LIST_HEAD(&adev->reset_list); 3620 3621 INIT_LIST_HEAD(&adev->ras_list); 3622 3623 INIT_DELAYED_WORK(&adev->delayed_init_work, 3624 amdgpu_device_delayed_init_work_handler); 3625 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3626 amdgpu_device_delay_enable_gfx_off); 3627 3628 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3629 3630 adev->gfx.gfx_off_req_count = 1; 3631 adev->gfx.gfx_off_residency = 0; 3632 adev->gfx.gfx_off_entrycount = 0; 3633 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3634 3635 atomic_set(&adev->throttling_logging_enabled, 1); 3636 /* 3637 * If throttling continues, logging will be performed every minute 3638 * to avoid log flooding. "-1" is subtracted since the thermal 3639 * throttling interrupt comes every second. Thus, the total logging 3640 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3641 * for the throttling interrupt) = 60 seconds. 3642 */ 3643 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3644 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3645 3646 #ifdef __linux__ 3647 /* Registers mapping */ 3648 /* TODO: block userspace mapping of io register */ 3649 if (adev->asic_type >= CHIP_BONAIRE) { 3650 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3651 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3652 } else { 3653 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3654 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3655 } 3656 #endif 3657 3658 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3659 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3660 3661 #ifdef __linux__ 3662 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3663 if (!adev->rmmio) 3664 return -ENOMEM; 3665 #endif 3666 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3667 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3668 3669 /* 3670 * The reset domain needs to be present early, before the XGMI hive (if any) 3671 * is discovered and initialized, so the reset sem and in_gpu reset flag can 3672 * be used early on during init and before calling RREG32.
3673 */ 3674 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3675 if (!adev->reset_domain) 3676 return -ENOMEM; 3677 3678 /* detect hw virtualization here */ 3679 amdgpu_detect_virtualization(adev); 3680 3681 amdgpu_device_get_pcie_info(adev); 3682 3683 r = amdgpu_device_get_job_timeout_settings(adev); 3684 if (r) { 3685 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3686 return r; 3687 } 3688 3689 /* early init functions */ 3690 r = amdgpu_device_ip_early_init(adev); 3691 if (r) 3692 return r; 3693 3694 amdgpu_device_set_mcbp(adev); 3695 3696 /* Get rid of things like offb */ 3697 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3698 if (r) 3699 return r; 3700 3701 /* Enable TMZ based on IP_VERSION */ 3702 amdgpu_gmc_tmz_set(adev); 3703 3704 amdgpu_gmc_noretry_set(adev); 3705 /* Need to get xgmi info early to decide the reset behavior*/ 3706 if (adev->gmc.xgmi.supported) { 3707 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3708 if (r) 3709 return r; 3710 } 3711 3712 /* enable PCIE atomic ops */ 3713 #ifdef notyet 3714 if (amdgpu_sriov_vf(adev)) { 3715 if (adev->virt.fw_reserve.p_pf2vf) 3716 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3717 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3718 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3719 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3720 * internal path natively support atomics, set have_atomics_support to true. 3721 */ 3722 } else if ((adev->flags & AMD_IS_APU) && 3723 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3724 adev->have_atomics_support = true; 3725 } else { 3726 adev->have_atomics_support = 3727 !pci_enable_atomic_ops_to_root(adev->pdev, 3728 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3729 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3730 } 3731 3732 if (!adev->have_atomics_support) 3733 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3734 #else 3735 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3736 * internal path natively support atomics, set have_atomics_support to true. 3737 */ 3738 if ((adev->flags & AMD_IS_APU) && 3739 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3740 adev->have_atomics_support = true; 3741 else 3742 adev->have_atomics_support = false; 3743 #endif 3744 3745 /* doorbell bar mapping and doorbell index init*/ 3746 amdgpu_doorbell_init(adev); 3747 3748 if (amdgpu_emu_mode == 1) { 3749 /* post the asic on emulation mode */ 3750 emu_soc_asic_init(adev); 3751 goto fence_driver_init; 3752 } 3753 3754 amdgpu_reset_init(adev); 3755 3756 /* detect if we are with an SRIOV vbios */ 3757 if (adev->bios) 3758 amdgpu_device_detect_sriov_bios(adev); 3759 3760 /* check if we need to reset the asic 3761 * E.g., driver was not cleanly unloaded previously, etc. 
3762 */ 3763 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3764 if (adev->gmc.xgmi.num_physical_nodes) { 3765 dev_info(adev->dev, "Pending hive reset.\n"); 3766 adev->gmc.xgmi.pending_reset = true; 3767 /* Only need to init necessary block for SMU to handle the reset */ 3768 for (i = 0; i < adev->num_ip_blocks; i++) { 3769 if (!adev->ip_blocks[i].status.valid) 3770 continue; 3771 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3774 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3775 DRM_DEBUG("IP %s disabled for hw_init.\n", 3776 adev->ip_blocks[i].version->funcs->name); 3777 adev->ip_blocks[i].status.hw = true; 3778 } 3779 } 3780 } else { 3781 tmp = amdgpu_reset_method; 3782 /* It should do a default reset when loading or reloading the driver, 3783 * regardless of the module parameter reset_method. 3784 */ 3785 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3786 r = amdgpu_asic_reset(adev); 3787 amdgpu_reset_method = tmp; 3788 if (r) { 3789 dev_err(adev->dev, "asic reset on init failed\n"); 3790 goto failed; 3791 } 3792 } 3793 } 3794 3795 /* Post card if necessary */ 3796 if (amdgpu_device_need_post(adev)) { 3797 if (!adev->bios) { 3798 dev_err(adev->dev, "no vBIOS found\n"); 3799 r = -EINVAL; 3800 goto failed; 3801 } 3802 DRM_INFO("GPU posting now...\n"); 3803 r = amdgpu_device_asic_init(adev); 3804 if (r) { 3805 dev_err(adev->dev, "gpu post error!\n"); 3806 goto failed; 3807 } 3808 } 3809 3810 if (adev->bios) { 3811 if (adev->is_atom_fw) { 3812 /* Initialize clocks */ 3813 r = amdgpu_atomfirmware_get_clock_info(adev); 3814 if (r) { 3815 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3816 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3817 goto failed; 3818 } 3819 } else { 3820 /* Initialize clocks */ 3821 r = amdgpu_atombios_get_clock_info(adev); 3822 if (r) { 3823 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3824 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3825 goto failed; 3826 } 3827 /* init i2c buses */ 3828 if (!amdgpu_device_has_dc_support(adev)) 3829 amdgpu_atombios_i2c_init(adev); 3830 } 3831 } 3832 3833 fence_driver_init: 3834 /* Fence driver */ 3835 r = amdgpu_fence_driver_sw_init(adev); 3836 if (r) { 3837 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3838 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3839 goto failed; 3840 } 3841 3842 /* init the mode config */ 3843 drm_mode_config_init(adev_to_drm(adev)); 3844 3845 r = amdgpu_device_ip_init(adev); 3846 if (r) { 3847 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3848 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3849 goto release_ras_con; 3850 } 3851 3852 amdgpu_fence_driver_hw_init(adev); 3853 3854 dev_info(adev->dev, 3855 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3856 adev->gfx.config.max_shader_engines, 3857 adev->gfx.config.max_sh_per_se, 3858 adev->gfx.config.max_cu_per_sh, 3859 adev->gfx.cu_info.number); 3860 3861 #ifdef __OpenBSD__ 3862 { 3863 const char *chip_name; 3864 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3865 int maj, min, rev; 3866 3867 switch (adev->asic_type) { 3868 case CHIP_RAVEN: 3869 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3870 chip_name = "RAVEN2"; 3871 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3872 chip_name = "PICASSO"; 3873 
else 3874 chip_name = "RAVEN"; 3875 break; 3876 case CHIP_RENOIR: 3877 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3878 chip_name = "RENOIR"; 3879 else 3880 chip_name = "GREEN_SARDINE"; 3881 break; 3882 default: 3883 chip_name = amdgpu_asic_name[adev->asic_type]; 3884 } 3885 3886 printf("%s: %s", adev->self.dv_xname, chip_name); 3887 /* show graphics/compute ip block version, not set on < GFX9 */ 3888 if (version) { 3889 maj = IP_VERSION_MAJ(version); 3890 min = IP_VERSION_MIN(version); 3891 rev = IP_VERSION_REV(version); 3892 printf(" GC %d.%d.%d", maj, min, rev); 3893 } 3894 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3895 } 3896 #endif 3897 3898 adev->accel_working = true; 3899 3900 amdgpu_vm_check_compute_bug(adev); 3901 3902 /* Initialize the buffer migration limit. */ 3903 if (amdgpu_moverate >= 0) 3904 max_MBps = amdgpu_moverate; 3905 else 3906 max_MBps = 8; /* Allow 8 MB/s. */ 3907 /* Get a log2 for easy divisions. */ 3908 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3909 3910 r = amdgpu_atombios_sysfs_init(adev); 3911 if (r) 3912 drm_err(&adev->ddev, 3913 "registering atombios sysfs failed (%d).\n", r); 3914 3915 r = amdgpu_pm_sysfs_init(adev); 3916 if (r) 3917 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3918 3919 r = amdgpu_ucode_sysfs_init(adev); 3920 if (r) { 3921 adev->ucode_sysfs_en = false; 3922 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3923 } else 3924 adev->ucode_sysfs_en = true; 3925 3926 /* 3927 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3928 * Otherwise the mgpu fan boost feature will be skipped because the 3929 * gpu instance count would be too low. 3930 */ 3931 amdgpu_register_gpu_instance(adev); 3932 3933 /* enable clockgating, etc. after ib tests, since some blocks require 3934 * explicit gating rather than handling it automatically. 3935 */ 3936 if (!adev->gmc.xgmi.pending_reset) { 3937 r = amdgpu_device_ip_late_init(adev); 3938 if (r) { 3939 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3940 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3941 goto release_ras_con; 3942 } 3943 /* must succeed.
*/ 3944 amdgpu_ras_resume(adev); 3945 queue_delayed_work(system_wq, &adev->delayed_init_work, 3946 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3947 } 3948 3949 if (amdgpu_sriov_vf(adev)) { 3950 amdgpu_virt_release_full_gpu(adev, true); 3951 flush_delayed_work(&adev->delayed_init_work); 3952 } 3953 3954 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3955 if (r) 3956 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3957 3958 amdgpu_fru_sysfs_init(adev); 3959 3960 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3961 r = amdgpu_pmu_init(adev); 3962 if (r) 3963 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3964 3965 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3966 if (amdgpu_device_cache_pci_state(adev->pdev)) 3967 pci_restore_state(pdev); 3968 3969 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3970 /* this will fail for cards that aren't VGA class devices, just 3971 * ignore it 3972 */ 3973 #ifdef notyet 3974 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3975 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3976 #endif 3977 3978 px = amdgpu_device_supports_px(ddev); 3979 3980 if (px || (!dev_is_removable(&adev->pdev->dev) && 3981 apple_gmux_detect(NULL, NULL))) 3982 vga_switcheroo_register_client(adev->pdev, 3983 &amdgpu_switcheroo_ops, px); 3984 3985 if (px) 3986 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3987 3988 if (adev->gmc.xgmi.pending_reset) 3989 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3990 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3991 3992 amdgpu_device_check_iommu_direct_map(adev); 3993 3994 return 0; 3995 3996 release_ras_con: 3997 if (amdgpu_sriov_vf(adev)) 3998 amdgpu_virt_release_full_gpu(adev, true); 3999 4000 /* failed in exclusive mode due to timeout */ 4001 if (amdgpu_sriov_vf(adev) && 4002 !amdgpu_sriov_runtime(adev) && 4003 amdgpu_virt_mmio_blocked(adev) && 4004 !amdgpu_virt_wait_reset(adev)) { 4005 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4006 /* Don't send request since VF is inactive. 
*/ 4007 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4008 adev->virt.ops = NULL; 4009 r = -EAGAIN; 4010 } 4011 amdgpu_release_ras_context(adev); 4012 4013 failed: 4014 amdgpu_vf_error_trans_all(adev); 4015 4016 return r; 4017 } 4018 4019 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4020 { 4021 STUB(); 4022 #ifdef notyet 4023 4024 /* Clear all CPU mappings pointing to this device */ 4025 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4026 #endif 4027 4028 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4029 amdgpu_doorbell_fini(adev); 4030 4031 #ifdef __linux__ 4032 iounmap(adev->rmmio); 4033 adev->rmmio = NULL; 4034 if (adev->mman.aper_base_kaddr) 4035 iounmap(adev->mman.aper_base_kaddr); 4036 adev->mman.aper_base_kaddr = NULL; 4037 #else 4038 if (adev->rmmio_size > 0) 4039 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4040 adev->rmmio_size); 4041 adev->rmmio_size = 0; 4042 adev->rmmio = NULL; 4043 if (adev->mman.aper_base_kaddr) 4044 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4045 adev->gmc.visible_vram_size); 4046 adev->mman.aper_base_kaddr = NULL; 4047 #endif 4048 4049 /* Memory manager related */ 4050 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4051 #ifdef __linux__ 4052 arch_phys_wc_del(adev->gmc.vram_mtrr); 4053 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4054 #else 4055 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4056 #endif 4057 } 4058 } 4059 4060 /** 4061 * amdgpu_device_fini_hw - tear down the driver 4062 * 4063 * @adev: amdgpu_device pointer 4064 * 4065 * Tear down the driver info (all asics). 4066 * Called at driver shutdown. 4067 */ 4068 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4069 { 4070 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4071 flush_delayed_work(&adev->delayed_init_work); 4072 adev->shutdown = true; 4073 4074 /* make sure IB test finished before entering exclusive mode 4075 * to avoid preemption on IB test 4076 */ 4077 if (amdgpu_sriov_vf(adev)) { 4078 amdgpu_virt_request_full_gpu(adev, false); 4079 amdgpu_virt_fini_data_exchange(adev); 4080 } 4081 4082 /* disable all interrupts */ 4083 amdgpu_irq_disable_all(adev); 4084 if (adev->mode_info.mode_config_initialized) { 4085 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4086 drm_helper_force_disable_all(adev_to_drm(adev)); 4087 else 4088 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4089 } 4090 amdgpu_fence_driver_hw_fini(adev); 4091 4092 if (adev->mman.initialized) 4093 drain_workqueue(adev->mman.bdev.wq); 4094 4095 if (adev->pm.sysfs_initialized) 4096 amdgpu_pm_sysfs_fini(adev); 4097 if (adev->ucode_sysfs_en) 4098 amdgpu_ucode_sysfs_fini(adev); 4099 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4100 amdgpu_fru_sysfs_fini(adev); 4101 4102 /* disable ras feature must before hw fini */ 4103 amdgpu_ras_pre_fini(adev); 4104 4105 amdgpu_device_ip_fini_early(adev); 4106 4107 amdgpu_irq_fini_hw(adev); 4108 4109 if (adev->mman.initialized) 4110 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4111 4112 amdgpu_gart_dummy_page_fini(adev); 4113 4114 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4115 amdgpu_device_unmap_mmio(adev); 4116 4117 } 4118 4119 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4120 { 4121 int idx; 4122 bool px; 4123 4124 amdgpu_fence_driver_sw_fini(adev); 4125 amdgpu_device_ip_fini(adev); 4126 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4127 adev->accel_working = false; 4128 
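/* release the reference held on any pending gang submission fence before tearing down the rest of the SW state */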
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4129 4130 amdgpu_reset_fini(adev); 4131 4132 /* free i2c buses */ 4133 if (!amdgpu_device_has_dc_support(adev)) 4134 amdgpu_i2c_fini(adev); 4135 4136 if (amdgpu_emu_mode != 1) 4137 amdgpu_atombios_fini(adev); 4138 4139 kfree(adev->bios); 4140 adev->bios = NULL; 4141 4142 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4143 4144 if (px || (!dev_is_removable(&adev->pdev->dev) && 4145 apple_gmux_detect(NULL, NULL))) 4146 vga_switcheroo_unregister_client(adev->pdev); 4147 4148 if (px) 4149 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4150 4151 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4152 vga_client_unregister(adev->pdev); 4153 4154 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4155 #ifdef __linux__ 4156 iounmap(adev->rmmio); 4157 adev->rmmio = NULL; 4158 #else 4159 if (adev->rmmio_size > 0) 4160 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4161 adev->rmmio_size); 4162 adev->rmmio_size = 0; 4163 adev->rmmio = NULL; 4164 #endif 4165 amdgpu_doorbell_fini(adev); 4166 drm_dev_exit(idx); 4167 } 4168 4169 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4170 amdgpu_pmu_fini(adev); 4171 if (adev->mman.discovery_bin) 4172 amdgpu_discovery_fini(adev); 4173 4174 amdgpu_reset_put_reset_domain(adev->reset_domain); 4175 adev->reset_domain = NULL; 4176 4177 kfree(adev->pci_state); 4178 4179 } 4180 4181 /** 4182 * amdgpu_device_evict_resources - evict device resources 4183 * @adev: amdgpu device object 4184 * 4185 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4186 * of the vram memory type. Mainly used for evicting device resources 4187 * at suspend time. 4188 * 4189 */ 4190 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4191 { 4192 int ret; 4193 4194 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4195 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4196 return 0; 4197 4198 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4199 if (ret) 4200 DRM_WARN("evicting device resources failed\n"); 4201 return ret; 4202 } 4203 4204 /* 4205 * Suspend & resume. 4206 */ 4207 /** 4208 * amdgpu_device_suspend - initiate device suspend 4209 * 4210 * @dev: drm dev pointer 4211 * @fbcon : notify the fbdev of suspend 4212 * 4213 * Puts the hw in the suspend state (all asics). 4214 * Returns 0 for success or an error on failure. 4215 * Called at driver suspend. 
4216 */ 4217 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4218 { 4219 struct amdgpu_device *adev = drm_to_adev(dev); 4220 int r = 0; 4221 4222 if (adev->shutdown) 4223 return 0; 4224 4225 #ifdef notyet 4226 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4227 return 0; 4228 #endif 4229 4230 adev->in_suspend = true; 4231 4232 /* Evict the majority of BOs before grabbing the full access */ 4233 r = amdgpu_device_evict_resources(adev); 4234 if (r) 4235 return r; 4236 4237 if (amdgpu_sriov_vf(adev)) { 4238 amdgpu_virt_fini_data_exchange(adev); 4239 r = amdgpu_virt_request_full_gpu(adev, false); 4240 if (r) 4241 return r; 4242 } 4243 4244 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4245 DRM_WARN("smart shift update failed\n"); 4246 4247 if (fbcon) 4248 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4249 4250 cancel_delayed_work_sync(&adev->delayed_init_work); 4251 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4252 4253 amdgpu_ras_suspend(adev); 4254 4255 amdgpu_device_ip_suspend_phase1(adev); 4256 4257 if (!adev->in_s0ix) 4258 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4259 4260 r = amdgpu_device_evict_resources(adev); 4261 if (r) 4262 return r; 4263 4264 amdgpu_fence_driver_hw_fini(adev); 4265 4266 amdgpu_device_ip_suspend_phase2(adev); 4267 4268 if (amdgpu_sriov_vf(adev)) 4269 amdgpu_virt_release_full_gpu(adev, false); 4270 4271 return 0; 4272 } 4273 4274 /** 4275 * amdgpu_device_resume - initiate device resume 4276 * 4277 * @dev: drm dev pointer 4278 * @fbcon : notify the fbdev of resume 4279 * 4280 * Bring the hw back to operating state (all asics). 4281 * Returns 0 for success or an error on failure. 4282 * Called at driver resume. 4283 */ 4284 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4285 { 4286 struct amdgpu_device *adev = drm_to_adev(dev); 4287 int r = 0; 4288 4289 if (amdgpu_sriov_vf(adev)) { 4290 r = amdgpu_virt_request_full_gpu(adev, true); 4291 if (r) 4292 return r; 4293 } 4294 4295 #ifdef notyet 4296 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4297 return 0; 4298 #endif 4299 4300 if (adev->in_s0ix) 4301 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4302 4303 /* post card */ 4304 if (amdgpu_device_need_post(adev)) { 4305 r = amdgpu_device_asic_init(adev); 4306 if (r) 4307 dev_err(adev->dev, "amdgpu asic init failed\n"); 4308 } 4309 4310 r = amdgpu_device_ip_resume(adev); 4311 4312 if (r) { 4313 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4314 goto exit; 4315 } 4316 amdgpu_fence_driver_hw_init(adev); 4317 4318 r = amdgpu_device_ip_late_init(adev); 4319 if (r) 4320 goto exit; 4321 4322 queue_delayed_work(system_wq, &adev->delayed_init_work, 4323 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4324 4325 if (!adev->in_s0ix) { 4326 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4327 if (r) 4328 goto exit; 4329 } 4330 4331 exit: 4332 if (amdgpu_sriov_vf(adev)) { 4333 amdgpu_virt_init_data_exchange(adev); 4334 amdgpu_virt_release_full_gpu(adev, true); 4335 } 4336 4337 if (r) 4338 return r; 4339 4340 /* Make sure IB tests flushed */ 4341 flush_delayed_work(&adev->delayed_init_work); 4342 4343 if (fbcon) 4344 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4345 4346 amdgpu_ras_resume(adev); 4347 4348 if (adev->mode_info.num_crtc) { 4349 /* 4350 * Most of the connector probing functions try to acquire runtime pm 4351 * refs to ensure that the GPU is powered on when connector polling is 4352 * performed. 
Since we're calling this from a runtime PM callback, 4353 * trying to acquire rpm refs will cause us to deadlock. 4354 * 4355 * Since we're guaranteed to be holding the rpm lock, it's safe to 4356 * temporarily disable the rpm helpers so this doesn't deadlock us. 4357 */ 4358 #if defined(CONFIG_PM) && defined(__linux__) 4359 dev->dev->power.disable_depth++; 4360 #endif 4361 if (!adev->dc_enabled) 4362 drm_helper_hpd_irq_event(dev); 4363 else 4364 drm_kms_helper_hotplug_event(dev); 4365 #if defined(CONFIG_PM) && defined(__linux__) 4366 dev->dev->power.disable_depth--; 4367 #endif 4368 } 4369 adev->in_suspend = false; 4370 4371 if (adev->enable_mes) 4372 amdgpu_mes_self_test(adev); 4373 4374 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4375 DRM_WARN("smart shift update failed\n"); 4376 4377 return 0; 4378 } 4379 4380 /** 4381 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4382 * 4383 * @adev: amdgpu_device pointer 4384 * 4385 * The list of all the hardware IPs that make up the asic is walked and 4386 * the check_soft_reset callbacks are run. check_soft_reset determines 4387 * if the asic is still hung or not. 4388 * Returns true if any of the IPs are still in a hung state, false if not. 4389 */ 4390 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4391 { 4392 int i; 4393 bool asic_hang = false; 4394 4395 if (amdgpu_sriov_vf(adev)) 4396 return true; 4397 4398 if (amdgpu_asic_need_full_reset(adev)) 4399 return true; 4400 4401 for (i = 0; i < adev->num_ip_blocks; i++) { 4402 if (!adev->ip_blocks[i].status.valid) 4403 continue; 4404 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4405 adev->ip_blocks[i].status.hang = 4406 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4407 if (adev->ip_blocks[i].status.hang) { 4408 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4409 asic_hang = true; 4410 } 4411 } 4412 return asic_hang; 4413 } 4414 4415 /** 4416 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4417 * 4418 * @adev: amdgpu_device pointer 4419 * 4420 * The list of all the hardware IPs that make up the asic is walked and the 4421 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4422 * handles any IP specific hardware or software state changes that are 4423 * necessary for a soft reset to succeed. 4424 * Returns 0 on success, negative error code on failure. 4425 */ 4426 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4427 { 4428 int i, r = 0; 4429 4430 for (i = 0; i < adev->num_ip_blocks; i++) { 4431 if (!adev->ip_blocks[i].status.valid) 4432 continue; 4433 if (adev->ip_blocks[i].status.hang && 4434 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4435 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4436 if (r) 4437 return r; 4438 } 4439 } 4440 4441 return 0; 4442 } 4443 4444 /** 4445 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4446 * 4447 * @adev: amdgpu_device pointer 4448 * 4449 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4450 * reset is necessary to recover. 4451 * Returns true if a full asic reset is required, false if not. 
4452 */ 4453 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4454 { 4455 int i; 4456 4457 if (amdgpu_asic_need_full_reset(adev)) 4458 return true; 4459 4460 for (i = 0; i < adev->num_ip_blocks; i++) { 4461 if (!adev->ip_blocks[i].status.valid) 4462 continue; 4463 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4464 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4465 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4466 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4467 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4468 if (adev->ip_blocks[i].status.hang) { 4469 dev_info(adev->dev, "Some block need full reset!\n"); 4470 return true; 4471 } 4472 } 4473 } 4474 return false; 4475 } 4476 4477 /** 4478 * amdgpu_device_ip_soft_reset - do a soft reset 4479 * 4480 * @adev: amdgpu_device pointer 4481 * 4482 * The list of all the hardware IPs that make up the asic is walked and the 4483 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4484 * IP specific hardware or software state changes that are necessary to soft 4485 * reset the IP. 4486 * Returns 0 on success, negative error code on failure. 4487 */ 4488 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4489 { 4490 int i, r = 0; 4491 4492 for (i = 0; i < adev->num_ip_blocks; i++) { 4493 if (!adev->ip_blocks[i].status.valid) 4494 continue; 4495 if (adev->ip_blocks[i].status.hang && 4496 adev->ip_blocks[i].version->funcs->soft_reset) { 4497 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4498 if (r) 4499 return r; 4500 } 4501 } 4502 4503 return 0; 4504 } 4505 4506 /** 4507 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4508 * 4509 * @adev: amdgpu_device pointer 4510 * 4511 * The list of all the hardware IPs that make up the asic is walked and the 4512 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4513 * handles any IP specific hardware or software state changes that are 4514 * necessary after the IP has been soft reset. 4515 * Returns 0 on success, negative error code on failure. 4516 */ 4517 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4518 { 4519 int i, r = 0; 4520 4521 for (i = 0; i < adev->num_ip_blocks; i++) { 4522 if (!adev->ip_blocks[i].status.valid) 4523 continue; 4524 if (adev->ip_blocks[i].status.hang && 4525 adev->ip_blocks[i].version->funcs->post_soft_reset) 4526 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4527 if (r) 4528 return r; 4529 } 4530 4531 return 0; 4532 } 4533 4534 /** 4535 * amdgpu_device_recover_vram - Recover some VRAM contents 4536 * 4537 * @adev: amdgpu_device pointer 4538 * 4539 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4540 * restore things like GPUVM page tables after a GPU reset where 4541 * the contents of VRAM might be lost. 4542 * 4543 * Returns: 4544 * 0 on success, negative error code on failure. 
4545 */ 4546 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4547 { 4548 struct dma_fence *fence = NULL, *next = NULL; 4549 struct amdgpu_bo *shadow; 4550 struct amdgpu_bo_vm *vmbo; 4551 long r = 1, tmo; 4552 4553 if (amdgpu_sriov_runtime(adev)) 4554 tmo = msecs_to_jiffies(8000); 4555 else 4556 tmo = msecs_to_jiffies(100); 4557 4558 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4559 mutex_lock(&adev->shadow_list_lock); 4560 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4561 /* If vm is compute context or adev is APU, shadow will be NULL */ 4562 if (!vmbo->shadow) 4563 continue; 4564 shadow = vmbo->shadow; 4565 4566 /* No need to recover an evicted BO */ 4567 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4568 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4569 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4570 continue; 4571 4572 r = amdgpu_bo_restore_shadow(shadow, &next); 4573 if (r) 4574 break; 4575 4576 if (fence) { 4577 tmo = dma_fence_wait_timeout(fence, false, tmo); 4578 dma_fence_put(fence); 4579 fence = next; 4580 if (tmo == 0) { 4581 r = -ETIMEDOUT; 4582 break; 4583 } else if (tmo < 0) { 4584 r = tmo; 4585 break; 4586 } 4587 } else { 4588 fence = next; 4589 } 4590 } 4591 mutex_unlock(&adev->shadow_list_lock); 4592 4593 if (fence) 4594 tmo = dma_fence_wait_timeout(fence, false, tmo); 4595 dma_fence_put(fence); 4596 4597 if (r < 0 || tmo <= 0) { 4598 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4599 return -EIO; 4600 } 4601 4602 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4603 return 0; 4604 } 4605 4606 4607 /** 4608 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4609 * 4610 * @adev: amdgpu_device pointer 4611 * @from_hypervisor: request from hypervisor 4612 * 4613 * do VF FLR and reinitialize Asic 4614 * return 0 means succeeded otherwise failed 4615 */ 4616 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4617 bool from_hypervisor) 4618 { 4619 int r; 4620 struct amdgpu_hive_info *hive = NULL; 4621 int retry_limit = 0; 4622 4623 retry: 4624 amdgpu_amdkfd_pre_reset(adev); 4625 4626 if (from_hypervisor) 4627 r = amdgpu_virt_request_full_gpu(adev, true); 4628 else 4629 r = amdgpu_virt_reset_gpu(adev); 4630 if (r) 4631 return r; 4632 amdgpu_irq_gpu_reset_resume_helper(adev); 4633 4634 /* some sw clean up VF needs to do before recover */ 4635 amdgpu_virt_post_reset(adev); 4636 4637 /* Resume IP prior to SMC */ 4638 r = amdgpu_device_ip_reinit_early_sriov(adev); 4639 if (r) 4640 goto error; 4641 4642 amdgpu_virt_init_data_exchange(adev); 4643 4644 r = amdgpu_device_fw_loading(adev); 4645 if (r) 4646 return r; 4647 4648 /* now we are okay to resume SMC/CP/SDMA */ 4649 r = amdgpu_device_ip_reinit_late_sriov(adev); 4650 if (r) 4651 goto error; 4652 4653 hive = amdgpu_get_xgmi_hive(adev); 4654 /* Update PSP FW topology after reset */ 4655 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4656 r = amdgpu_xgmi_update_topology(hive, adev); 4657 4658 if (hive) 4659 amdgpu_put_xgmi_hive(hive); 4660 4661 if (!r) { 4662 r = amdgpu_ib_ring_tests(adev); 4663 4664 amdgpu_amdkfd_post_reset(adev); 4665 } 4666 4667 error: 4668 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4669 amdgpu_inc_vram_lost(adev); 4670 r = amdgpu_device_recover_vram(adev); 4671 } 4672 amdgpu_virt_release_full_gpu(adev, true); 4673 4674 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4675 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4676 retry_limit++; 4677 goto 
retry; 4678 } else 4679 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4680 } 4681 4682 return r; 4683 } 4684 4685 /** 4686 * amdgpu_device_has_job_running - check if there is any job in mirror list 4687 * 4688 * @adev: amdgpu_device pointer 4689 * 4690 * check if there is any job in mirror list 4691 */ 4692 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4693 { 4694 int i; 4695 struct drm_sched_job *job; 4696 4697 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4698 struct amdgpu_ring *ring = adev->rings[i]; 4699 4700 if (!ring || !ring->sched.thread) 4701 continue; 4702 4703 spin_lock(&ring->sched.job_list_lock); 4704 job = list_first_entry_or_null(&ring->sched.pending_list, 4705 struct drm_sched_job, list); 4706 spin_unlock(&ring->sched.job_list_lock); 4707 if (job) 4708 return true; 4709 } 4710 return false; 4711 } 4712 4713 /** 4714 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4715 * 4716 * @adev: amdgpu_device pointer 4717 * 4718 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4719 * a hung GPU. 4720 */ 4721 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4722 { 4723 4724 if (amdgpu_gpu_recovery == 0) 4725 goto disabled; 4726 4727 /* Skip soft reset check in fatal error mode */ 4728 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4729 return true; 4730 4731 if (amdgpu_sriov_vf(adev)) 4732 return true; 4733 4734 if (amdgpu_gpu_recovery == -1) { 4735 switch (adev->asic_type) { 4736 #ifdef CONFIG_DRM_AMDGPU_SI 4737 case CHIP_VERDE: 4738 case CHIP_TAHITI: 4739 case CHIP_PITCAIRN: 4740 case CHIP_OLAND: 4741 case CHIP_HAINAN: 4742 #endif 4743 #ifdef CONFIG_DRM_AMDGPU_CIK 4744 case CHIP_KAVERI: 4745 case CHIP_KABINI: 4746 case CHIP_MULLINS: 4747 #endif 4748 case CHIP_CARRIZO: 4749 case CHIP_STONEY: 4750 case CHIP_CYAN_SKILLFISH: 4751 goto disabled; 4752 default: 4753 break; 4754 } 4755 } 4756 4757 return true; 4758 4759 disabled: 4760 dev_info(adev->dev, "GPU recovery disabled.\n"); 4761 return false; 4762 } 4763 4764 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4765 { 4766 u32 i; 4767 int ret = 0; 4768 4769 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4770 4771 dev_info(adev->dev, "GPU mode1 reset\n"); 4772 4773 /* disable BM */ 4774 pci_clear_master(adev->pdev); 4775 4776 amdgpu_device_cache_pci_state(adev->pdev); 4777 4778 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4779 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4780 ret = amdgpu_dpm_mode1_reset(adev); 4781 } else { 4782 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4783 ret = psp_gpu_reset(adev); 4784 } 4785 4786 if (ret) 4787 goto mode1_reset_failed; 4788 4789 amdgpu_device_load_pci_state(adev->pdev); 4790 ret = amdgpu_psp_wait_for_bootloader(adev); 4791 if (ret) 4792 goto mode1_reset_failed; 4793 4794 /* wait for asic to come out of reset */ 4795 for (i = 0; i < adev->usec_timeout; i++) { 4796 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4797 4798 if (memsize != 0xffffffff) 4799 break; 4800 udelay(1); 4801 } 4802 4803 if (i >= adev->usec_timeout) { 4804 ret = -ETIMEDOUT; 4805 goto mode1_reset_failed; 4806 } 4807 4808 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4809 4810 return 0; 4811 4812 mode1_reset_failed: 4813 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4814 return ret; 4815 } 4816 4817 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4818 struct amdgpu_reset_context *reset_context) 4819 { 4820 int i, r = 0; 4821 struct amdgpu_job *job = NULL; 4822 bool need_full_reset = 
4823 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4824 4825 if (reset_context->reset_req_dev == adev) 4826 job = reset_context->job; 4827 4828 if (amdgpu_sriov_vf(adev)) { 4829 /* stop the data exchange thread */ 4830 amdgpu_virt_fini_data_exchange(adev); 4831 } 4832 4833 amdgpu_fence_driver_isr_toggle(adev, true); 4834 4835 /* block all schedulers and reset given job's ring */ 4836 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4837 struct amdgpu_ring *ring = adev->rings[i]; 4838 4839 if (!ring || !ring->sched.thread) 4840 continue; 4841 4842 /* Clear job fence from fence drv to avoid force_completion 4843 * leave NULL and vm flush fence in fence drv 4844 */ 4845 amdgpu_fence_driver_clear_job_fences(ring); 4846 4847 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4848 amdgpu_fence_driver_force_completion(ring); 4849 } 4850 4851 amdgpu_fence_driver_isr_toggle(adev, false); 4852 4853 if (job && job->vm) 4854 drm_sched_increase_karma(&job->base); 4855 4856 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4857 /* If reset handler not implemented, continue; otherwise return */ 4858 if (r == -EOPNOTSUPP) 4859 r = 0; 4860 else 4861 return r; 4862 4863 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4864 if (!amdgpu_sriov_vf(adev)) { 4865 4866 if (!need_full_reset) 4867 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4868 4869 if (!need_full_reset && amdgpu_gpu_recovery && 4870 amdgpu_device_ip_check_soft_reset(adev)) { 4871 amdgpu_device_ip_pre_soft_reset(adev); 4872 r = amdgpu_device_ip_soft_reset(adev); 4873 amdgpu_device_ip_post_soft_reset(adev); 4874 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4875 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4876 need_full_reset = true; 4877 } 4878 } 4879 4880 if (need_full_reset) 4881 r = amdgpu_device_ip_suspend(adev); 4882 if (need_full_reset) 4883 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4884 else 4885 clear_bit(AMDGPU_NEED_FULL_RESET, 4886 &reset_context->flags); 4887 } 4888 4889 return r; 4890 } 4891 4892 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4893 { 4894 int i; 4895 4896 lockdep_assert_held(&adev->reset_domain->sem); 4897 4898 for (i = 0; i < adev->num_regs; i++) { 4899 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4900 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4901 adev->reset_dump_reg_value[i]); 4902 } 4903 4904 return 0; 4905 } 4906 4907 #ifdef CONFIG_DEV_COREDUMP 4908 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4909 size_t count, void *data, size_t datalen) 4910 { 4911 struct drm_printer p; 4912 struct amdgpu_device *adev = data; 4913 struct drm_print_iterator iter; 4914 int i; 4915 4916 iter.data = buffer; 4917 iter.offset = 0; 4918 iter.start = offset; 4919 iter.remain = count; 4920 4921 p = drm_coredump_printer(&iter); 4922 4923 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4924 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4925 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4926 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4927 if (adev->reset_task_info.pid) 4928 drm_printf(&p, "process_name: %s PID: %d\n", 4929 adev->reset_task_info.process_name, 4930 adev->reset_task_info.pid); 4931 4932 if (adev->reset_vram_lost) 4933 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4934 if (adev->num_regs) { 4935 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4936 4937 
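/* one line per register offset/value pair captured by amdgpu_reset_reg_dumps() at reset time */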
for (i = 0; i < adev->num_regs; i++) 4938 drm_printf(&p, "0x%08x: 0x%08x\n", 4939 adev->reset_dump_reg_list[i], 4940 adev->reset_dump_reg_value[i]); 4941 } 4942 4943 return count - iter.remain; 4944 } 4945 4946 static void amdgpu_devcoredump_free(void *data) 4947 { 4948 } 4949 4950 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4951 { 4952 struct drm_device *dev = adev_to_drm(adev); 4953 4954 ktime_get_ts64(&adev->reset_time); 4955 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4956 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4957 } 4958 #endif 4959 4960 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4961 struct amdgpu_reset_context *reset_context) 4962 { 4963 struct amdgpu_device *tmp_adev = NULL; 4964 bool need_full_reset, skip_hw_reset, vram_lost = false; 4965 int r = 0; 4966 bool gpu_reset_for_dev_remove = 0; 4967 4968 /* Try reset handler method first */ 4969 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4970 reset_list); 4971 amdgpu_reset_reg_dumps(tmp_adev); 4972 4973 reset_context->reset_device_list = device_list_handle; 4974 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4975 /* If reset handler not implemented, continue; otherwise return */ 4976 if (r == -EOPNOTSUPP) 4977 r = 0; 4978 else 4979 return r; 4980 4981 /* Reset handler not implemented, use the default method */ 4982 need_full_reset = 4983 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4984 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4985 4986 gpu_reset_for_dev_remove = 4987 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4988 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4989 4990 /* 4991 * ASIC reset has to be done on all XGMI hive nodes ASAP 4992 * to allow proper links negotiation in FW (within 1 sec) 4993 */ 4994 if (!skip_hw_reset && need_full_reset) { 4995 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4996 /* For XGMI run all resets in parallel to speed up the process */ 4997 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4998 tmp_adev->gmc.xgmi.pending_reset = false; 4999 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5000 r = -EALREADY; 5001 } else 5002 r = amdgpu_asic_reset(tmp_adev); 5003 5004 if (r) { 5005 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5006 r, adev_to_drm(tmp_adev)->unique); 5007 break; 5008 } 5009 } 5010 5011 /* For XGMI wait for all resets to complete before proceed */ 5012 if (!r) { 5013 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5014 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5015 flush_work(&tmp_adev->xgmi_reset_work); 5016 r = tmp_adev->asic_reset_res; 5017 if (r) 5018 break; 5019 } 5020 } 5021 } 5022 } 5023 5024 if (!r && amdgpu_ras_intr_triggered()) { 5025 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5026 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5027 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5028 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5029 } 5030 5031 amdgpu_ras_intr_cleared(); 5032 } 5033 5034 /* Since the mode1 reset affects base ip blocks, the 5035 * phase1 ip blocks need to be resumed. Otherwise there 5036 * will be a BIOS signature error and the psp bootloader 5037 * can't load kdb on the next amdgpu install. 
5038 */ 5039 if (gpu_reset_for_dev_remove) { 5040 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5041 amdgpu_device_ip_resume_phase1(tmp_adev); 5042 5043 goto end; 5044 } 5045 5046 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5047 if (need_full_reset) { 5048 /* post card */ 5049 r = amdgpu_device_asic_init(tmp_adev); 5050 if (r) { 5051 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5052 } else { 5053 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5054 5055 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5056 if (r) 5057 goto out; 5058 5059 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5060 #ifdef CONFIG_DEV_COREDUMP 5061 tmp_adev->reset_vram_lost = vram_lost; 5062 memset(&tmp_adev->reset_task_info, 0, 5063 sizeof(tmp_adev->reset_task_info)); 5064 if (reset_context->job && reset_context->job->vm) 5065 tmp_adev->reset_task_info = 5066 reset_context->job->vm->task_info; 5067 amdgpu_reset_capture_coredumpm(tmp_adev); 5068 #endif 5069 if (vram_lost) { 5070 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5071 amdgpu_inc_vram_lost(tmp_adev); 5072 } 5073 5074 r = amdgpu_device_fw_loading(tmp_adev); 5075 if (r) 5076 return r; 5077 5078 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5079 if (r) 5080 goto out; 5081 5082 if (vram_lost) 5083 amdgpu_device_fill_reset_magic(tmp_adev); 5084 5085 /* 5086 * Add this ASIC back as tracked since the reset already 5087 * completed successfully. 5088 */ 5089 amdgpu_register_gpu_instance(tmp_adev); 5090 5091 if (!reset_context->hive && 5092 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5093 amdgpu_xgmi_add_device(tmp_adev); 5094 5095 r = amdgpu_device_ip_late_init(tmp_adev); 5096 if (r) 5097 goto out; 5098 5099 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5100 5101 /* 5102 * The GPU enters a bad state once the number of faulty pages 5103 * detected by ECC reaches the threshold, and RAS 5104 * recovery is scheduled next. So add one check 5105 * here to break recovery if the bad page threshold has 5106 * indeed been exceeded, and remind the user to 5107 * retire this GPU or set a bigger 5108 * bad_page_threshold value to fix this the next time 5109 * the driver is probed. 5110 */ 5111 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5112 /* must succeed.
*/ 5113 amdgpu_ras_resume(tmp_adev); 5114 } else { 5115 r = -EINVAL; 5116 goto out; 5117 } 5118 5119 /* Update PSP FW topology after reset */ 5120 if (reset_context->hive && 5121 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5122 r = amdgpu_xgmi_update_topology( 5123 reset_context->hive, tmp_adev); 5124 } 5125 } 5126 5127 out: 5128 if (!r) { 5129 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5130 r = amdgpu_ib_ring_tests(tmp_adev); 5131 if (r) { 5132 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5133 need_full_reset = true; 5134 r = -EAGAIN; 5135 goto end; 5136 } 5137 } 5138 5139 if (!r) 5140 r = amdgpu_device_recover_vram(tmp_adev); 5141 else 5142 tmp_adev->asic_reset_res = r; 5143 } 5144 5145 end: 5146 if (need_full_reset) 5147 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5148 else 5149 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5150 return r; 5151 } 5152 5153 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5154 { 5155 5156 switch (amdgpu_asic_reset_method(adev)) { 5157 case AMD_RESET_METHOD_MODE1: 5158 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5159 break; 5160 case AMD_RESET_METHOD_MODE2: 5161 adev->mp1_state = PP_MP1_STATE_RESET; 5162 break; 5163 default: 5164 adev->mp1_state = PP_MP1_STATE_NONE; 5165 break; 5166 } 5167 5168 pci_dev_put(p); 5169 } 5170 5171 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5172 { 5173 amdgpu_vf_error_trans_all(adev); 5174 adev->mp1_state = PP_MP1_STATE_NONE; 5175 } 5176 5177 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5178 { 5179 STUB(); 5180 #ifdef notyet 5181 struct pci_dev *p = NULL; 5182 5183 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5184 adev->pdev->bus->number, 1); 5185 if (p) { 5186 pm_runtime_enable(&(p->dev)); 5187 pm_runtime_resume(&(p->dev)); 5188 } 5189 #endif 5190 } 5191 5192 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5193 { 5194 enum amd_reset_method reset_method; 5195 struct pci_dev *p = NULL; 5196 u64 expires; 5197 5198 /* 5199 * For now, only BACO and mode1 reset are confirmed 5200 * to suffer the audio issue without proper suspended. 5201 */ 5202 reset_method = amdgpu_asic_reset_method(adev); 5203 if ((reset_method != AMD_RESET_METHOD_BACO) && 5204 (reset_method != AMD_RESET_METHOD_MODE1)) 5205 return -EINVAL; 5206 5207 STUB(); 5208 return -ENOSYS; 5209 #ifdef notyet 5210 5211 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5212 adev->pdev->bus->number, 1); 5213 if (!p) 5214 return -ENODEV; 5215 5216 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5217 if (!expires) 5218 /* 5219 * If we cannot get the audio device autosuspend delay, 5220 * a fixed 4S interval will be used. Considering 3S is 5221 * the audio controller default autosuspend delay setting. 5222 * 4S used here is guaranteed to cover that. 5223 */ 5224 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5225 5226 while (!pm_runtime_status_suspended(&(p->dev))) { 5227 if (!pm_runtime_suspend(&(p->dev))) 5228 break; 5229 5230 if (expires < ktime_get_mono_fast_ns()) { 5231 dev_warn(adev->dev, "failed to suspend display audio\n"); 5232 pci_dev_put(p); 5233 /* TODO: abort the succeeding gpu reset? 
*/ 5234 return -ETIMEDOUT; 5235 } 5236 } 5237 5238 pm_runtime_disable(&(p->dev)); 5239 5240 pci_dev_put(p); 5241 return 0; 5242 #endif 5243 } 5244 5245 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5246 { 5247 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5248 5249 #if defined(CONFIG_DEBUG_FS) 5250 if (!amdgpu_sriov_vf(adev)) 5251 cancel_work(&adev->reset_work); 5252 #endif 5253 5254 if (adev->kfd.dev) 5255 cancel_work(&adev->kfd.reset_work); 5256 5257 if (amdgpu_sriov_vf(adev)) 5258 cancel_work(&adev->virt.flr_work); 5259 5260 if (con && adev->ras_enabled) 5261 cancel_work(&con->recovery_work); 5262 5263 } 5264 5265 /** 5266 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5267 * 5268 * @adev: amdgpu_device pointer 5269 * @job: which job trigger hang 5270 * @reset_context: amdgpu reset context pointer 5271 * 5272 * Attempt to reset the GPU if it has hung (all asics). 5273 * Attempt to do soft-reset or full-reset and reinitialize Asic 5274 * Returns 0 for success or an error on failure. 5275 */ 5276 5277 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5278 struct amdgpu_job *job, 5279 struct amdgpu_reset_context *reset_context) 5280 { 5281 struct list_head device_list, *device_list_handle = NULL; 5282 bool job_signaled = false; 5283 struct amdgpu_hive_info *hive = NULL; 5284 struct amdgpu_device *tmp_adev = NULL; 5285 int i, r = 0; 5286 bool need_emergency_restart = false; 5287 bool audio_suspended = false; 5288 bool gpu_reset_for_dev_remove = false; 5289 5290 gpu_reset_for_dev_remove = 5291 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5292 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5293 5294 /* 5295 * Special case: RAS triggered and full reset isn't supported 5296 */ 5297 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5298 5299 /* 5300 * Flush RAM to disk so that after reboot 5301 * the user can read log and see why the system rebooted. 5302 */ 5303 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5304 amdgpu_ras_get_context(adev)->reboot) { 5305 DRM_WARN("Emergency reboot."); 5306 5307 #ifdef notyet 5308 ksys_sync_helper(); 5309 emergency_restart(); 5310 #else 5311 panic("emergency_restart"); 5312 #endif 5313 } 5314 5315 dev_info(adev->dev, "GPU %s begin!\n", 5316 need_emergency_restart ? "jobs stop":"reset"); 5317 5318 if (!amdgpu_sriov_vf(adev)) 5319 hive = amdgpu_get_xgmi_hive(adev); 5320 if (hive) 5321 mutex_lock(&hive->hive_lock); 5322 5323 reset_context->job = job; 5324 reset_context->hive = hive; 5325 /* 5326 * Build list of devices to reset. 5327 * In case we are in XGMI hive mode, resort the device list 5328 * to put adev in the 1st position. 
5329 */ 5330 INIT_LIST_HEAD(&device_list); 5331 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5332 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5333 list_add_tail(&tmp_adev->reset_list, &device_list); 5334 if (gpu_reset_for_dev_remove && adev->shutdown) 5335 tmp_adev->shutdown = true; 5336 } 5337 if (!list_is_first(&adev->reset_list, &device_list)) 5338 list_rotate_to_front(&adev->reset_list, &device_list); 5339 device_list_handle = &device_list; 5340 } else { 5341 list_add_tail(&adev->reset_list, &device_list); 5342 device_list_handle = &device_list; 5343 } 5344 5345 /* We need to lock reset domain only once both for XGMI and single device */ 5346 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5347 reset_list); 5348 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5349 5350 /* block all schedulers and reset given job's ring */ 5351 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5352 5353 amdgpu_device_set_mp1_state(tmp_adev); 5354 5355 /* 5356 * Try to put the audio codec into suspend state 5357 * before the gpu reset starts. 5358 * 5359 * The power domain of the graphics device 5360 * is shared with the AZ power domain. Without this, 5361 * we may change the audio hardware from behind 5362 * the audio driver's back. That will trigger 5363 * some audio codec errors. 5364 */ 5365 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5366 audio_suspended = true; 5367 5368 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5369 5370 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5371 5372 if (!amdgpu_sriov_vf(tmp_adev)) 5373 amdgpu_amdkfd_pre_reset(tmp_adev); 5374 5375 /* 5376 * Mark these ASICs to be reset as untracked first, 5377 * and add them back after the reset completes 5378 */ 5379 amdgpu_unregister_gpu_instance(tmp_adev); 5380 5381 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5382 5383 /* disable ras on ALL IPs */ 5384 if (!need_emergency_restart && 5385 amdgpu_device_ip_need_full_reset(tmp_adev)) 5386 amdgpu_ras_suspend(tmp_adev); 5387 5388 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5389 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5390 5391 if (!ring || !ring->sched.thread) 5392 continue; 5393 5394 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5395 5396 if (need_emergency_restart) 5397 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5398 } 5399 atomic_inc(&tmp_adev->gpu_reset_counter); 5400 } 5401 5402 if (need_emergency_restart) 5403 goto skip_sched_resume; 5404 5405 /* 5406 * Must check guilty signal here since after this point all old 5407 * HW fences are force signaled. 5408 * 5409 * job->base holds a reference to parent fence 5410 */ 5411 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5412 job_signaled = true; 5413 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5414 goto skip_hw_reset; 5415 } 5416 5417 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5418 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5419 if (gpu_reset_for_dev_remove) { 5420 /* Workaround for ASICs that need to disable SMC first */ 5421 amdgpu_device_smu_fini_early(tmp_adev); 5422 } 5423 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5424 /* TODO: should we stop? */ 5425 if (r) { 5426 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5427 r, adev_to_drm(tmp_adev)->unique); 5428 tmp_adev->asic_reset_res = r; 5429 } 5430 5431 /* 5432 * Drop all pending non scheduler resets.
Scheduler resets 5433 * were already dropped during drm_sched_stop 5434 */ 5435 amdgpu_device_stop_pending_resets(tmp_adev); 5436 } 5437 5438 /* Actual ASIC resets if needed. */ 5439 /* Host driver will handle XGMI hive reset for SRIOV */ 5440 if (amdgpu_sriov_vf(adev)) { 5441 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5442 if (r) 5443 adev->asic_reset_res = r; 5444 5445 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5446 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5447 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5448 amdgpu_ras_resume(adev); 5449 } else { 5450 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5451 if (r && r == -EAGAIN) 5452 goto retry; 5453 5454 if (!r && gpu_reset_for_dev_remove) 5455 goto recover_end; 5456 } 5457 5458 skip_hw_reset: 5459 5460 /* Post ASIC reset for all devs. */ 5461 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5462 5463 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5464 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5465 5466 if (!ring || !ring->sched.thread) 5467 continue; 5468 5469 drm_sched_start(&ring->sched, true); 5470 } 5471 5472 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5473 amdgpu_mes_self_test(tmp_adev); 5474 5475 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5476 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5477 5478 if (tmp_adev->asic_reset_res) 5479 r = tmp_adev->asic_reset_res; 5480 5481 tmp_adev->asic_reset_res = 0; 5482 5483 if (r) { 5484 /* bad news, how do we tell it to userspace? */ 5485 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5486 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5487 } else { 5488 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5489 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5490 DRM_WARN("smart shift update failed\n"); 5491 } 5492 } 5493 5494 skip_sched_resume: 5495 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5496 /* unlock kfd: SRIOV would do it separately */ 5497 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5498 amdgpu_amdkfd_post_reset(tmp_adev); 5499 5500 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5501 * so bring up kfd here if it was not initialized before 5502 */ 5503 if (!adev->kfd.init_complete) 5504 amdgpu_amdkfd_device_init(adev); 5505 5506 if (audio_suspended) 5507 amdgpu_device_resume_display_audio(tmp_adev); 5508 5509 amdgpu_device_unset_mp1_state(tmp_adev); 5510 5511 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5512 } 5513 5514 recover_end: 5515 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5516 reset_list); 5517 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5518 5519 if (hive) { 5520 mutex_unlock(&hive->hive_lock); 5521 amdgpu_put_xgmi_hive(hive); 5522 } 5523 5524 if (r) 5525 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5526 5527 atomic_set(&adev->reset_domain->reset_res, r); 5528 return r; 5529 } 5530 5531 /** 5532 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5533 * 5534 * @adev: amdgpu_device pointer 5535 * 5536 * Fetches and stores in the driver the PCIE capabilities (gen speed 5537 * and lanes) of the slot the device is in. Handles APUs and 5538 * virtualized environments where PCIE config space may not be available.
5539 */ 5540 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5541 { 5542 struct pci_dev *pdev; 5543 enum pci_bus_speed speed_cap, platform_speed_cap; 5544 enum pcie_link_width platform_link_width; 5545 5546 if (amdgpu_pcie_gen_cap) 5547 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5548 5549 if (amdgpu_pcie_lane_cap) 5550 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5551 5552 /* covers APUs as well */ 5553 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5554 if (adev->pm.pcie_gen_mask == 0) 5555 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5556 if (adev->pm.pcie_mlw_mask == 0) 5557 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5558 return; 5559 } 5560 5561 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5562 return; 5563 5564 pcie_bandwidth_available(adev->pdev, NULL, 5565 &platform_speed_cap, &platform_link_width); 5566 5567 if (adev->pm.pcie_gen_mask == 0) { 5568 /* asic caps */ 5569 pdev = adev->pdev; 5570 speed_cap = pcie_get_speed_cap(pdev); 5571 if (speed_cap == PCI_SPEED_UNKNOWN) { 5572 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5573 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5574 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5575 } else { 5576 if (speed_cap == PCIE_SPEED_32_0GT) 5577 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5578 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5579 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5580 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5581 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5582 else if (speed_cap == PCIE_SPEED_16_0GT) 5583 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5584 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5585 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5586 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5587 else if (speed_cap == PCIE_SPEED_8_0GT) 5588 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5589 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5590 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5591 else if (speed_cap == PCIE_SPEED_5_0GT) 5592 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5593 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5594 else 5595 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5596 } 5597 /* platform caps */ 5598 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5599 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5600 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5601 } else { 5602 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5603 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5604 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5605 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5606 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5607 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5608 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5609 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5610 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5611 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5612 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5613 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5614 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5615 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5616 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5617 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5618 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5619 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5620 else 5621 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5622 5623 } 5624 } 5625 if (adev->pm.pcie_mlw_mask == 0) { 5626 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5627 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5628 } else { 5629 switch (platform_link_width) { 5630 case PCIE_LNK_X32: 5631 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5632 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5633 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5634 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5635 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5636 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5637 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5638 break; 5639 case PCIE_LNK_X16: 5640 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5641 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5642 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5643 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5644 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5645 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5646 break; 5647 case PCIE_LNK_X12: 5648 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5649 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5650 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5651 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5652 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5653 break; 5654 case PCIE_LNK_X8: 5655 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5656 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5657 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5658 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5659 break; 5660 case PCIE_LNK_X4: 5661 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5662 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5663 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5664 break; 5665 case PCIE_LNK_X2: 5666 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5667 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5668 break; 5669 case PCIE_LNK_X1: 5670 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5671 break; 5672 default: 5673 break; 5674 } 5675 } 5676 } 5677 } 5678 5679 /** 5680 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5681 * 5682 * @adev: amdgpu_device pointer 5683 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5684 * 5685 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5686 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5687 * @peer_adev. 5688 */ 5689 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5690 struct amdgpu_device *peer_adev) 5691 { 5692 #ifdef CONFIG_HSA_AMD_P2P 5693 uint64_t address_mask = peer_adev->dev->dma_mask ? 
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
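 *
 * For reference only: in the Linux driver these four callbacks are normally
 * hooked into the PCI core through a &struct pci_error_handlers referenced
 * from the driver's &struct pci_driver. An illustrative sketch (not part of
 * this file) would look roughly like:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};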
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access. It should then be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
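 *
 * Illustrative caller sketch only (the wait-and-retry strategy below is an
 * assumption; a real caller may instead add the returned fence as a
 * dependency of its own job):
 *
 *	while ((old = amdgpu_device_switch_gang(adev, new_gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}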
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
			uint32_t inst, uint32_t reg_addr, char reg_name[],
			uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
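
#if 0
/*
 * Illustrative sketch only, kept out of the build: an example caller polling
 * a register with amdgpu_device_wait_on_rreg().  The instance, offset,
 * register name and bit mask below are hypothetical and only demonstrate the
 * calling convention.
 */
static uint32_t example_wait_for_status(struct amdgpu_device *adev)
{
	/* Spin (bounded by adev->usec_timeout) until bit 0 reads back as 1. */
	return amdgpu_device_wait_on_rreg(adev, 0, 0x1234,
					  "HYPOTHETICAL_STATUS", 0x1, 0x1);
}
#endif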