1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 101 static const struct drm_driver amdgpu_kms_driver; 102 103 const char *amdgpu_asic_name[] = { 104 "TAHITI", 105 "PITCAIRN", 106 "VERDE", 107 
"OLAND", 108 "HAINAN", 109 "BONAIRE", 110 "KAVERI", 111 "KABINI", 112 "HAWAII", 113 "MULLINS", 114 "TOPAZ", 115 "TONGA", 116 "FIJI", 117 "CARRIZO", 118 "STONEY", 119 "POLARIS10", 120 "POLARIS11", 121 "POLARIS12", 122 "VEGAM", 123 "VEGA10", 124 "VEGA12", 125 "VEGA20", 126 "RAVEN", 127 "ARCTURUS", 128 "RENOIR", 129 "ALDEBARAN", 130 "NAVI10", 131 "CYAN_SKILLFISH", 132 "NAVI14", 133 "NAVI12", 134 "SIENNA_CICHLID", 135 "NAVY_FLOUNDER", 136 "VANGOGH", 137 "DIMGREY_CAVEFISH", 138 "BEIGE_GOBY", 139 "YELLOW_CARP", 140 "IP DISCOVERY", 141 "LAST", 142 }; 143 144 /** 145 * DOC: pcie_replay_count 146 * 147 * The amdgpu driver provides a sysfs API for reporting the total number 148 * of PCIe replays (NAKs) 149 * The file pcie_replay_count is used for this and returns the total 150 * number of replays as a sum of the NAKs generated and NAKs received 151 */ 152 153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 154 struct device_attribute *attr, char *buf) 155 { 156 struct drm_device *ddev = dev_get_drvdata(dev); 157 struct amdgpu_device *adev = drm_to_adev(ddev); 158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 159 160 return sysfs_emit(buf, "%llu\n", cnt); 161 } 162 163 static DEVICE_ATTR(pcie_replay_count, 0444, 164 amdgpu_device_get_pcie_replay_count, NULL); 165 166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 167 168 169 /** 170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 171 * 172 * @dev: drm_device pointer 173 * 174 * Returns true if the device is a dGPU with ATPX power control, 175 * otherwise return false. 176 */ 177 bool amdgpu_device_supports_px(struct drm_device *dev) 178 { 179 struct amdgpu_device *adev = drm_to_adev(dev); 180 181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 182 return true; 183 return false; 184 } 185 186 /** 187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 188 * 189 * @dev: drm_device pointer 190 * 191 * Returns true if the device is a dGPU with ACPI power control, 192 * otherwise return false. 193 */ 194 bool amdgpu_device_supports_boco(struct drm_device *dev) 195 { 196 struct amdgpu_device *adev = drm_to_adev(dev); 197 198 if (adev->has_pr3 || 199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 200 return true; 201 return false; 202 } 203 204 /** 205 * amdgpu_device_supports_baco - Does the device support BACO 206 * 207 * @dev: drm_device pointer 208 * 209 * Returns true if the device supporte BACO, 210 * otherwise return false. 211 */ 212 bool amdgpu_device_supports_baco(struct drm_device *dev) 213 { 214 struct amdgpu_device *adev = drm_to_adev(dev); 215 216 return amdgpu_asic_supports_baco(adev); 217 } 218 219 /** 220 * amdgpu_device_supports_smart_shift - Is the device dGPU with 221 * smart shift support 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with Smart Shift support, 226 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
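 *
 * Illustrative usage sketch (not taken from this file): copy @size bytes out
 * of VRAM through the aperture and fall back to MM_INDEX/MM_DATA access for
 * any tail the visible aperture does not cover, which is essentially what
 * amdgpu_device_vram_access() below does:
 *
 *	size_t n = amdgpu_device_aper_access(adev, pos, buf, size, false);
 *	if (n < size)
 *		amdgpu_device_mm_access(adev, pos + n, (char *)buf + n,
 *					size - n, false);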
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
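 *
 * Note: most code does not call this helper directly; reads normally go
 * through the RREG32()/RREG32_NO_KIQ() style macros, which wrap this
 * function. Illustrative sketch only:
 *
 *	u32 val = amdgpu_device_rreg(adev, mmMM_INDEX, 0);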
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
481 */ 482 void amdgpu_device_wreg(struct amdgpu_device *adev, 483 uint32_t reg, uint32_t v, 484 uint32_t acc_flags) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if ((reg * 4) < adev->rmmio_size) { 490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 491 amdgpu_sriov_runtime(adev) && 492 down_read_trylock(&adev->reset_domain->sem)) { 493 amdgpu_kiq_wreg(adev, reg, v); 494 up_read(&adev->reset_domain->sem); 495 } else { 496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 497 } 498 } else { 499 adev->pcie_wreg(adev, reg * 4, v); 500 } 501 502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 503 } 504 505 /** 506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 507 * 508 * @adev: amdgpu_device pointer 509 * @reg: mmio/rlc register 510 * @v: value to write 511 * 512 * this function is invoked only for the debugfs register access 513 */ 514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 515 uint32_t reg, uint32_t v, 516 uint32_t xcc_id) 517 { 518 if (amdgpu_device_skip_hw_access(adev)) 519 return; 520 521 if (amdgpu_sriov_fullaccess(adev) && 522 adev->gfx.rlc.funcs && 523 adev->gfx.rlc.funcs->is_rlcg_access_range) { 524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 526 } else if ((reg * 4) >= adev->rmmio_size) { 527 adev->pcie_wreg(adev, reg * 4, v); 528 } else { 529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 530 } 531 } 532 533 /** 534 * amdgpu_device_indirect_rreg - read an indirect register 535 * 536 * @adev: amdgpu_device pointer 537 * @reg_addr: indirect register address to read from 538 * 539 * Returns the value of indirect register @reg_addr 540 */ 541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 542 u32 reg_addr) 543 { 544 unsigned long flags, pcie_index, pcie_data; 545 void __iomem *pcie_index_offset; 546 void __iomem *pcie_data_offset; 547 u32 r; 548 549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 551 552 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 555 556 writel(reg_addr, pcie_index_offset); 557 readl(pcie_index_offset); 558 r = readl(pcie_data_offset); 559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 560 561 return r; 562 } 563 564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 565 u64 reg_addr) 566 { 567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 568 u32 r; 569 void __iomem *pcie_index_offset; 570 void __iomem *pcie_index_hi_offset; 571 void __iomem *pcie_data_offset; 572 573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 575 if (adev->nbio.funcs->get_pcie_index_hi_offset) 576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 577 else 578 pcie_index_hi = 0; 579 580 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 583 if (pcie_index_hi != 0) 584 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 585 pcie_index_hi * 4; 586 587 writel(reg_addr, pcie_index_offset); 588 readl(pcie_index_offset); 589 if (pcie_index_hi != 0) { 590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 591 readl(pcie_index_hi_offset); 592 
} 593 r = readl(pcie_data_offset); 594 595 /* clear the high bits */ 596 if (pcie_index_hi != 0) { 597 writel(0, pcie_index_hi_offset); 598 readl(pcie_index_hi_offset); 599 } 600 601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 602 603 return r; 604 } 605 606 /** 607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 608 * 609 * @adev: amdgpu_device pointer 610 * @reg_addr: indirect register address to read from 611 * 612 * Returns the value of indirect register @reg_addr 613 */ 614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 615 u32 reg_addr) 616 { 617 unsigned long flags, pcie_index, pcie_data; 618 void __iomem *pcie_index_offset; 619 void __iomem *pcie_data_offset; 620 u64 r; 621 622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 624 625 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 628 629 /* read low 32 bits */ 630 writel(reg_addr, pcie_index_offset); 631 readl(pcie_index_offset); 632 r = readl(pcie_data_offset); 633 /* read high 32 bits */ 634 writel(reg_addr + 4, pcie_index_offset); 635 readl(pcie_index_offset); 636 r |= ((u64)readl(pcie_data_offset) << 32); 637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 638 639 return r; 640 } 641 642 /** 643 * amdgpu_device_indirect_wreg - write an indirect register address 644 * 645 * @adev: amdgpu_device pointer 646 * @reg_addr: indirect register offset 647 * @reg_data: indirect register data 648 * 649 */ 650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 651 u32 reg_addr, u32 reg_data) 652 { 653 unsigned long flags, pcie_index, pcie_data; 654 void __iomem *pcie_index_offset; 655 void __iomem *pcie_data_offset; 656 657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 659 660 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 663 664 writel(reg_addr, pcie_index_offset); 665 readl(pcie_index_offset); 666 writel(reg_data, pcie_data_offset); 667 readl(pcie_data_offset); 668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 669 } 670 671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 672 u64 reg_addr, u32 reg_data) 673 { 674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 675 void __iomem *pcie_index_offset; 676 void __iomem *pcie_index_hi_offset; 677 void __iomem *pcie_data_offset; 678 679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 681 if (adev->nbio.funcs->get_pcie_index_hi_offset) 682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 683 else 684 pcie_index_hi = 0; 685 686 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 689 if (pcie_index_hi != 0) 690 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 691 pcie_index_hi * 4; 692 693 writel(reg_addr, pcie_index_offset); 694 readl(pcie_index_offset); 695 if (pcie_index_hi != 0) { 696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 697 readl(pcie_index_hi_offset); 698 } 699 writel(reg_data, pcie_data_offset); 700 readl(pcie_data_offset); 701 702 /* clear 
the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
937 * 938 * @adev: amdgpu_device pointer 939 * @registers: pointer to the register array 940 * @array_size: size of the register array 941 * 942 * Programs an array or registers with and or masks. 943 * This is a helper for setting golden registers. 944 */ 945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 946 const u32 *registers, 947 const u32 array_size) 948 { 949 u32 tmp, reg, and_mask, or_mask; 950 int i; 951 952 if (array_size % 3) 953 return; 954 955 for (i = 0; i < array_size; i += 3) { 956 reg = registers[i + 0]; 957 and_mask = registers[i + 1]; 958 or_mask = registers[i + 2]; 959 960 if (and_mask == 0xffffffff) { 961 tmp = or_mask; 962 } else { 963 tmp = RREG32(reg); 964 tmp &= ~and_mask; 965 if (adev->family >= AMDGPU_FAMILY_AI) 966 tmp |= (or_mask & and_mask); 967 else 968 tmp |= or_mask; 969 } 970 WREG32(reg, tmp); 971 } 972 } 973 974 /** 975 * amdgpu_device_pci_config_reset - reset the GPU 976 * 977 * @adev: amdgpu_device pointer 978 * 979 * Resets the GPU using the pci config reset sequence. 980 * Only applicable to asics prior to vega10. 981 */ 982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 983 { 984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 985 } 986 987 /** 988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 993 */ 994 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 995 { 996 STUB(); 997 return -ENOSYS; 998 #ifdef notyet 999 return pci_reset_function(adev->pdev); 1000 #endif 1001 } 1002 1003 /* 1004 * amdgpu_device_wb_*() 1005 * Writeback is the method by which the GPU updates special pages in memory 1006 * with the status of certain GPU events (fences, ring pointers,etc.). 1007 */ 1008 1009 /** 1010 * amdgpu_device_wb_fini - Disable Writeback and free memory 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Disables Writeback and frees the Writeback memory (all asics). 1015 * Used at driver shutdown. 1016 */ 1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1018 { 1019 if (adev->wb.wb_obj) { 1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1021 &adev->wb.gpu_addr, 1022 (void **)&adev->wb.wb); 1023 adev->wb.wb_obj = NULL; 1024 } 1025 } 1026 1027 /** 1028 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1029 * 1030 * @adev: amdgpu_device pointer 1031 * 1032 * Initializes writeback and allocates writeback memory (all asics). 1033 * Used at driver startup. 1034 * Returns 0 on success or an -error on failure. 
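 *
 * Illustrative sketch (not taken from this file): a ring or IP block
 * typically reserves a slot with amdgpu_device_wb_get(); the returned index
 * is a dword offset, so the CPU and GPU views of the slot would be
 * &adev->wb.wb[idx] and adev->wb.gpu_addr + (idx * 4):
 *
 *	u32 idx;
 *	u64 wb_gpu_addr;
 *
 *	if (!amdgpu_device_wb_get(adev, &idx))
 *		wb_gpu_addr = adev->wb.gpu_addr + (idx * 4);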
1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1073 1074 if (offset < adev->wb.num_wb) { 1075 __set_bit(offset, adev->wb.used); 1076 *wb = offset << 3; /* convert to dw offset */ 1077 return 0; 1078 } else { 1079 return -EINVAL; 1080 } 1081 } 1082 1083 /** 1084 * amdgpu_device_wb_free - Free a wb entry 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @wb: wb index 1088 * 1089 * Free a wb slot allocated for use by the driver (all asics) 1090 */ 1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1092 { 1093 wb >>= 3; 1094 if (wb < adev->wb.num_wb) 1095 __clear_bit(wb, adev->wb.used); 1096 } 1097 1098 /** 1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1104 * to fail, but if any of the BARs is not accessible after the size we abort 1105 * driver loading by returning -ENODEV. 1106 */ 1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1108 { 1109 #ifdef __linux__ 1110 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1111 struct pci_bus *root; 1112 struct resource *res; 1113 unsigned int i; 1114 u16 cmd; 1115 int r; 1116 1117 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1118 return 0; 1119 1120 /* Bypass for VF */ 1121 if (amdgpu_sriov_vf(adev)) 1122 return 0; 1123 1124 /* skip if the bios has already enabled large BAR */ 1125 if (adev->gmc.real_vram_size && 1126 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1127 return 0; 1128 1129 /* Check if the root BUS has 64bit memory resources */ 1130 root = adev->pdev->bus; 1131 while (root->parent) 1132 root = root->parent; 1133 1134 pci_bus_for_each_resource(root, res, i) { 1135 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1136 res->start > 0x100000000ull) 1137 break; 1138 } 1139 1140 /* Trying to resize is pointless without a root hub window above 4GB */ 1141 if (!res) 1142 return 0; 1143 1144 /* Limit the BAR size to what is available */ 1145 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1146 rbar_size); 1147 1148 /* Disable memory decoding while we change the BAR addresses and size */ 1149 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1150 pci_write_config_word(adev->pdev, PCI_COMMAND, 1151 cmd & ~PCI_COMMAND_MEMORY); 1152 1153 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1154 amdgpu_doorbell_fini(adev); 1155 if (adev->asic_type >= CHIP_BONAIRE) 1156 pci_release_resource(adev->pdev, 2); 1157 1158 pci_release_resource(adev->pdev, 0); 1159 1160 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1161 if (r == -ENOSPC) 1162 DRM_INFO("Not enough PCI address space for a large BAR."); 1163 else if (r && r != -ENOTSUPP) 1164 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1165 1166 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1167 1168 /* When the doorbell or fb BAR isn't available we have no chance of 1169 * using the device. 1170 */ 1171 r = amdgpu_doorbell_init(adev); 1172 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1173 return -ENODEV; 1174 1175 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1176 #endif /* __linux__ */ 1177 1178 return 0; 1179 } 1180 1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1182 { 1183 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1184 return false; 1185 1186 return true; 1187 } 1188 1189 /* 1190 * GPU helpers function. 1191 */ 1192 /** 1193 * amdgpu_device_need_post - check if the hw need post or not 1194 * 1195 * @adev: amdgpu_device pointer 1196 * 1197 * Check if the asic has been initialized (all asics) at driver startup 1198 * or post is needed if hw reset is performed. 1199 * Returns true if need or false if not. 1200 */ 1201 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1202 { 1203 uint32_t reg; 1204 1205 if (amdgpu_sriov_vf(adev)) 1206 return false; 1207 1208 if (!amdgpu_device_read_bios(adev)) 1209 return false; 1210 1211 if (amdgpu_passthrough(adev)) { 1212 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1213 * some old smc fw still need driver do vPost otherwise gpu hang, while 1214 * those smc fw version above 22.15 doesn't have this flaw, so we force 1215 * vpost executed for smc version below 22.15 1216 */ 1217 if (adev->asic_type == CHIP_FIJI) { 1218 int err; 1219 uint32_t fw_ver; 1220 1221 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1222 /* force vPost if error occured */ 1223 if (err) 1224 return true; 1225 1226 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1227 release_firmware(adev->pm.fw); 1228 if (fw_ver < 0x00160e00) 1229 return true; 1230 } 1231 } 1232 1233 /* Don't post if we need to reset whole hive on init */ 1234 if (adev->gmc.xgmi.pending_reset) 1235 return false; 1236 1237 if (adev->has_hw_reset) { 1238 adev->has_hw_reset = false; 1239 return true; 1240 } 1241 1242 /* bios scratch used on CIK+ */ 1243 if (adev->asic_type >= CHIP_BONAIRE) 1244 return amdgpu_atombios_scratch_need_asic_init(adev); 1245 1246 /* check MEM_SIZE for older asics */ 1247 reg = amdgpu_asic_get_config_memsize(adev); 1248 1249 if ((reg != 0) && (reg != 0xffffffff)) 1250 return false; 1251 1252 return true; 1253 } 1254 1255 /* 1256 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1257 * speed switching. Until we have confirmation from Intel that a specific host 1258 * supports it, it's safer that we keep it disabled for all. 
1259 * 1260 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1261 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1262 */ 1263 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1264 { 1265 #if IS_ENABLED(CONFIG_X86) 1266 #ifdef __linux__ 1267 struct cpuinfo_x86 *c = &cpu_data(0); 1268 1269 if (c->x86_vendor == X86_VENDOR_INTEL) 1270 #else 1271 if (strcmp(cpu_vendor, "GenuineIntel") == 0) 1272 #endif 1273 return false; 1274 #endif 1275 return true; 1276 } 1277 1278 /** 1279 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1280 * 1281 * @adev: amdgpu_device pointer 1282 * 1283 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1284 * be set for this device. 1285 * 1286 * Returns true if it should be used or false if not. 1287 */ 1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1289 { 1290 switch (amdgpu_aspm) { 1291 case -1: 1292 break; 1293 case 0: 1294 return false; 1295 case 1: 1296 return true; 1297 default: 1298 return false; 1299 } 1300 return pcie_aspm_enabled(adev->pdev); 1301 } 1302 1303 bool amdgpu_device_aspm_support_quirk(void) 1304 { 1305 #if IS_ENABLED(CONFIG_X86) 1306 struct cpu_info *ci = curcpu(); 1307 1308 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1309 #else 1310 return true; 1311 #endif 1312 } 1313 1314 /* if we get transitioned to only one device, take VGA back */ 1315 /** 1316 * amdgpu_device_vga_set_decode - enable/disable vga decode 1317 * 1318 * @pdev: PCI device pointer 1319 * @state: enable/disable vga decode 1320 * 1321 * Enable/disable vga decode (all asics). 1322 * Returns VGA resource flags. 1323 */ 1324 #ifdef notyet 1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1326 bool state) 1327 { 1328 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1329 1330 amdgpu_asic_set_vga_state(adev, state); 1331 if (state) 1332 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1333 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1334 else 1335 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1336 } 1337 #endif 1338 1339 /** 1340 * amdgpu_device_check_block_size - validate the vm block size 1341 * 1342 * @adev: amdgpu_device pointer 1343 * 1344 * Validates the vm block size specified via module parameter. 1345 * The vm block size defines number of bits in page table versus page directory, 1346 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1347 * page table and the remaining bits are in the page directory. 1348 */ 1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1350 { 1351 /* defines number of bits in page table versus page directory, 1352 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1353 * page table and the remaining bits are in the page directory 1354 */ 1355 if (amdgpu_vm_block_size == -1) 1356 return; 1357 1358 if (amdgpu_vm_block_size < 9) { 1359 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1360 amdgpu_vm_block_size); 1361 amdgpu_vm_block_size = -1; 1362 } 1363 } 1364 1365 /** 1366 * amdgpu_device_check_vm_size - validate the vm size 1367 * 1368 * @adev: amdgpu_device pointer 1369 * 1370 * Validates the vm size in GB specified via module parameter. 1371 * The VM size is the size of the GPU virtual memory space in GB. 
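 *
 * For example (illustrative): booting with amdgpu.vm_size=256 requests a
 * 256GB GPU virtual address space per VM, while any value below 1 is
 * rejected by the check below and reverts to the automatic default (-1).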
1372 */ 1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1374 { 1375 /* no need to check the default value */ 1376 if (amdgpu_vm_size == -1) 1377 return; 1378 1379 if (amdgpu_vm_size < 1) { 1380 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1381 amdgpu_vm_size); 1382 amdgpu_vm_size = -1; 1383 } 1384 } 1385 1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1387 { 1388 #ifdef __linux__ 1389 struct sysinfo si; 1390 #endif 1391 bool is_os_64 = (sizeof(void *) == 8); 1392 uint64_t total_memory; 1393 uint64_t dram_size_seven_GB = 0x1B8000000; 1394 uint64_t dram_size_three_GB = 0xB8000000; 1395 1396 if (amdgpu_smu_memory_pool_size == 0) 1397 return; 1398 1399 if (!is_os_64) { 1400 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1401 goto def_value; 1402 } 1403 #ifdef __linux__ 1404 si_meminfo(&si); 1405 total_memory = (uint64_t)si.totalram * si.mem_unit; 1406 #else 1407 total_memory = ptoa(physmem); 1408 #endif 1409 1410 if ((amdgpu_smu_memory_pool_size == 1) || 1411 (amdgpu_smu_memory_pool_size == 2)) { 1412 if (total_memory < dram_size_three_GB) 1413 goto def_value1; 1414 } else if ((amdgpu_smu_memory_pool_size == 4) || 1415 (amdgpu_smu_memory_pool_size == 8)) { 1416 if (total_memory < dram_size_seven_GB) 1417 goto def_value1; 1418 } else { 1419 DRM_WARN("Smu memory pool size not supported\n"); 1420 goto def_value; 1421 } 1422 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1423 1424 return; 1425 1426 def_value1: 1427 DRM_WARN("No enough system memory\n"); 1428 def_value: 1429 adev->pm.smu_prv_buffer_size = 0; 1430 } 1431 1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1433 { 1434 if (!(adev->flags & AMD_IS_APU) || 1435 adev->asic_type < CHIP_RAVEN) 1436 return 0; 1437 1438 switch (adev->asic_type) { 1439 case CHIP_RAVEN: 1440 if (adev->pdev->device == 0x15dd) 1441 adev->apu_flags |= AMD_APU_IS_RAVEN; 1442 if (adev->pdev->device == 0x15d8) 1443 adev->apu_flags |= AMD_APU_IS_PICASSO; 1444 break; 1445 case CHIP_RENOIR: 1446 if ((adev->pdev->device == 0x1636) || 1447 (adev->pdev->device == 0x164c)) 1448 adev->apu_flags |= AMD_APU_IS_RENOIR; 1449 else 1450 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1451 break; 1452 case CHIP_VANGOGH: 1453 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1454 break; 1455 case CHIP_YELLOW_CARP: 1456 break; 1457 case CHIP_CYAN_SKILLFISH: 1458 if ((adev->pdev->device == 0x13FE) || 1459 (adev->pdev->device == 0x143F)) 1460 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1461 break; 1462 default: 1463 break; 1464 } 1465 1466 return 0; 1467 } 1468 1469 /** 1470 * amdgpu_device_check_arguments - validate module params 1471 * 1472 * @adev: amdgpu_device pointer 1473 * 1474 * Validates certain module parameters and updates 1475 * the associated values used by the driver (all asics). 
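 *
 * For example (illustrative): amdgpu.sched_jobs=6 is not a power of two and
 * is rounded up to 8, while amdgpu.sched_jobs=2 is below the minimum and is
 * raised to 4, as implemented in the checks below.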
1476 */ 1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1478 { 1479 if (amdgpu_sched_jobs < 4) { 1480 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1481 amdgpu_sched_jobs); 1482 amdgpu_sched_jobs = 4; 1483 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1484 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1485 amdgpu_sched_jobs); 1486 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1487 } 1488 1489 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1490 /* gart size must be greater or equal to 32M */ 1491 dev_warn(adev->dev, "gart size (%d) too small\n", 1492 amdgpu_gart_size); 1493 amdgpu_gart_size = -1; 1494 } 1495 1496 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1497 /* gtt size must be greater or equal to 32M */ 1498 dev_warn(adev->dev, "gtt size (%d) too small\n", 1499 amdgpu_gtt_size); 1500 amdgpu_gtt_size = -1; 1501 } 1502 1503 /* valid range is between 4 and 9 inclusive */ 1504 if (amdgpu_vm_fragment_size != -1 && 1505 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1506 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1507 amdgpu_vm_fragment_size = -1; 1508 } 1509 1510 if (amdgpu_sched_hw_submission < 2) { 1511 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1512 amdgpu_sched_hw_submission); 1513 amdgpu_sched_hw_submission = 2; 1514 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1515 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1516 amdgpu_sched_hw_submission); 1517 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1518 } 1519 1520 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1521 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1522 amdgpu_reset_method = -1; 1523 } 1524 1525 amdgpu_device_check_smu_prv_buffer_size(adev); 1526 1527 amdgpu_device_check_vm_size(adev); 1528 1529 amdgpu_device_check_block_size(adev); 1530 1531 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1532 1533 return 0; 1534 } 1535 1536 #ifdef __linux__ 1537 /** 1538 * amdgpu_switcheroo_set_state - set switcheroo state 1539 * 1540 * @pdev: pci dev pointer 1541 * @state: vga_switcheroo state 1542 * 1543 * Callback for the switcheroo driver. Suspends or resumes 1544 * the asics before or after it is powered up using ACPI methods. 
1545 */ 1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1547 enum vga_switcheroo_state state) 1548 { 1549 struct drm_device *dev = pci_get_drvdata(pdev); 1550 int r; 1551 1552 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1553 return; 1554 1555 if (state == VGA_SWITCHEROO_ON) { 1556 pr_info("switched on\n"); 1557 /* don't suspend or resume card normally */ 1558 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1559 1560 pci_set_power_state(pdev, PCI_D0); 1561 amdgpu_device_load_pci_state(pdev); 1562 r = pci_enable_device(pdev); 1563 if (r) 1564 DRM_WARN("pci_enable_device failed (%d)\n", r); 1565 amdgpu_device_resume(dev, true); 1566 1567 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1568 } else { 1569 pr_info("switched off\n"); 1570 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1571 amdgpu_device_suspend(dev, true); 1572 amdgpu_device_cache_pci_state(pdev); 1573 /* Shut down the device */ 1574 pci_disable_device(pdev); 1575 pci_set_power_state(pdev, PCI_D3cold); 1576 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1577 } 1578 } 1579 1580 /** 1581 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1582 * 1583 * @pdev: pci dev pointer 1584 * 1585 * Callback for the switcheroo driver. Check of the switcheroo 1586 * state can be changed. 1587 * Returns true if the state can be changed, false if not. 1588 */ 1589 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1590 { 1591 struct drm_device *dev = pci_get_drvdata(pdev); 1592 1593 /* 1594 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1595 * locking inversion with the driver load path. And the access here is 1596 * completely racy anyway. So don't bother with locking for now. 1597 */ 1598 return atomic_read(&dev->open_count) == 0; 1599 } 1600 #endif /* __linux__ */ 1601 1602 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1603 #ifdef notyet 1604 .set_gpu_state = amdgpu_switcheroo_set_state, 1605 .reprobe = NULL, 1606 .can_switch = amdgpu_switcheroo_can_switch, 1607 #endif 1608 }; 1609 1610 /** 1611 * amdgpu_device_ip_set_clockgating_state - set the CG state 1612 * 1613 * @dev: amdgpu_device pointer 1614 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1615 * @state: clockgating state (gate or ungate) 1616 * 1617 * Sets the requested clockgating state for all instances of 1618 * the hardware IP specified. 1619 * Returns the error code from the last instance. 1620 */ 1621 int amdgpu_device_ip_set_clockgating_state(void *dev, 1622 enum amd_ip_block_type block_type, 1623 enum amd_clockgating_state state) 1624 { 1625 struct amdgpu_device *adev = dev; 1626 int i, r = 0; 1627 1628 for (i = 0; i < adev->num_ip_blocks; i++) { 1629 if (!adev->ip_blocks[i].status.valid) 1630 continue; 1631 if (adev->ip_blocks[i].version->type != block_type) 1632 continue; 1633 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1634 continue; 1635 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1636 (void *)adev, state); 1637 if (r) 1638 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1639 adev->ip_blocks[i].version->funcs->name, r); 1640 } 1641 return r; 1642 } 1643 1644 /** 1645 * amdgpu_device_ip_set_powergating_state - set the PG state 1646 * 1647 * @dev: amdgpu_device pointer 1648 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1649 * @state: powergating state (gate or ungate) 1650 * 1651 * Sets the requested powergating state for all instances of 1652 * the hardware IP specified. 1653 * Returns the error code from the last instance. 1654 */ 1655 int amdgpu_device_ip_set_powergating_state(void *dev, 1656 enum amd_ip_block_type block_type, 1657 enum amd_powergating_state state) 1658 { 1659 struct amdgpu_device *adev = dev; 1660 int i, r = 0; 1661 1662 for (i = 0; i < adev->num_ip_blocks; i++) { 1663 if (!adev->ip_blocks[i].status.valid) 1664 continue; 1665 if (adev->ip_blocks[i].version->type != block_type) 1666 continue; 1667 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1668 continue; 1669 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1670 (void *)adev, state); 1671 if (r) 1672 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1673 adev->ip_blocks[i].version->funcs->name, r); 1674 } 1675 return r; 1676 } 1677 1678 /** 1679 * amdgpu_device_ip_get_clockgating_state - get the CG state 1680 * 1681 * @adev: amdgpu_device pointer 1682 * @flags: clockgating feature flags 1683 * 1684 * Walks the list of IPs on the device and updates the clockgating 1685 * flags for each IP. 1686 * Updates @flags with the feature flags for each hardware IP where 1687 * clockgating is enabled. 1688 */ 1689 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1690 u64 *flags) 1691 { 1692 int i; 1693 1694 for (i = 0; i < adev->num_ip_blocks; i++) { 1695 if (!adev->ip_blocks[i].status.valid) 1696 continue; 1697 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1698 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1699 } 1700 } 1701 1702 /** 1703 * amdgpu_device_ip_wait_for_idle - wait for idle 1704 * 1705 * @adev: amdgpu_device pointer 1706 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1707 * 1708 * Waits for the request hardware IP to be idle. 1709 * Returns 0 for success or a negative error code on failure. 1710 */ 1711 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1712 enum amd_ip_block_type block_type) 1713 { 1714 int i, r; 1715 1716 for (i = 0; i < adev->num_ip_blocks; i++) { 1717 if (!adev->ip_blocks[i].status.valid) 1718 continue; 1719 if (adev->ip_blocks[i].version->type == block_type) { 1720 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1721 if (r) 1722 return r; 1723 break; 1724 } 1725 } 1726 return 0; 1727 1728 } 1729 1730 /** 1731 * amdgpu_device_ip_is_idle - is the hardware IP idle 1732 * 1733 * @adev: amdgpu_device pointer 1734 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1735 * 1736 * Check if the hardware IP is idle or not. 1737 * Returns true if it the IP is idle, false if not. 1738 */ 1739 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1740 enum amd_ip_block_type block_type) 1741 { 1742 int i; 1743 1744 for (i = 0; i < adev->num_ip_blocks; i++) { 1745 if (!adev->ip_blocks[i].status.valid) 1746 continue; 1747 if (adev->ip_blocks[i].version->type == block_type) 1748 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1749 } 1750 return true; 1751 1752 } 1753 1754 /** 1755 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1756 * 1757 * @adev: amdgpu_device pointer 1758 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1759 * 1760 * Returns a pointer to the hardware IP block structure 1761 * if it exists for the asic, otherwise NULL. 
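 *
 * Typical usage (illustrative sketch):
 *
 *	struct amdgpu_ip_block *ip_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block)
 *		DRM_INFO("GFX IP v%d.%d\n", ip_block->version->major,
 *			 ip_block->version->minor);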
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
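 *
 * Example values (illustrative; the exact PCI address depends on the system):
 *
 *	amdgpu.virtual_display=0000:03:00.0,2  - two virtual crtcs on that device
 *	amdgpu.virtual_display=all,1           - one virtual crtc on every device
 *
 * The crtc count is clamped to the range 1..6 by the parser below.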
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

#ifdef notyet
	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
#endif
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
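 *
 * The firmware file is requested as "amdgpu/<chip>_gpu_info.bin", e.g.
 * "amdgpu/vega10_gpu_info.bin" (see the MODULE_FIRMWARE() declarations at
 * the top of this file), and the lookup is skipped entirely when IP
 * discovery data is present (adev->mman.discovery_bin).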
1914 */ 1915 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1916 { 1917 const char *chip_name; 1918 char fw_name[40]; 1919 int err; 1920 const struct gpu_info_firmware_header_v1_0 *hdr; 1921 1922 adev->firmware.gpu_info_fw = NULL; 1923 1924 if (adev->mman.discovery_bin) 1925 return 0; 1926 1927 switch (adev->asic_type) { 1928 default: 1929 return 0; 1930 case CHIP_VEGA10: 1931 chip_name = "vega10"; 1932 break; 1933 case CHIP_VEGA12: 1934 chip_name = "vega12"; 1935 break; 1936 case CHIP_RAVEN: 1937 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1938 chip_name = "raven2"; 1939 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1940 chip_name = "picasso"; 1941 else 1942 chip_name = "raven"; 1943 break; 1944 case CHIP_ARCTURUS: 1945 chip_name = "arcturus"; 1946 break; 1947 case CHIP_NAVI12: 1948 chip_name = "navi12"; 1949 break; 1950 } 1951 1952 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1953 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1954 if (err) { 1955 dev_err(adev->dev, 1956 "Failed to get gpu_info firmware \"%s\"\n", 1957 fw_name); 1958 goto out; 1959 } 1960 1961 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1962 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1963 1964 switch (hdr->version_major) { 1965 case 1: 1966 { 1967 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1968 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1969 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1970 1971 /* 1972 * Should be droped when DAL no longer needs it. 1973 */ 1974 if (adev->asic_type == CHIP_NAVI12) 1975 goto parse_soc_bounding_box; 1976 1977 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1978 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1979 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1980 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1981 adev->gfx.config.max_texture_channel_caches = 1982 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1983 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1984 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1985 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1986 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1987 adev->gfx.config.double_offchip_lds_buf = 1988 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1989 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1990 adev->gfx.cu_info.max_waves_per_simd = 1991 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1992 adev->gfx.cu_info.max_scratch_slots_per_cu = 1993 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1994 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1995 if (hdr->version_minor >= 1) { 1996 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1997 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1998 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1999 adev->gfx.config.num_sc_per_sh = 2000 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2001 adev->gfx.config.num_packer_per_sc = 2002 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2003 } 2004 2005 parse_soc_bounding_box: 2006 /* 2007 * soc bounding box info is not integrated in disocovery table, 2008 * we always need to parse it from gpu info firmware if needed. 
2009 */ 2010 if (hdr->version_minor == 2) { 2011 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2012 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2013 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2014 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2015 } 2016 break; 2017 } 2018 default: 2019 dev_err(adev->dev, 2020 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2021 err = -EINVAL; 2022 goto out; 2023 } 2024 out: 2025 return err; 2026 } 2027 2028 /** 2029 * amdgpu_device_ip_early_init - run early init for hardware IPs 2030 * 2031 * @adev: amdgpu_device pointer 2032 * 2033 * Early initialization pass for hardware IPs. The hardware IPs that make 2034 * up each asic are discovered each IP's early_init callback is run. This 2035 * is the first stage in initializing the asic. 2036 * Returns 0 on success, negative error code on failure. 2037 */ 2038 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2039 { 2040 struct pci_dev *parent; 2041 int i, r; 2042 bool total; 2043 2044 amdgpu_device_enable_virtual_display(adev); 2045 2046 if (amdgpu_sriov_vf(adev)) { 2047 r = amdgpu_virt_request_full_gpu(adev, true); 2048 if (r) 2049 return r; 2050 } 2051 2052 switch (adev->asic_type) { 2053 #ifdef CONFIG_DRM_AMDGPU_SI 2054 case CHIP_VERDE: 2055 case CHIP_TAHITI: 2056 case CHIP_PITCAIRN: 2057 case CHIP_OLAND: 2058 case CHIP_HAINAN: 2059 adev->family = AMDGPU_FAMILY_SI; 2060 r = si_set_ip_blocks(adev); 2061 if (r) 2062 return r; 2063 break; 2064 #endif 2065 #ifdef CONFIG_DRM_AMDGPU_CIK 2066 case CHIP_BONAIRE: 2067 case CHIP_HAWAII: 2068 case CHIP_KAVERI: 2069 case CHIP_KABINI: 2070 case CHIP_MULLINS: 2071 if (adev->flags & AMD_IS_APU) 2072 adev->family = AMDGPU_FAMILY_KV; 2073 else 2074 adev->family = AMDGPU_FAMILY_CI; 2075 2076 r = cik_set_ip_blocks(adev); 2077 if (r) 2078 return r; 2079 break; 2080 #endif 2081 case CHIP_TOPAZ: 2082 case CHIP_TONGA: 2083 case CHIP_FIJI: 2084 case CHIP_POLARIS10: 2085 case CHIP_POLARIS11: 2086 case CHIP_POLARIS12: 2087 case CHIP_VEGAM: 2088 case CHIP_CARRIZO: 2089 case CHIP_STONEY: 2090 if (adev->flags & AMD_IS_APU) 2091 adev->family = AMDGPU_FAMILY_CZ; 2092 else 2093 adev->family = AMDGPU_FAMILY_VI; 2094 2095 r = vi_set_ip_blocks(adev); 2096 if (r) 2097 return r; 2098 break; 2099 default: 2100 r = amdgpu_discovery_set_ip_blocks(adev); 2101 if (r) 2102 return r; 2103 break; 2104 } 2105 2106 if (amdgpu_has_atpx() && 2107 (amdgpu_is_atpx_hybrid() || 2108 amdgpu_has_atpx_dgpu_power_cntl()) && 2109 ((adev->flags & AMD_IS_APU) == 0) && 2110 !dev_is_removable(&adev->pdev->dev)) 2111 adev->flags |= AMD_IS_PX; 2112 2113 if (!(adev->flags & AMD_IS_APU)) { 2114 #ifdef notyet 2115 parent = pcie_find_root_port(adev->pdev); 2116 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2117 #else 2118 adev->has_pr3 = false; 2119 #endif 2120 } 2121 2122 2123 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2124 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2125 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2126 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2127 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2128 if (!amdgpu_device_pcie_dynamic_switching_supported()) 2129 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2130 2131 total = true; 2132 for (i = 0; i < adev->num_ip_blocks; i++) { 2133 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2134 DRM_WARN("disabled ip block: %d <%s>\n", 2135 i, adev->ip_blocks[i].version->funcs->name); 2136 adev->ip_blocks[i].status.valid = false; 2137 } else { 2138 if (adev->ip_blocks[i].version->funcs->early_init) { 2139 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2140 if (r == -ENOENT) { 2141 adev->ip_blocks[i].status.valid = false; 2142 } else if (r) { 2143 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2144 adev->ip_blocks[i].version->funcs->name, r); 2145 total = false; 2146 } else { 2147 adev->ip_blocks[i].status.valid = true; 2148 } 2149 } else { 2150 adev->ip_blocks[i].status.valid = true; 2151 } 2152 } 2153 /* get the vbios after the asic_funcs are set up */ 2154 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2155 r = amdgpu_device_parse_gpu_info_fw(adev); 2156 if (r) 2157 return r; 2158 2159 /* Read BIOS */ 2160 if (amdgpu_device_read_bios(adev)) { 2161 if (!amdgpu_get_bios(adev)) 2162 return -EINVAL; 2163 2164 r = amdgpu_atombios_init(adev); 2165 if (r) { 2166 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2167 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2168 return r; 2169 } 2170 } 2171 2172 /*get pf2vf msg info at it's earliest time*/ 2173 if (amdgpu_sriov_vf(adev)) 2174 amdgpu_virt_init_data_exchange(adev); 2175 2176 } 2177 } 2178 if (!total) 2179 return -ENODEV; 2180 2181 amdgpu_amdkfd_device_probe(adev); 2182 adev->cg_flags &= amdgpu_cg_mask; 2183 adev->pg_flags &= amdgpu_pg_mask; 2184 2185 return 0; 2186 } 2187 2188 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2189 { 2190 int i, r; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if (!adev->ip_blocks[i].status.sw) 2194 continue; 2195 if (adev->ip_blocks[i].status.hw) 2196 continue; 2197 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2198 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2199 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2200 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2201 if (r) { 2202 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 return r; 2205 } 2206 adev->ip_blocks[i].status.hw = true; 2207 } 2208 } 2209 2210 return 0; 2211 } 2212 2213 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2214 { 2215 int i, r; 2216 2217 for (i = 0; i < adev->num_ip_blocks; i++) { 2218 if (!adev->ip_blocks[i].status.sw) 2219 continue; 2220 if (adev->ip_blocks[i].status.hw) 2221 continue; 2222 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2223 if (r) { 2224 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2225 adev->ip_blocks[i].version->funcs->name, r); 2226 return r; 2227 } 2228 adev->ip_blocks[i].status.hw = true; 2229 } 2230 2231 return 0; 2232 } 2233 2234 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2235 { 
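	/*
	 * Bring up PSP first on ASICs that have one (VEGA10 and newer) so it
	 * can load firmware on behalf of the other IP blocks; afterwards the
	 * SMU firmware is loaded, except on SR-IOV virtual functions (other
	 * than Tonga) where that step is skipped.
	 */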
2236 int r = 0; 2237 int i; 2238 uint32_t smu_version; 2239 2240 if (adev->asic_type >= CHIP_VEGA10) { 2241 for (i = 0; i < adev->num_ip_blocks; i++) { 2242 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2243 continue; 2244 2245 if (!adev->ip_blocks[i].status.sw) 2246 continue; 2247 2248 /* no need to do the fw loading again if already done*/ 2249 if (adev->ip_blocks[i].status.hw == true) 2250 break; 2251 2252 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2253 r = adev->ip_blocks[i].version->funcs->resume(adev); 2254 if (r) { 2255 DRM_ERROR("resume of IP block <%s> failed %d\n", 2256 adev->ip_blocks[i].version->funcs->name, r); 2257 return r; 2258 } 2259 } else { 2260 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2261 if (r) { 2262 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2263 adev->ip_blocks[i].version->funcs->name, r); 2264 return r; 2265 } 2266 } 2267 2268 adev->ip_blocks[i].status.hw = true; 2269 break; 2270 } 2271 } 2272 2273 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2274 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2275 2276 return r; 2277 } 2278 2279 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2280 { 2281 long timeout; 2282 int r, i; 2283 2284 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2285 struct amdgpu_ring *ring = adev->rings[i]; 2286 2287 /* No need to setup the GPU scheduler for rings that don't need it */ 2288 if (!ring || ring->no_scheduler) 2289 continue; 2290 2291 switch (ring->funcs->type) { 2292 case AMDGPU_RING_TYPE_GFX: 2293 timeout = adev->gfx_timeout; 2294 break; 2295 case AMDGPU_RING_TYPE_COMPUTE: 2296 timeout = adev->compute_timeout; 2297 break; 2298 case AMDGPU_RING_TYPE_SDMA: 2299 timeout = adev->sdma_timeout; 2300 break; 2301 default: 2302 timeout = adev->video_timeout; 2303 break; 2304 } 2305 2306 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2307 ring->num_hw_submission, 0, 2308 timeout, adev->reset_domain->wq, 2309 ring->sched_score, ring->name, 2310 adev->dev); 2311 if (r) { 2312 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2313 ring->name); 2314 return r; 2315 } 2316 } 2317 2318 amdgpu_xcp_update_partition_sched_list(adev); 2319 2320 return 0; 2321 } 2322 2323 2324 /** 2325 * amdgpu_device_ip_init - run init for hardware IPs 2326 * 2327 * @adev: amdgpu_device pointer 2328 * 2329 * Main initialization pass for hardware IPs. The list of all the hardware 2330 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2331 * are run. sw_init initializes the software state associated with each IP 2332 * and hw_init initializes the hardware associated with each IP. 2333 * Returns 0 on success, negative error code on failure. 
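 *
 * In outline: sw_init runs for every valid block, with COMMON and GMC also
 * getting their hw_init early (GMC so GPU memory, the write-back buffers and
 * the optional CSA for MCBP can be allocated). The remaining hw_init work is
 * then done in two phases around firmware loading, followed by RAS recovery
 * init, XGMI reset-domain wiring, scheduler init and KFD init.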
2334 */ 2335 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2336 { 2337 int i, r; 2338 2339 r = amdgpu_ras_init(adev); 2340 if (r) 2341 return r; 2342 2343 for (i = 0; i < adev->num_ip_blocks; i++) { 2344 if (!adev->ip_blocks[i].status.valid) 2345 continue; 2346 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2347 if (r) { 2348 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2349 adev->ip_blocks[i].version->funcs->name, r); 2350 goto init_failed; 2351 } 2352 adev->ip_blocks[i].status.sw = true; 2353 2354 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2355 /* need to do common hw init early so everything is set up for gmc */ 2356 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2357 if (r) { 2358 DRM_ERROR("hw_init %d failed %d\n", i, r); 2359 goto init_failed; 2360 } 2361 adev->ip_blocks[i].status.hw = true; 2362 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2363 /* need to do gmc hw init early so we can allocate gpu mem */ 2364 /* Try to reserve bad pages early */ 2365 if (amdgpu_sriov_vf(adev)) 2366 amdgpu_virt_exchange_data(adev); 2367 2368 r = amdgpu_device_mem_scratch_init(adev); 2369 if (r) { 2370 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2371 goto init_failed; 2372 } 2373 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2374 if (r) { 2375 DRM_ERROR("hw_init %d failed %d\n", i, r); 2376 goto init_failed; 2377 } 2378 r = amdgpu_device_wb_init(adev); 2379 if (r) { 2380 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2381 goto init_failed; 2382 } 2383 adev->ip_blocks[i].status.hw = true; 2384 2385 /* right after GMC hw init, we create CSA */ 2386 if (adev->gfx.mcbp) { 2387 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2388 AMDGPU_GEM_DOMAIN_VRAM | 2389 AMDGPU_GEM_DOMAIN_GTT, 2390 AMDGPU_CSA_SIZE); 2391 if (r) { 2392 DRM_ERROR("allocate CSA failed %d\n", r); 2393 goto init_failed; 2394 } 2395 } 2396 } 2397 } 2398 2399 if (amdgpu_sriov_vf(adev)) 2400 amdgpu_virt_init_data_exchange(adev); 2401 2402 r = amdgpu_ib_pool_init(adev); 2403 if (r) { 2404 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2405 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2406 goto init_failed; 2407 } 2408 2409 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2410 if (r) 2411 goto init_failed; 2412 2413 r = amdgpu_device_ip_hw_init_phase1(adev); 2414 if (r) 2415 goto init_failed; 2416 2417 r = amdgpu_device_fw_loading(adev); 2418 if (r) 2419 goto init_failed; 2420 2421 r = amdgpu_device_ip_hw_init_phase2(adev); 2422 if (r) 2423 goto init_failed; 2424 2425 /* 2426 * retired pages will be loaded from eeprom and reserved here, 2427 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2428 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2429 * for I2C communication which only true at this point. 2430 * 2431 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2432 * failure from bad gpu situation and stop amdgpu init process 2433 * accordingly. For other failed cases, it will still release all 2434 * the resource and print error message, rather than returning one 2435 * negative value to upper level. 
 *
 * Note: theoretically, this should be called before all VRAM allocations
 * to protect retired pages from being abused.
 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/**
	 * In case of XGMI, grab an extra reference on the reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see whether the contents of
 * VRAM have been lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. The late initialization pass
 * uses this to enable clockgating for hardware IPs; the fini and suspend
 * passes use it to disable clockgating again.
 * Returns 0 on success, negative error code on failure.
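 *
 * A minimal usage sketch, as seen elsewhere in this file: late init gates
 * clocks with
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *
 * while the fini and suspend paths call it again with AMD_CG_STATE_UNGATE
 * before tearing the blocks down.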
2546 */ 2547 2548 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2549 enum amd_clockgating_state state) 2550 { 2551 int i, j, r; 2552 2553 if (amdgpu_emu_mode == 1) 2554 return 0; 2555 2556 for (j = 0; j < adev->num_ip_blocks; j++) { 2557 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2558 if (!adev->ip_blocks[i].status.late_initialized) 2559 continue; 2560 /* skip CG for GFX, SDMA on S0ix */ 2561 if (adev->in_s0ix && 2562 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2563 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2564 continue; 2565 /* skip CG for VCE/UVD, it's handled specially */ 2566 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2567 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2568 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2570 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2571 /* enable clockgating to save power */ 2572 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2573 state); 2574 if (r) { 2575 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2576 adev->ip_blocks[i].version->funcs->name, r); 2577 return r; 2578 } 2579 } 2580 } 2581 2582 return 0; 2583 } 2584 2585 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2586 enum amd_powergating_state state) 2587 { 2588 int i, j, r; 2589 2590 if (amdgpu_emu_mode == 1) 2591 return 0; 2592 2593 for (j = 0; j < adev->num_ip_blocks; j++) { 2594 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2595 if (!adev->ip_blocks[i].status.late_initialized) 2596 continue; 2597 /* skip PG for GFX, SDMA on S0ix */ 2598 if (adev->in_s0ix && 2599 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2600 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2601 continue; 2602 /* skip CG for VCE/UVD, it's handled specially */ 2603 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2604 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2605 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2607 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2608 /* enable powergating to save power */ 2609 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2610 state); 2611 if (r) { 2612 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2613 adev->ip_blocks[i].version->funcs->name, r); 2614 return r; 2615 } 2616 } 2617 } 2618 return 0; 2619 } 2620 2621 static int amdgpu_device_enable_mgpu_fan_boost(void) 2622 { 2623 struct amdgpu_gpu_instance *gpu_ins; 2624 struct amdgpu_device *adev; 2625 int i, ret = 0; 2626 2627 mutex_lock(&mgpu_info.mutex); 2628 2629 /* 2630 * MGPU fan boost feature should be enabled 2631 * only when there are two or more dGPUs in 2632 * the system 2633 */ 2634 if (mgpu_info.num_dgpu < 2) 2635 goto out; 2636 2637 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2638 gpu_ins = &(mgpu_info.gpu_ins[i]); 2639 adev = gpu_ins->adev; 2640 if (!(adev->flags & AMD_IS_APU) && 2641 !gpu_ins->mgpu_fan_enabled) { 2642 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2643 if (ret) 2644 break; 2645 2646 gpu_ins->mgpu_fan_enabled = 1; 2647 } 2648 } 2649 2650 out: 2651 mutex_unlock(&mgpu_info.mutex); 2652 2653 return ret; 2654 } 2655 2656 /** 2657 * amdgpu_device_ip_late_init - run late init for hardware IPs 2658 * 2659 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So, we wait until all XGMI interlinked devices have been
		 * initialized. This may bring some delays as those devices may
		 * come from different hives. But that should be OK.
2725 */ 2726 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2727 for (i = 0; i < mgpu_info.num_gpu; i++) { 2728 gpu_instance = &(mgpu_info.gpu_ins[i]); 2729 if (gpu_instance->adev->flags & AMD_IS_APU) 2730 continue; 2731 2732 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2733 AMDGPU_XGMI_PSTATE_MIN); 2734 if (r) { 2735 DRM_ERROR("pstate setting failed (%d).\n", r); 2736 break; 2737 } 2738 } 2739 } 2740 2741 mutex_unlock(&mgpu_info.mutex); 2742 } 2743 2744 return 0; 2745 } 2746 2747 /** 2748 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2749 * 2750 * @adev: amdgpu_device pointer 2751 * 2752 * For ASICs need to disable SMC first 2753 */ 2754 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2755 { 2756 int i, r; 2757 2758 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2759 return; 2760 2761 for (i = 0; i < adev->num_ip_blocks; i++) { 2762 if (!adev->ip_blocks[i].status.hw) 2763 continue; 2764 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2765 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2766 /* XXX handle errors */ 2767 if (r) { 2768 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2769 adev->ip_blocks[i].version->funcs->name, r); 2770 } 2771 adev->ip_blocks[i].status.hw = false; 2772 break; 2773 } 2774 } 2775 } 2776 2777 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2778 { 2779 int i, r; 2780 2781 for (i = 0; i < adev->num_ip_blocks; i++) { 2782 if (!adev->ip_blocks[i].version->funcs->early_fini) 2783 continue; 2784 2785 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2786 if (r) { 2787 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 } 2790 } 2791 2792 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2793 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2794 2795 amdgpu_amdkfd_suspend(adev, false); 2796 2797 /* Workaroud for ASICs need to disable SMC first */ 2798 amdgpu_device_smu_fini_early(adev); 2799 2800 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2801 if (!adev->ip_blocks[i].status.hw) 2802 continue; 2803 2804 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2805 /* XXX handle errors */ 2806 if (r) { 2807 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2808 adev->ip_blocks[i].version->funcs->name, r); 2809 } 2810 2811 adev->ip_blocks[i].status.hw = false; 2812 } 2813 2814 if (amdgpu_sriov_vf(adev)) { 2815 if (amdgpu_virt_release_full_gpu(adev, false)) 2816 DRM_ERROR("failed to release exclusive mode on fini\n"); 2817 } 2818 2819 return 0; 2820 } 2821 2822 /** 2823 * amdgpu_device_ip_fini - run fini for hardware IPs 2824 * 2825 * @adev: amdgpu_device pointer 2826 * 2827 * Main teardown pass for hardware IPs. The list of all the hardware 2828 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2829 * are run. hw_fini tears down the hardware associated with each IP 2830 * and sw_fini tears down any software state associated with each IP. 2831 * Returns 0 on success, negative error code on failure. 
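 *
 * Teardown walks the IP list in reverse order; the buffers that were set up
 * around GMC init (ucode bo, static CSA, write-back slots, memory scratch,
 * IB pool) are released when the GMC block's sw_fini turn comes up, and any
 * late_fini callbacks run in a final reverse pass.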
2832 */ 2833 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2834 { 2835 int i, r; 2836 2837 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2838 amdgpu_virt_release_ras_err_handler_data(adev); 2839 2840 if (adev->gmc.xgmi.num_physical_nodes > 1) 2841 amdgpu_xgmi_remove_device(adev); 2842 2843 amdgpu_amdkfd_device_fini_sw(adev); 2844 2845 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2846 if (!adev->ip_blocks[i].status.sw) 2847 continue; 2848 2849 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2850 amdgpu_ucode_free_bo(adev); 2851 amdgpu_free_static_csa(&adev->virt.csa_obj); 2852 amdgpu_device_wb_fini(adev); 2853 amdgpu_device_mem_scratch_fini(adev); 2854 amdgpu_ib_pool_fini(adev); 2855 } 2856 2857 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2858 /* XXX handle errors */ 2859 if (r) { 2860 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2861 adev->ip_blocks[i].version->funcs->name, r); 2862 } 2863 adev->ip_blocks[i].status.sw = false; 2864 adev->ip_blocks[i].status.valid = false; 2865 } 2866 2867 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2868 if (!adev->ip_blocks[i].status.late_initialized) 2869 continue; 2870 if (adev->ip_blocks[i].version->funcs->late_fini) 2871 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2872 adev->ip_blocks[i].status.late_initialized = false; 2873 } 2874 2875 amdgpu_ras_fini(adev); 2876 2877 return 0; 2878 } 2879 2880 /** 2881 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2882 * 2883 * @work: work_struct. 2884 */ 2885 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2886 { 2887 struct amdgpu_device *adev = 2888 container_of(work, struct amdgpu_device, delayed_init_work.work); 2889 int r; 2890 2891 r = amdgpu_ib_ring_tests(adev); 2892 if (r) 2893 DRM_ERROR("ib ring test failed (%d).\n", r); 2894 } 2895 2896 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2897 { 2898 struct amdgpu_device *adev = 2899 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2900 2901 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2902 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2903 2904 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2905 adev->gfx.gfx_off_state = true; 2906 } 2907 2908 /** 2909 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2910 * 2911 * @adev: amdgpu_device pointer 2912 * 2913 * Main suspend function for hardware IPs. The list of all the hardware 2914 * IPs that make up the asic is walked, clockgating is disabled and the 2915 * suspend callbacks are run. suspend puts the hardware and software state 2916 * in each IP into a state suitable for suspend. 2917 * Returns 0 on success, negative error code on failure. 2918 */ 2919 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2920 { 2921 int i, r; 2922 2923 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2924 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2925 2926 /* 2927 * Per PMFW team's suggestion, driver needs to handle gfxoff 2928 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2929 * scenario. Add the missing df cstate disablement here. 
2930 */ 2931 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2932 dev_warn(adev->dev, "Failed to disallow df cstate"); 2933 2934 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2935 if (!adev->ip_blocks[i].status.valid) 2936 continue; 2937 2938 /* displays are handled separately */ 2939 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2940 continue; 2941 2942 /* XXX handle errors */ 2943 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2944 /* XXX handle errors */ 2945 if (r) { 2946 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2947 adev->ip_blocks[i].version->funcs->name, r); 2948 return r; 2949 } 2950 2951 adev->ip_blocks[i].status.hw = false; 2952 } 2953 2954 return 0; 2955 } 2956 2957 /** 2958 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2959 * 2960 * @adev: amdgpu_device pointer 2961 * 2962 * Main suspend function for hardware IPs. The list of all the hardware 2963 * IPs that make up the asic is walked, clockgating is disabled and the 2964 * suspend callbacks are run. suspend puts the hardware and software state 2965 * in each IP into a state suitable for suspend. 2966 * Returns 0 on success, negative error code on failure. 2967 */ 2968 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2969 { 2970 int i, r; 2971 2972 if (adev->in_s0ix) 2973 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2974 2975 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2976 if (!adev->ip_blocks[i].status.valid) 2977 continue; 2978 /* displays are handled in phase1 */ 2979 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2980 continue; 2981 /* PSP lost connection when err_event_athub occurs */ 2982 if (amdgpu_ras_intr_triggered() && 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2984 adev->ip_blocks[i].status.hw = false; 2985 continue; 2986 } 2987 2988 /* skip unnecessary suspend if we do not initialize them yet */ 2989 if (adev->gmc.xgmi.pending_reset && 2990 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2991 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2994 adev->ip_blocks[i].status.hw = false; 2995 continue; 2996 } 2997 2998 /* skip suspend of gfx/mes and psp for S0ix 2999 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3000 * like at runtime. PSP is also part of the always on hardware 3001 * so no need to suspend it. 3002 */ 3003 if (adev->in_s0ix && 3004 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3007 continue; 3008 3009 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3010 if (adev->in_s0ix && 3011 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3012 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3013 continue; 3014 3015 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3016 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3017 * from this location and RLC Autoload automatically also gets loaded 3018 * from here based on PMFW -> PSP message during re-init sequence. 3019 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3020 * the TMR and reload FWs again for IMU enabled APU ASICs. 
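 * (In other words: on IMU-equipped APUs the firmware images PSP placed in
 * the TMR stay valid across a GPU reset, so skipping the PSP suspend and
 * resume avoids tearing down the TMR and reloading them.)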
3021 */ 3022 if (amdgpu_in_reset(adev) && 3023 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3024 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3025 continue; 3026 3027 /* XXX handle errors */ 3028 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3029 /* XXX handle errors */ 3030 if (r) { 3031 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3032 adev->ip_blocks[i].version->funcs->name, r); 3033 } 3034 adev->ip_blocks[i].status.hw = false; 3035 /* handle putting the SMC in the appropriate state */ 3036 if (!amdgpu_sriov_vf(adev)) { 3037 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3038 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3039 if (r) { 3040 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3041 adev->mp1_state, r); 3042 return r; 3043 } 3044 } 3045 } 3046 } 3047 3048 return 0; 3049 } 3050 3051 /** 3052 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3053 * 3054 * @adev: amdgpu_device pointer 3055 * 3056 * Main suspend function for hardware IPs. The list of all the hardware 3057 * IPs that make up the asic is walked, clockgating is disabled and the 3058 * suspend callbacks are run. suspend puts the hardware and software state 3059 * in each IP into a state suitable for suspend. 3060 * Returns 0 on success, negative error code on failure. 3061 */ 3062 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3063 { 3064 int r; 3065 3066 if (amdgpu_sriov_vf(adev)) { 3067 amdgpu_virt_fini_data_exchange(adev); 3068 amdgpu_virt_request_full_gpu(adev, false); 3069 } 3070 3071 r = amdgpu_device_ip_suspend_phase1(adev); 3072 if (r) 3073 return r; 3074 r = amdgpu_device_ip_suspend_phase2(adev); 3075 3076 if (amdgpu_sriov_vf(adev)) 3077 amdgpu_virt_release_full_gpu(adev, false); 3078 3079 return r; 3080 } 3081 3082 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3083 { 3084 int i, r; 3085 3086 static enum amd_ip_block_type ip_order[] = { 3087 AMD_IP_BLOCK_TYPE_COMMON, 3088 AMD_IP_BLOCK_TYPE_GMC, 3089 AMD_IP_BLOCK_TYPE_PSP, 3090 AMD_IP_BLOCK_TYPE_IH, 3091 }; 3092 3093 for (i = 0; i < adev->num_ip_blocks; i++) { 3094 int j; 3095 struct amdgpu_ip_block *block; 3096 3097 block = &adev->ip_blocks[i]; 3098 block->status.hw = false; 3099 3100 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3101 3102 if (block->version->type != ip_order[j] || 3103 !block->status.valid) 3104 continue; 3105 3106 r = block->version->funcs->hw_init(adev); 3107 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3108 if (r) 3109 return r; 3110 block->status.hw = true; 3111 } 3112 } 3113 3114 return 0; 3115 } 3116 3117 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3118 { 3119 int i, r; 3120 3121 static enum amd_ip_block_type ip_order[] = { 3122 AMD_IP_BLOCK_TYPE_SMC, 3123 AMD_IP_BLOCK_TYPE_DCE, 3124 AMD_IP_BLOCK_TYPE_GFX, 3125 AMD_IP_BLOCK_TYPE_SDMA, 3126 AMD_IP_BLOCK_TYPE_MES, 3127 AMD_IP_BLOCK_TYPE_UVD, 3128 AMD_IP_BLOCK_TYPE_VCE, 3129 AMD_IP_BLOCK_TYPE_VCN, 3130 AMD_IP_BLOCK_TYPE_JPEG 3131 }; 3132 3133 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3134 int j; 3135 struct amdgpu_ip_block *block; 3136 3137 for (j = 0; j < adev->num_ip_blocks; j++) { 3138 block = &adev->ip_blocks[j]; 3139 3140 if (block->version->type != ip_order[i] || 3141 !block->status.valid || 3142 block->status.hw) 3143 continue; 3144 3145 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3146 r = block->version->funcs->resume(adev); 3147 else 3148 r = block->version->funcs->hw_init(adev); 3149 3150 
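			/*
			 * SMC is the only block resumed (rather than fully
			 * re-initialized) in this late SR-IOV re-init pass;
			 * every other block in ip_order gets a full hw_init.
			 */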
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3151 if (r) 3152 return r; 3153 block->status.hw = true; 3154 } 3155 } 3156 3157 return 0; 3158 } 3159 3160 /** 3161 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3162 * 3163 * @adev: amdgpu_device pointer 3164 * 3165 * First resume function for hardware IPs. The list of all the hardware 3166 * IPs that make up the asic is walked and the resume callbacks are run for 3167 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3168 * after a suspend and updates the software state as necessary. This 3169 * function is also used for restoring the GPU after a GPU reset. 3170 * Returns 0 on success, negative error code on failure. 3171 */ 3172 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3173 { 3174 int i, r; 3175 3176 for (i = 0; i < adev->num_ip_blocks; i++) { 3177 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3178 continue; 3179 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3180 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3181 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3182 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3183 3184 r = adev->ip_blocks[i].version->funcs->resume(adev); 3185 if (r) { 3186 DRM_ERROR("resume of IP block <%s> failed %d\n", 3187 adev->ip_blocks[i].version->funcs->name, r); 3188 return r; 3189 } 3190 adev->ip_blocks[i].status.hw = true; 3191 } 3192 } 3193 3194 return 0; 3195 } 3196 3197 /** 3198 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3199 * 3200 * @adev: amdgpu_device pointer 3201 * 3202 * First resume function for hardware IPs. The list of all the hardware 3203 * IPs that make up the asic is walked and the resume callbacks are run for 3204 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3205 * functional state after a suspend and updates the software state as 3206 * necessary. This function is also used for restoring the GPU after a GPU 3207 * reset. 3208 * Returns 0 on success, negative error code on failure. 3209 */ 3210 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3211 { 3212 int i, r; 3213 3214 for (i = 0; i < adev->num_ip_blocks; i++) { 3215 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3216 continue; 3217 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3218 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3221 continue; 3222 r = adev->ip_blocks[i].version->funcs->resume(adev); 3223 if (r) { 3224 DRM_ERROR("resume of IP block <%s> failed %d\n", 3225 adev->ip_blocks[i].version->funcs->name, r); 3226 return r; 3227 } 3228 adev->ip_blocks[i].status.hw = true; 3229 } 3230 3231 return 0; 3232 } 3233 3234 /** 3235 * amdgpu_device_ip_resume - run resume for hardware IPs 3236 * 3237 * @adev: amdgpu_device pointer 3238 * 3239 * Main resume function for hardware IPs. The hardware IPs 3240 * are split into two resume functions because they are 3241 * also used in recovering from a GPU reset and some additional 3242 * steps need to be take between them. In this case (S3/S4) they are 3243 * run sequentially. 3244 * Returns 0 on success, negative error code on failure. 
3245 */ 3246 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3247 { 3248 int r; 3249 3250 r = amdgpu_device_ip_resume_phase1(adev); 3251 if (r) 3252 return r; 3253 3254 r = amdgpu_device_fw_loading(adev); 3255 if (r) 3256 return r; 3257 3258 r = amdgpu_device_ip_resume_phase2(adev); 3259 3260 return r; 3261 } 3262 3263 /** 3264 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3265 * 3266 * @adev: amdgpu_device pointer 3267 * 3268 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3269 */ 3270 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3271 { 3272 if (amdgpu_sriov_vf(adev)) { 3273 if (adev->is_atom_fw) { 3274 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3275 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3276 } else { 3277 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3278 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3279 } 3280 3281 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3282 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3283 } 3284 } 3285 3286 /** 3287 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3288 * 3289 * @asic_type: AMD asic type 3290 * 3291 * Check if there is DC (new modesetting infrastructre) support for an asic. 3292 * returns true if DC has support, false if not. 3293 */ 3294 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3295 { 3296 switch (asic_type) { 3297 #ifdef CONFIG_DRM_AMDGPU_SI 3298 case CHIP_HAINAN: 3299 #endif 3300 case CHIP_TOPAZ: 3301 /* chips with no display hardware */ 3302 return false; 3303 #if defined(CONFIG_DRM_AMD_DC) 3304 case CHIP_TAHITI: 3305 case CHIP_PITCAIRN: 3306 case CHIP_VERDE: 3307 case CHIP_OLAND: 3308 /* 3309 * We have systems in the wild with these ASICs that require 3310 * LVDS and VGA support which is not supported with DC. 3311 * 3312 * Fallback to the non-DC driver here by default so as not to 3313 * cause regressions. 3314 */ 3315 #if defined(CONFIG_DRM_AMD_DC_SI) 3316 return amdgpu_dc > 0; 3317 #else 3318 return false; 3319 #endif 3320 case CHIP_BONAIRE: 3321 case CHIP_KAVERI: 3322 case CHIP_KABINI: 3323 case CHIP_MULLINS: 3324 /* 3325 * We have systems in the wild with these ASICs that require 3326 * VGA support which is not supported with DC. 3327 * 3328 * Fallback to the non-DC driver here by default so as not to 3329 * cause regressions. 
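 * (Passing a positive amdgpu_dc value, e.g. via the dc module parameter,
 * still opts these chips into DC explicitly.)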
3330 */ 3331 return amdgpu_dc > 0; 3332 default: 3333 return amdgpu_dc != 0; 3334 #else 3335 default: 3336 if (amdgpu_dc > 0) 3337 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3338 return false; 3339 #endif 3340 } 3341 } 3342 3343 /** 3344 * amdgpu_device_has_dc_support - check if dc is supported 3345 * 3346 * @adev: amdgpu_device pointer 3347 * 3348 * Returns true for supported, false for not supported 3349 */ 3350 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3351 { 3352 if (adev->enable_virtual_display || 3353 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3354 return false; 3355 3356 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3357 } 3358 3359 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3360 { 3361 struct amdgpu_device *adev = 3362 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3363 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3364 3365 /* It's a bug to not have a hive within this function */ 3366 if (WARN_ON(!hive)) 3367 return; 3368 3369 /* 3370 * Use task barrier to synchronize all xgmi reset works across the 3371 * hive. task_barrier_enter and task_barrier_exit will block 3372 * until all the threads running the xgmi reset works reach 3373 * those points. task_barrier_full will do both blocks. 3374 */ 3375 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3376 3377 task_barrier_enter(&hive->tb); 3378 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3379 3380 if (adev->asic_reset_res) 3381 goto fail; 3382 3383 task_barrier_exit(&hive->tb); 3384 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3385 3386 if (adev->asic_reset_res) 3387 goto fail; 3388 3389 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3390 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3391 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3392 } else { 3393 3394 task_barrier_full(&hive->tb); 3395 adev->asic_reset_res = amdgpu_asic_reset(adev); 3396 } 3397 3398 fail: 3399 if (adev->asic_reset_res) 3400 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3401 adev->asic_reset_res, adev_to_drm(adev)->unique); 3402 amdgpu_put_xgmi_hive(hive); 3403 } 3404 3405 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3406 { 3407 char *input = amdgpu_lockup_timeout; 3408 char *timeout_setting = NULL; 3409 int index = 0; 3410 long timeout; 3411 int ret = 0; 3412 3413 /* 3414 * By default timeout for non compute jobs is 10000 3415 * and 60000 for compute jobs. 3416 * In SR-IOV or passthrough mode, timeout for compute 3417 * jobs are 60000 by default. 3418 */ 3419 adev->gfx_timeout = msecs_to_jiffies(10000); 3420 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3421 if (amdgpu_sriov_vf(adev)) 3422 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3423 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3424 else 3425 adev->compute_timeout = msecs_to_jiffies(60000); 3426 3427 #ifdef notyet 3428 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3429 while ((timeout_setting = strsep(&input, ",")) && 3430 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3431 ret = kstrtol(timeout_setting, 0, &timeout); 3432 if (ret) 3433 return ret; 3434 3435 if (timeout == 0) { 3436 index++; 3437 continue; 3438 } else if (timeout < 0) { 3439 timeout = MAX_SCHEDULE_TIMEOUT; 3440 dev_warn(adev->dev, "lockup timeout disabled"); 3441 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3442 } else { 3443 timeout = msecs_to_jiffies(timeout); 3444 } 3445 3446 switch (index++) { 3447 case 0: 3448 adev->gfx_timeout = timeout; 3449 break; 3450 case 1: 3451 adev->compute_timeout = timeout; 3452 break; 3453 case 2: 3454 adev->sdma_timeout = timeout; 3455 break; 3456 case 3: 3457 adev->video_timeout = timeout; 3458 break; 3459 default: 3460 break; 3461 } 3462 } 3463 /* 3464 * There is only one value specified and 3465 * it should apply to all non-compute jobs. 3466 */ 3467 if (index == 1) { 3468 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3469 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3470 adev->compute_timeout = adev->gfx_timeout; 3471 } 3472 } 3473 #endif 3474 3475 return ret; 3476 } 3477 3478 /** 3479 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3480 * 3481 * @adev: amdgpu_device pointer 3482 * 3483 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3484 */ 3485 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3486 { 3487 #ifdef notyet 3488 struct iommu_domain *domain; 3489 3490 domain = iommu_get_domain_for_dev(adev->dev); 3491 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3492 #endif 3493 adev->ram_is_direct_mapped = true; 3494 } 3495 3496 static const struct attribute *amdgpu_dev_attributes[] = { 3497 &dev_attr_pcie_replay_count.attr, 3498 NULL 3499 }; 3500 3501 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3502 { 3503 if (amdgpu_mcbp == 1) 3504 adev->gfx.mcbp = true; 3505 else if (amdgpu_mcbp == 0) 3506 adev->gfx.mcbp = false; 3507 3508 if (amdgpu_sriov_vf(adev)) 3509 adev->gfx.mcbp = true; 3510 3511 if (adev->gfx.mcbp) 3512 DRM_INFO("MCBP is enabled\n"); 3513 } 3514 3515 /** 3516 * amdgpu_device_init - initialize the driver 3517 * 3518 * @adev: amdgpu_device pointer 3519 * @flags: driver flags 3520 * 3521 * Initializes the driver info and hw (all asics). 3522 * Returns 0 for success or an error on failure. 3523 * Called at driver startup. 
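 *
 * Very roughly, the sequence below is: set up register accessors, locks and
 * work items, map the MMIO BAR, create the per-device reset domain, run the
 * early/sw/hw IP init passes (posting the card and parsing the vBIOS along
 * the way), bring up the fence driver and mode config, and finish with the
 * sysfs and late-init plumbing.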
3524 */ 3525 int amdgpu_device_init(struct amdgpu_device *adev, 3526 uint32_t flags) 3527 { 3528 struct drm_device *ddev = adev_to_drm(adev); 3529 struct pci_dev *pdev = adev->pdev; 3530 int r, i; 3531 bool px = false; 3532 u32 max_MBps; 3533 int tmp; 3534 3535 adev->shutdown = false; 3536 adev->flags = flags; 3537 3538 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3539 adev->asic_type = amdgpu_force_asic_type; 3540 else 3541 adev->asic_type = flags & AMD_ASIC_MASK; 3542 3543 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3544 if (amdgpu_emu_mode == 1) 3545 adev->usec_timeout *= 10; 3546 adev->gmc.gart_size = 512 * 1024 * 1024; 3547 adev->accel_working = false; 3548 adev->num_rings = 0; 3549 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3550 adev->mman.buffer_funcs = NULL; 3551 adev->mman.buffer_funcs_ring = NULL; 3552 adev->vm_manager.vm_pte_funcs = NULL; 3553 adev->vm_manager.vm_pte_num_scheds = 0; 3554 adev->gmc.gmc_funcs = NULL; 3555 adev->harvest_ip_mask = 0x0; 3556 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3557 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3558 3559 adev->smc_rreg = &amdgpu_invalid_rreg; 3560 adev->smc_wreg = &amdgpu_invalid_wreg; 3561 adev->pcie_rreg = &amdgpu_invalid_rreg; 3562 adev->pcie_wreg = &amdgpu_invalid_wreg; 3563 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3564 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3565 adev->pciep_rreg = &amdgpu_invalid_rreg; 3566 adev->pciep_wreg = &amdgpu_invalid_wreg; 3567 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3568 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3569 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3570 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3571 adev->didt_rreg = &amdgpu_invalid_rreg; 3572 adev->didt_wreg = &amdgpu_invalid_wreg; 3573 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3574 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3575 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3576 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3577 3578 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3579 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3580 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3581 3582 /* mutex initialization are all done here so we 3583 * can recall function without having locking issues 3584 */ 3585 rw_init(&adev->firmware.mutex, "agfw"); 3586 rw_init(&adev->pm.mutex, "agpm"); 3587 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3588 rw_init(&adev->srbm_mutex, "srbm"); 3589 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3590 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3591 rw_init(&adev->gfx.partition_mutex, "gfxpar"); 3592 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3593 rw_init(&adev->mn_lock, "agpumn"); 3594 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3595 hash_init(adev->mn_hash); 3596 rw_init(&adev->psp.mutex, "agpsp"); 3597 rw_init(&adev->notifier_lock, "agnf"); 3598 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3599 rw_init(&adev->benchmark_mutex, "agbm"); 3600 3601 amdgpu_device_init_apu_flags(adev); 3602 3603 r = amdgpu_device_check_arguments(adev); 3604 if (r) 3605 return r; 3606 3607 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3608 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3609 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3610 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3611 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3612 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3613 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3614 
mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3615 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3616 3617 INIT_LIST_HEAD(&adev->shadow_list); 3618 rw_init(&adev->shadow_list_lock, "sdwlst"); 3619 3620 INIT_LIST_HEAD(&adev->reset_list); 3621 3622 INIT_LIST_HEAD(&adev->ras_list); 3623 3624 INIT_DELAYED_WORK(&adev->delayed_init_work, 3625 amdgpu_device_delayed_init_work_handler); 3626 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3627 amdgpu_device_delay_enable_gfx_off); 3628 3629 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3630 3631 adev->gfx.gfx_off_req_count = 1; 3632 adev->gfx.gfx_off_residency = 0; 3633 adev->gfx.gfx_off_entrycount = 0; 3634 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3635 3636 atomic_set(&adev->throttling_logging_enabled, 1); 3637 /* 3638 * If throttling continues, logging will be performed every minute 3639 * to avoid log flooding. "-1" is subtracted since the thermal 3640 * throttling interrupt comes every second. Thus, the total logging 3641 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3642 * for throttling interrupt) = 60 seconds. 3643 */ 3644 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3645 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3646 3647 #ifdef __linux__ 3648 /* Registers mapping */ 3649 /* TODO: block userspace mapping of io register */ 3650 if (adev->asic_type >= CHIP_BONAIRE) { 3651 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3652 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3653 } else { 3654 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3655 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3656 } 3657 #endif 3658 3659 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3660 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3661 3662 #ifdef __linux__ 3663 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3664 if (!adev->rmmio) 3665 return -ENOMEM; 3666 #endif 3667 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3668 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 3669 3670 /* 3671 * Reset domain needs to be present early, before XGMI hive discovered 3672 * (if any) and intitialized to use reset sem and in_gpu reset flag 3673 * early on during init and before calling to RREG32. 
3674 */ 3675 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3676 if (!adev->reset_domain) 3677 return -ENOMEM; 3678 3679 /* detect hw virtualization here */ 3680 amdgpu_detect_virtualization(adev); 3681 3682 amdgpu_device_get_pcie_info(adev); 3683 3684 r = amdgpu_device_get_job_timeout_settings(adev); 3685 if (r) { 3686 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3687 return r; 3688 } 3689 3690 /* early init functions */ 3691 r = amdgpu_device_ip_early_init(adev); 3692 if (r) 3693 return r; 3694 3695 amdgpu_device_set_mcbp(adev); 3696 3697 /* Get rid of things like offb */ 3698 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3699 if (r) 3700 return r; 3701 3702 /* Enable TMZ based on IP_VERSION */ 3703 amdgpu_gmc_tmz_set(adev); 3704 3705 amdgpu_gmc_noretry_set(adev); 3706 /* Need to get xgmi info early to decide the reset behavior*/ 3707 if (adev->gmc.xgmi.supported) { 3708 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3709 if (r) 3710 return r; 3711 } 3712 3713 /* enable PCIE atomic ops */ 3714 #ifdef notyet 3715 if (amdgpu_sriov_vf(adev)) { 3716 if (adev->virt.fw_reserve.p_pf2vf) 3717 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3718 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3719 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3720 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3721 * internal path natively support atomics, set have_atomics_support to true. 3722 */ 3723 } else if ((adev->flags & AMD_IS_APU) && 3724 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3725 adev->have_atomics_support = true; 3726 } else { 3727 adev->have_atomics_support = 3728 !pci_enable_atomic_ops_to_root(adev->pdev, 3729 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3730 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3731 } 3732 3733 if (!adev->have_atomics_support) 3734 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3735 #else 3736 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3737 * internal path natively support atomics, set have_atomics_support to true. 3738 */ 3739 if ((adev->flags & AMD_IS_APU) && 3740 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3741 adev->have_atomics_support = true; 3742 else 3743 adev->have_atomics_support = false; 3744 #endif 3745 3746 /* doorbell bar mapping and doorbell index init*/ 3747 amdgpu_doorbell_init(adev); 3748 3749 if (amdgpu_emu_mode == 1) { 3750 /* post the asic on emulation mode */ 3751 emu_soc_asic_init(adev); 3752 goto fence_driver_init; 3753 } 3754 3755 amdgpu_reset_init(adev); 3756 3757 /* detect if we are with an SRIOV vbios */ 3758 if (adev->bios) 3759 amdgpu_device_detect_sriov_bios(adev); 3760 3761 /* check if we need to reset the asic 3762 * E.g., driver was not cleanly unloaded previously, etc. 
3763 */ 3764 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3765 if (adev->gmc.xgmi.num_physical_nodes) { 3766 dev_info(adev->dev, "Pending hive reset.\n"); 3767 adev->gmc.xgmi.pending_reset = true; 3768 /* Only need to init necessary block for SMU to handle the reset */ 3769 for (i = 0; i < adev->num_ip_blocks; i++) { 3770 if (!adev->ip_blocks[i].status.valid) 3771 continue; 3772 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3774 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3775 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3776 DRM_DEBUG("IP %s disabled for hw_init.\n", 3777 adev->ip_blocks[i].version->funcs->name); 3778 adev->ip_blocks[i].status.hw = true; 3779 } 3780 } 3781 } else { 3782 tmp = amdgpu_reset_method; 3783 /* It should do a default reset when loading or reloading the driver, 3784 * regardless of the module parameter reset_method. 3785 */ 3786 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3787 r = amdgpu_asic_reset(adev); 3788 amdgpu_reset_method = tmp; 3789 if (r) { 3790 dev_err(adev->dev, "asic reset on init failed\n"); 3791 goto failed; 3792 } 3793 } 3794 } 3795 3796 /* Post card if necessary */ 3797 if (amdgpu_device_need_post(adev)) { 3798 if (!adev->bios) { 3799 dev_err(adev->dev, "no vBIOS found\n"); 3800 r = -EINVAL; 3801 goto failed; 3802 } 3803 DRM_INFO("GPU posting now...\n"); 3804 r = amdgpu_device_asic_init(adev); 3805 if (r) { 3806 dev_err(adev->dev, "gpu post error!\n"); 3807 goto failed; 3808 } 3809 } 3810 3811 if (adev->bios) { 3812 if (adev->is_atom_fw) { 3813 /* Initialize clocks */ 3814 r = amdgpu_atomfirmware_get_clock_info(adev); 3815 if (r) { 3816 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3817 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3818 goto failed; 3819 } 3820 } else { 3821 /* Initialize clocks */ 3822 r = amdgpu_atombios_get_clock_info(adev); 3823 if (r) { 3824 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3825 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3826 goto failed; 3827 } 3828 /* init i2c buses */ 3829 if (!amdgpu_device_has_dc_support(adev)) 3830 amdgpu_atombios_i2c_init(adev); 3831 } 3832 } 3833 3834 fence_driver_init: 3835 /* Fence driver */ 3836 r = amdgpu_fence_driver_sw_init(adev); 3837 if (r) { 3838 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3839 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3840 goto failed; 3841 } 3842 3843 /* init the mode config */ 3844 drm_mode_config_init(adev_to_drm(adev)); 3845 3846 r = amdgpu_device_ip_init(adev); 3847 if (r) { 3848 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3849 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3850 goto release_ras_con; 3851 } 3852 3853 amdgpu_fence_driver_hw_init(adev); 3854 3855 dev_info(adev->dev, 3856 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3857 adev->gfx.config.max_shader_engines, 3858 adev->gfx.config.max_sh_per_se, 3859 adev->gfx.config.max_cu_per_sh, 3860 adev->gfx.cu_info.number); 3861 3862 #ifdef __OpenBSD__ 3863 { 3864 const char *chip_name; 3865 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3866 int maj, min, rev; 3867 3868 switch (adev->asic_type) { 3869 case CHIP_RAVEN: 3870 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3871 chip_name = "RAVEN2"; 3872 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3873 chip_name = "PICASSO"; 3874 
else 3875 chip_name = "RAVEN"; 3876 break; 3877 case CHIP_RENOIR: 3878 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3879 chip_name = "RENOIR"; 3880 else 3881 chip_name = "GREEN_SARDINE"; 3882 break; 3883 default: 3884 chip_name = amdgpu_asic_name[adev->asic_type]; 3885 } 3886 3887 printf("%s: %s", adev->self.dv_xname, chip_name); 3888 /* show graphics/compute ip block version, not set on < GFX9 */ 3889 if (version) { 3890 maj = IP_VERSION_MAJ(version); 3891 min = IP_VERSION_MIN(version); 3892 rev = IP_VERSION_REV(version); 3893 printf(" GC %d.%d.%d", maj, min, rev); 3894 } 3895 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3896 } 3897 #endif 3898 3899 adev->accel_working = true; 3900 3901 amdgpu_vm_check_compute_bug(adev); 3902 3903 /* Initialize the buffer migration limit. */ 3904 if (amdgpu_moverate >= 0) 3905 max_MBps = amdgpu_moverate; 3906 else 3907 max_MBps = 8; /* Allow 8 MB/s. */ 3908 /* Get a log2 for easy divisions. */ 3909 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3910 3911 r = amdgpu_atombios_sysfs_init(adev); 3912 if (r) 3913 drm_err(&adev->ddev, 3914 "registering atombios sysfs failed (%d).\n", r); 3915 3916 r = amdgpu_pm_sysfs_init(adev); 3917 if (r) 3918 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3919 3920 r = amdgpu_ucode_sysfs_init(adev); 3921 if (r) { 3922 adev->ucode_sysfs_en = false; 3923 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3924 } else 3925 adev->ucode_sysfs_en = true; 3926 3927 /* 3928 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3929 * Otherwise the mgpu fan boost feature will be skipped due to the 3930 * gpu instance is counted less. 3931 */ 3932 amdgpu_register_gpu_instance(adev); 3933 3934 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3935 * explicit gating rather than handling it automatically. 3936 */ 3937 if (!adev->gmc.xgmi.pending_reset) { 3938 r = amdgpu_device_ip_late_init(adev); 3939 if (r) { 3940 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3941 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3942 goto release_ras_con; 3943 } 3944 /* must succeed. 
*/ 3945 amdgpu_ras_resume(adev); 3946 queue_delayed_work(system_wq, &adev->delayed_init_work, 3947 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3948 } 3949 3950 if (amdgpu_sriov_vf(adev)) { 3951 amdgpu_virt_release_full_gpu(adev, true); 3952 flush_delayed_work(&adev->delayed_init_work); 3953 } 3954 3955 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3956 if (r) 3957 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3958 3959 amdgpu_fru_sysfs_init(adev); 3960 3961 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 3962 r = amdgpu_pmu_init(adev); 3963 if (r) 3964 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3965 } 3966 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */ 3967 if (amdgpu_device_cache_pci_state(adev->pdev)) 3968 pci_restore_state(pdev); 3969 3970 /* if we have more than one VGA card, disable the amdgpu VGA resources */ 3971 /* this will fail for cards that aren't VGA class devices, just 3972 * ignore it 3973 */ 3974 #ifdef notyet 3975 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3976 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3977 #endif 3978 3979 px = amdgpu_device_supports_px(ddev); 3980 3981 if (px || (!dev_is_removable(&adev->pdev->dev) && 3982 apple_gmux_detect(NULL, NULL))) 3983 vga_switcheroo_register_client(adev->pdev, 3984 &amdgpu_switcheroo_ops, px); 3985 3986 if (px) 3987 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3988 3989 if (adev->gmc.xgmi.pending_reset) 3990 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3991 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3992 3993 amdgpu_device_check_iommu_direct_map(adev); 3994 3995 return 0; 3996 3997 release_ras_con: 3998 if (amdgpu_sriov_vf(adev)) 3999 amdgpu_virt_release_full_gpu(adev, true); 4000 4001 /* failed in exclusive mode due to timeout */ 4002 if (amdgpu_sriov_vf(adev) && 4003 !amdgpu_sriov_runtime(adev) && 4004 amdgpu_virt_mmio_blocked(adev) && 4005 !amdgpu_virt_wait_reset(adev)) { 4006 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4007 /* Don't send request since VF is inactive.
*/ 4008 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4009 adev->virt.ops = NULL; 4010 r = -EAGAIN; 4011 } 4012 amdgpu_release_ras_context(adev); 4013 4014 failed: 4015 amdgpu_vf_error_trans_all(adev); 4016 4017 return r; 4018 } 4019 4020 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4021 { 4022 STUB(); 4023 #ifdef notyet 4024 4025 /* Clear all CPU mappings pointing to this device */ 4026 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4027 #endif 4028 4029 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4030 amdgpu_doorbell_fini(adev); 4031 4032 #ifdef __linux__ 4033 iounmap(adev->rmmio); 4034 adev->rmmio = NULL; 4035 if (adev->mman.aper_base_kaddr) 4036 iounmap(adev->mman.aper_base_kaddr); 4037 adev->mman.aper_base_kaddr = NULL; 4038 #else 4039 if (adev->rmmio_size > 0) 4040 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4041 adev->rmmio_size); 4042 adev->rmmio_size = 0; 4043 adev->rmmio = NULL; 4044 if (adev->mman.aper_base_kaddr) 4045 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4046 adev->gmc.visible_vram_size); 4047 adev->mman.aper_base_kaddr = NULL; 4048 #endif 4049 4050 /* Memory manager related */ 4051 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4052 #ifdef __linux__ 4053 arch_phys_wc_del(adev->gmc.vram_mtrr); 4054 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4055 #else 4056 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4057 #endif 4058 } 4059 } 4060 4061 /** 4062 * amdgpu_device_fini_hw - tear down the driver 4063 * 4064 * @adev: amdgpu_device pointer 4065 * 4066 * Tear down the driver info (all asics). 4067 * Called at driver shutdown. 4068 */ 4069 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4070 { 4071 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4072 flush_delayed_work(&adev->delayed_init_work); 4073 adev->shutdown = true; 4074 4075 /* make sure IB test finished before entering exclusive mode 4076 * to avoid preemption on IB test 4077 */ 4078 if (amdgpu_sriov_vf(adev)) { 4079 amdgpu_virt_request_full_gpu(adev, false); 4080 amdgpu_virt_fini_data_exchange(adev); 4081 } 4082 4083 /* disable all interrupts */ 4084 amdgpu_irq_disable_all(adev); 4085 if (adev->mode_info.mode_config_initialized) { 4086 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4087 drm_helper_force_disable_all(adev_to_drm(adev)); 4088 else 4089 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4090 } 4091 amdgpu_fence_driver_hw_fini(adev); 4092 4093 if (adev->mman.initialized) 4094 drain_workqueue(adev->mman.bdev.wq); 4095 4096 if (adev->pm.sysfs_initialized) 4097 amdgpu_pm_sysfs_fini(adev); 4098 if (adev->ucode_sysfs_en) 4099 amdgpu_ucode_sysfs_fini(adev); 4100 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4101 amdgpu_fru_sysfs_fini(adev); 4102 4103 /* disable ras feature must before hw fini */ 4104 amdgpu_ras_pre_fini(adev); 4105 4106 amdgpu_device_ip_fini_early(adev); 4107 4108 amdgpu_irq_fini_hw(adev); 4109 4110 if (adev->mman.initialized) 4111 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4112 4113 amdgpu_gart_dummy_page_fini(adev); 4114 4115 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4116 amdgpu_device_unmap_mmio(adev); 4117 4118 } 4119 4120 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4121 { 4122 int idx; 4123 bool px; 4124 4125 amdgpu_fence_driver_sw_fini(adev); 4126 amdgpu_device_ip_fini(adev); 4127 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4128 adev->accel_working = false; 4129 
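/* No submissions can race with teardown at this point, so the cached gang submit fence can be dereferenced without RCU protection and its reference dropped. */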
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4130 4131 amdgpu_reset_fini(adev); 4132 4133 /* free i2c buses */ 4134 if (!amdgpu_device_has_dc_support(adev)) 4135 amdgpu_i2c_fini(adev); 4136 4137 if (amdgpu_emu_mode != 1) 4138 amdgpu_atombios_fini(adev); 4139 4140 kfree(adev->bios); 4141 adev->bios = NULL; 4142 4143 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4144 4145 if (px || (!dev_is_removable(&adev->pdev->dev) && 4146 apple_gmux_detect(NULL, NULL))) 4147 vga_switcheroo_unregister_client(adev->pdev); 4148 4149 if (px) 4150 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4151 4152 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4153 vga_client_unregister(adev->pdev); 4154 4155 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4156 #ifdef __linux__ 4157 iounmap(adev->rmmio); 4158 adev->rmmio = NULL; 4159 #else 4160 if (adev->rmmio_size > 0) 4161 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4162 adev->rmmio_size); 4163 adev->rmmio_size = 0; 4164 adev->rmmio = NULL; 4165 #endif 4166 amdgpu_doorbell_fini(adev); 4167 drm_dev_exit(idx); 4168 } 4169 4170 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4171 amdgpu_pmu_fini(adev); 4172 if (adev->mman.discovery_bin) 4173 amdgpu_discovery_fini(adev); 4174 4175 amdgpu_reset_put_reset_domain(adev->reset_domain); 4176 adev->reset_domain = NULL; 4177 4178 kfree(adev->pci_state); 4179 4180 } 4181 4182 /** 4183 * amdgpu_device_evict_resources - evict device resources 4184 * @adev: amdgpu device object 4185 * 4186 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4187 * of the vram memory type. Mainly used for evicting device resources 4188 * at suspend time. 4189 * 4190 */ 4191 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4192 { 4193 int ret; 4194 4195 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4196 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4197 return 0; 4198 4199 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4200 if (ret) 4201 DRM_WARN("evicting device resources failed\n"); 4202 return ret; 4203 } 4204 4205 /* 4206 * Suspend & resume. 4207 */ 4208 /** 4209 * amdgpu_device_suspend - initiate device suspend 4210 * 4211 * @dev: drm dev pointer 4212 * @fbcon : notify the fbdev of suspend 4213 * 4214 * Puts the hw in the suspend state (all asics). 4215 * Returns 0 for success or an error on failure. 4216 * Called at driver suspend. 
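* Note that device resources are evicted twice: once before the IP blocks are suspended and once after, so buffers that only become idle during suspend are also moved out of VRAM.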
4217 */ 4218 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4219 { 4220 struct amdgpu_device *adev = drm_to_adev(dev); 4221 int r = 0; 4222 4223 if (adev->shutdown) 4224 return 0; 4225 4226 #ifdef notyet 4227 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4228 return 0; 4229 #endif 4230 4231 adev->in_suspend = true; 4232 4233 /* Evict the majority of BOs before grabbing the full access */ 4234 r = amdgpu_device_evict_resources(adev); 4235 if (r) 4236 return r; 4237 4238 if (amdgpu_sriov_vf(adev)) { 4239 amdgpu_virt_fini_data_exchange(adev); 4240 r = amdgpu_virt_request_full_gpu(adev, false); 4241 if (r) 4242 return r; 4243 } 4244 4245 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4246 DRM_WARN("smart shift update failed\n"); 4247 4248 if (fbcon) 4249 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4250 4251 cancel_delayed_work_sync(&adev->delayed_init_work); 4252 4253 amdgpu_ras_suspend(adev); 4254 4255 amdgpu_device_ip_suspend_phase1(adev); 4256 4257 if (!adev->in_s0ix) 4258 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4259 4260 r = amdgpu_device_evict_resources(adev); 4261 if (r) 4262 return r; 4263 4264 amdgpu_fence_driver_hw_fini(adev); 4265 4266 amdgpu_device_ip_suspend_phase2(adev); 4267 4268 if (amdgpu_sriov_vf(adev)) 4269 amdgpu_virt_release_full_gpu(adev, false); 4270 4271 return 0; 4272 } 4273 4274 /** 4275 * amdgpu_device_resume - initiate device resume 4276 * 4277 * @dev: drm dev pointer 4278 * @fbcon : notify the fbdev of resume 4279 * 4280 * Bring the hw back to operating state (all asics). 4281 * Returns 0 for success or an error on failure. 4282 * Called at driver resume. 4283 */ 4284 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4285 { 4286 struct amdgpu_device *adev = drm_to_adev(dev); 4287 int r = 0; 4288 4289 if (amdgpu_sriov_vf(adev)) { 4290 r = amdgpu_virt_request_full_gpu(adev, true); 4291 if (r) 4292 return r; 4293 } 4294 4295 #ifdef notyet 4296 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4297 return 0; 4298 #endif 4299 4300 if (adev->in_s0ix) 4301 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4302 4303 /* post card */ 4304 if (amdgpu_device_need_post(adev)) { 4305 r = amdgpu_device_asic_init(adev); 4306 if (r) 4307 dev_err(adev->dev, "amdgpu asic init failed\n"); 4308 } 4309 4310 r = amdgpu_device_ip_resume(adev); 4311 4312 if (r) { 4313 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4314 goto exit; 4315 } 4316 amdgpu_fence_driver_hw_init(adev); 4317 4318 r = amdgpu_device_ip_late_init(adev); 4319 if (r) 4320 goto exit; 4321 4322 queue_delayed_work(system_wq, &adev->delayed_init_work, 4323 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4324 4325 if (!adev->in_s0ix) { 4326 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4327 if (r) 4328 goto exit; 4329 } 4330 4331 exit: 4332 if (amdgpu_sriov_vf(adev)) { 4333 amdgpu_virt_init_data_exchange(adev); 4334 amdgpu_virt_release_full_gpu(adev, true); 4335 } 4336 4337 if (r) 4338 return r; 4339 4340 /* Make sure IB tests flushed */ 4341 flush_delayed_work(&adev->delayed_init_work); 4342 4343 if (fbcon) 4344 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4345 4346 amdgpu_ras_resume(adev); 4347 4348 if (adev->mode_info.num_crtc) { 4349 /* 4350 * Most of the connector probing functions try to acquire runtime pm 4351 * refs to ensure that the GPU is powered on when connector polling is 4352 * performed. 
Since we're calling this from a runtime PM callback, 4353 * trying to acquire rpm refs will cause us to deadlock. 4354 * 4355 * Since we're guaranteed to be holding the rpm lock, it's safe to 4356 * temporarily disable the rpm helpers so this doesn't deadlock us. 4357 */ 4358 #if defined(CONFIG_PM) && defined(__linux__) 4359 dev->dev->power.disable_depth++; 4360 #endif 4361 if (!adev->dc_enabled) 4362 drm_helper_hpd_irq_event(dev); 4363 else 4364 drm_kms_helper_hotplug_event(dev); 4365 #if defined(CONFIG_PM) && defined(__linux__) 4366 dev->dev->power.disable_depth--; 4367 #endif 4368 } 4369 adev->in_suspend = false; 4370 4371 if (adev->enable_mes) 4372 amdgpu_mes_self_test(adev); 4373 4374 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4375 DRM_WARN("smart shift update failed\n"); 4376 4377 return 0; 4378 } 4379 4380 /** 4381 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4382 * 4383 * @adev: amdgpu_device pointer 4384 * 4385 * The list of all the hardware IPs that make up the asic is walked and 4386 * the check_soft_reset callbacks are run. check_soft_reset determines 4387 * if the asic is still hung or not. 4388 * Returns true if any of the IPs are still in a hung state, false if not. 4389 */ 4390 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4391 { 4392 int i; 4393 bool asic_hang = false; 4394 4395 if (amdgpu_sriov_vf(adev)) 4396 return true; 4397 4398 if (amdgpu_asic_need_full_reset(adev)) 4399 return true; 4400 4401 for (i = 0; i < adev->num_ip_blocks; i++) { 4402 if (!adev->ip_blocks[i].status.valid) 4403 continue; 4404 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4405 adev->ip_blocks[i].status.hang = 4406 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4407 if (adev->ip_blocks[i].status.hang) { 4408 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4409 asic_hang = true; 4410 } 4411 } 4412 return asic_hang; 4413 } 4414 4415 /** 4416 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4417 * 4418 * @adev: amdgpu_device pointer 4419 * 4420 * The list of all the hardware IPs that make up the asic is walked and the 4421 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4422 * handles any IP specific hardware or software state changes that are 4423 * necessary for a soft reset to succeed. 4424 * Returns 0 on success, negative error code on failure. 4425 */ 4426 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4427 { 4428 int i, r = 0; 4429 4430 for (i = 0; i < adev->num_ip_blocks; i++) { 4431 if (!adev->ip_blocks[i].status.valid) 4432 continue; 4433 if (adev->ip_blocks[i].status.hang && 4434 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4435 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4436 if (r) 4437 return r; 4438 } 4439 } 4440 4441 return 0; 4442 } 4443 4444 /** 4445 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4446 * 4447 * @adev: amdgpu_device pointer 4448 * 4449 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4450 * reset is necessary to recover. 4451 * Returns true if a full asic reset is required, false if not. 
4452 */ 4453 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4454 { 4455 int i; 4456 4457 if (amdgpu_asic_need_full_reset(adev)) 4458 return true; 4459 4460 for (i = 0; i < adev->num_ip_blocks; i++) { 4461 if (!adev->ip_blocks[i].status.valid) 4462 continue; 4463 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4464 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4465 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4466 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4467 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4468 if (adev->ip_blocks[i].status.hang) { 4469 dev_info(adev->dev, "Some block need full reset!\n"); 4470 return true; 4471 } 4472 } 4473 } 4474 return false; 4475 } 4476 4477 /** 4478 * amdgpu_device_ip_soft_reset - do a soft reset 4479 * 4480 * @adev: amdgpu_device pointer 4481 * 4482 * The list of all the hardware IPs that make up the asic is walked and the 4483 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4484 * IP specific hardware or software state changes that are necessary to soft 4485 * reset the IP. 4486 * Returns 0 on success, negative error code on failure. 4487 */ 4488 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4489 { 4490 int i, r = 0; 4491 4492 for (i = 0; i < adev->num_ip_blocks; i++) { 4493 if (!adev->ip_blocks[i].status.valid) 4494 continue; 4495 if (adev->ip_blocks[i].status.hang && 4496 adev->ip_blocks[i].version->funcs->soft_reset) { 4497 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4498 if (r) 4499 return r; 4500 } 4501 } 4502 4503 return 0; 4504 } 4505 4506 /** 4507 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4508 * 4509 * @adev: amdgpu_device pointer 4510 * 4511 * The list of all the hardware IPs that make up the asic is walked and the 4512 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4513 * handles any IP specific hardware or software state changes that are 4514 * necessary after the IP has been soft reset. 4515 * Returns 0 on success, negative error code on failure. 4516 */ 4517 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4518 { 4519 int i, r = 0; 4520 4521 for (i = 0; i < adev->num_ip_blocks; i++) { 4522 if (!adev->ip_blocks[i].status.valid) 4523 continue; 4524 if (adev->ip_blocks[i].status.hang && 4525 adev->ip_blocks[i].version->funcs->post_soft_reset) 4526 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4527 if (r) 4528 return r; 4529 } 4530 4531 return 0; 4532 } 4533 4534 /** 4535 * amdgpu_device_recover_vram - Recover some VRAM contents 4536 * 4537 * @adev: amdgpu_device pointer 4538 * 4539 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4540 * restore things like GPUVM page tables after a GPU reset where 4541 * the contents of VRAM might be lost. 4542 * 4543 * Returns: 4544 * 0 on success, negative error code on failure. 
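* Shadows that were evicted from GTT, or whose parent BO is no longer resident in VRAM, are skipped, since there is nothing valid to copy back.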
4545 */ 4546 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4547 { 4548 struct dma_fence *fence = NULL, *next = NULL; 4549 struct amdgpu_bo *shadow; 4550 struct amdgpu_bo_vm *vmbo; 4551 long r = 1, tmo; 4552 4553 if (amdgpu_sriov_runtime(adev)) 4554 tmo = msecs_to_jiffies(8000); 4555 else 4556 tmo = msecs_to_jiffies(100); 4557 4558 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4559 mutex_lock(&adev->shadow_list_lock); 4560 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4561 /* If vm is compute context or adev is APU, shadow will be NULL */ 4562 if (!vmbo->shadow) 4563 continue; 4564 shadow = vmbo->shadow; 4565 4566 /* No need to recover an evicted BO */ 4567 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4568 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4569 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4570 continue; 4571 4572 r = amdgpu_bo_restore_shadow(shadow, &next); 4573 if (r) 4574 break; 4575 4576 if (fence) { 4577 tmo = dma_fence_wait_timeout(fence, false, tmo); 4578 dma_fence_put(fence); 4579 fence = next; 4580 if (tmo == 0) { 4581 r = -ETIMEDOUT; 4582 break; 4583 } else if (tmo < 0) { 4584 r = tmo; 4585 break; 4586 } 4587 } else { 4588 fence = next; 4589 } 4590 } 4591 mutex_unlock(&adev->shadow_list_lock); 4592 4593 if (fence) 4594 tmo = dma_fence_wait_timeout(fence, false, tmo); 4595 dma_fence_put(fence); 4596 4597 if (r < 0 || tmo <= 0) { 4598 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4599 return -EIO; 4600 } 4601 4602 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4603 return 0; 4604 } 4605 4606 4607 /** 4608 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4609 * 4610 * @adev: amdgpu_device pointer 4611 * @from_hypervisor: request from hypervisor 4612 * 4613 * do VF FLR and reinitialize Asic 4614 * return 0 means succeeded otherwise failed 4615 */ 4616 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4617 bool from_hypervisor) 4618 { 4619 int r; 4620 struct amdgpu_hive_info *hive = NULL; 4621 int retry_limit = 0; 4622 4623 retry: 4624 amdgpu_amdkfd_pre_reset(adev); 4625 4626 if (from_hypervisor) 4627 r = amdgpu_virt_request_full_gpu(adev, true); 4628 else 4629 r = amdgpu_virt_reset_gpu(adev); 4630 if (r) 4631 return r; 4632 amdgpu_irq_gpu_reset_resume_helper(adev); 4633 4634 /* some sw clean up VF needs to do before recover */ 4635 amdgpu_virt_post_reset(adev); 4636 4637 /* Resume IP prior to SMC */ 4638 r = amdgpu_device_ip_reinit_early_sriov(adev); 4639 if (r) 4640 goto error; 4641 4642 amdgpu_virt_init_data_exchange(adev); 4643 4644 r = amdgpu_device_fw_loading(adev); 4645 if (r) 4646 return r; 4647 4648 /* now we are okay to resume SMC/CP/SDMA */ 4649 r = amdgpu_device_ip_reinit_late_sriov(adev); 4650 if (r) 4651 goto error; 4652 4653 hive = amdgpu_get_xgmi_hive(adev); 4654 /* Update PSP FW topology after reset */ 4655 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4656 r = amdgpu_xgmi_update_topology(hive, adev); 4657 4658 if (hive) 4659 amdgpu_put_xgmi_hive(hive); 4660 4661 if (!r) { 4662 r = amdgpu_ib_ring_tests(adev); 4663 4664 amdgpu_amdkfd_post_reset(adev); 4665 } 4666 4667 error: 4668 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4669 amdgpu_inc_vram_lost(adev); 4670 r = amdgpu_device_recover_vram(adev); 4671 } 4672 amdgpu_virt_release_full_gpu(adev, true); 4673 4674 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4675 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4676 retry_limit++; 4677 goto 
retry; 4678 } else 4679 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4680 } 4681 4682 return r; 4683 } 4684 4685 /** 4686 * amdgpu_device_has_job_running - check if there is any job in mirror list 4687 * 4688 * @adev: amdgpu_device pointer 4689 * 4690 * check if there is any job in mirror list 4691 */ 4692 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4693 { 4694 int i; 4695 struct drm_sched_job *job; 4696 4697 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4698 struct amdgpu_ring *ring = adev->rings[i]; 4699 4700 if (!ring || !ring->sched.thread) 4701 continue; 4702 4703 spin_lock(&ring->sched.job_list_lock); 4704 job = list_first_entry_or_null(&ring->sched.pending_list, 4705 struct drm_sched_job, list); 4706 spin_unlock(&ring->sched.job_list_lock); 4707 if (job) 4708 return true; 4709 } 4710 return false; 4711 } 4712 4713 /** 4714 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4715 * 4716 * @adev: amdgpu_device pointer 4717 * 4718 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4719 * a hung GPU. 4720 */ 4721 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4722 { 4723 4724 if (amdgpu_gpu_recovery == 0) 4725 goto disabled; 4726 4727 /* Skip soft reset check in fatal error mode */ 4728 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4729 return true; 4730 4731 if (amdgpu_sriov_vf(adev)) 4732 return true; 4733 4734 if (amdgpu_gpu_recovery == -1) { 4735 switch (adev->asic_type) { 4736 #ifdef CONFIG_DRM_AMDGPU_SI 4737 case CHIP_VERDE: 4738 case CHIP_TAHITI: 4739 case CHIP_PITCAIRN: 4740 case CHIP_OLAND: 4741 case CHIP_HAINAN: 4742 #endif 4743 #ifdef CONFIG_DRM_AMDGPU_CIK 4744 case CHIP_KAVERI: 4745 case CHIP_KABINI: 4746 case CHIP_MULLINS: 4747 #endif 4748 case CHIP_CARRIZO: 4749 case CHIP_STONEY: 4750 case CHIP_CYAN_SKILLFISH: 4751 goto disabled; 4752 default: 4753 break; 4754 } 4755 } 4756 4757 return true; 4758 4759 disabled: 4760 dev_info(adev->dev, "GPU recovery disabled.\n"); 4761 return false; 4762 } 4763 4764 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4765 { 4766 u32 i; 4767 int ret = 0; 4768 4769 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4770 4771 dev_info(adev->dev, "GPU mode1 reset\n"); 4772 4773 /* disable BM */ 4774 pci_clear_master(adev->pdev); 4775 4776 amdgpu_device_cache_pci_state(adev->pdev); 4777 4778 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4779 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4780 ret = amdgpu_dpm_mode1_reset(adev); 4781 } else { 4782 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4783 ret = psp_gpu_reset(adev); 4784 } 4785 4786 if (ret) 4787 goto mode1_reset_failed; 4788 4789 amdgpu_device_load_pci_state(adev->pdev); 4790 ret = amdgpu_psp_wait_for_bootloader(adev); 4791 if (ret) 4792 goto mode1_reset_failed; 4793 4794 /* wait for asic to come out of reset */ 4795 for (i = 0; i < adev->usec_timeout; i++) { 4796 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4797 4798 if (memsize != 0xffffffff) 4799 break; 4800 udelay(1); 4801 } 4802 4803 if (i >= adev->usec_timeout) { 4804 ret = -ETIMEDOUT; 4805 goto mode1_reset_failed; 4806 } 4807 4808 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4809 4810 return 0; 4811 4812 mode1_reset_failed: 4813 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4814 return ret; 4815 } 4816 4817 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4818 struct amdgpu_reset_context *reset_context) 4819 { 4820 int i, r = 0; 4821 struct amdgpu_job *job = NULL; 4822 bool need_full_reset = 
4823 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4824 4825 if (reset_context->reset_req_dev == adev) 4826 job = reset_context->job; 4827 4828 if (amdgpu_sriov_vf(adev)) { 4829 /* stop the data exchange thread */ 4830 amdgpu_virt_fini_data_exchange(adev); 4831 } 4832 4833 amdgpu_fence_driver_isr_toggle(adev, true); 4834 4835 /* block all schedulers and reset given job's ring */ 4836 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4837 struct amdgpu_ring *ring = adev->rings[i]; 4838 4839 if (!ring || !ring->sched.thread) 4840 continue; 4841 4842 /* Clear job fence from fence drv to avoid force_completion 4843 * leave NULL and vm flush fence in fence drv 4844 */ 4845 amdgpu_fence_driver_clear_job_fences(ring); 4846 4847 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4848 amdgpu_fence_driver_force_completion(ring); 4849 } 4850 4851 amdgpu_fence_driver_isr_toggle(adev, false); 4852 4853 if (job && job->vm) 4854 drm_sched_increase_karma(&job->base); 4855 4856 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4857 /* If reset handler not implemented, continue; otherwise return */ 4858 if (r == -EOPNOTSUPP) 4859 r = 0; 4860 else 4861 return r; 4862 4863 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4864 if (!amdgpu_sriov_vf(adev)) { 4865 4866 if (!need_full_reset) 4867 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4868 4869 if (!need_full_reset && amdgpu_gpu_recovery && 4870 amdgpu_device_ip_check_soft_reset(adev)) { 4871 amdgpu_device_ip_pre_soft_reset(adev); 4872 r = amdgpu_device_ip_soft_reset(adev); 4873 amdgpu_device_ip_post_soft_reset(adev); 4874 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4875 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4876 need_full_reset = true; 4877 } 4878 } 4879 4880 if (need_full_reset) 4881 r = amdgpu_device_ip_suspend(adev); 4882 if (need_full_reset) 4883 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4884 else 4885 clear_bit(AMDGPU_NEED_FULL_RESET, 4886 &reset_context->flags); 4887 } 4888 4889 return r; 4890 } 4891 4892 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4893 { 4894 int i; 4895 4896 lockdep_assert_held(&adev->reset_domain->sem); 4897 4898 for (i = 0; i < adev->num_regs; i++) { 4899 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4900 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4901 adev->reset_dump_reg_value[i]); 4902 } 4903 4904 return 0; 4905 } 4906 4907 #ifdef CONFIG_DEV_COREDUMP 4908 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4909 size_t count, void *data, size_t datalen) 4910 { 4911 struct drm_printer p; 4912 struct amdgpu_device *adev = data; 4913 struct drm_print_iterator iter; 4914 int i; 4915 4916 iter.data = buffer; 4917 iter.offset = 0; 4918 iter.start = offset; 4919 iter.remain = count; 4920 4921 p = drm_coredump_printer(&iter); 4922 4923 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4924 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4925 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4926 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4927 if (adev->reset_task_info.pid) 4928 drm_printf(&p, "process_name: %s PID: %d\n", 4929 adev->reset_task_info.process_name, 4930 adev->reset_task_info.pid); 4931 4932 if (adev->reset_vram_lost) 4933 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4934 if (adev->num_regs) { 4935 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4936 4937 
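/* Emit one line per captured register: the offset from the reset dump list and the value latched by amdgpu_reset_reg_dumps() at reset time. */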
for (i = 0; i < adev->num_regs; i++) 4938 drm_printf(&p, "0x%08x: 0x%08x\n", 4939 adev->reset_dump_reg_list[i], 4940 adev->reset_dump_reg_value[i]); 4941 } 4942 4943 return count - iter.remain; 4944 } 4945 4946 static void amdgpu_devcoredump_free(void *data) 4947 { 4948 } 4949 4950 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4951 { 4952 struct drm_device *dev = adev_to_drm(adev); 4953 4954 ktime_get_ts64(&adev->reset_time); 4955 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 4956 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4957 } 4958 #endif 4959 4960 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4961 struct amdgpu_reset_context *reset_context) 4962 { 4963 struct amdgpu_device *tmp_adev = NULL; 4964 bool need_full_reset, skip_hw_reset, vram_lost = false; 4965 int r = 0; 4966 bool gpu_reset_for_dev_remove = 0; 4967 4968 /* Try reset handler method first */ 4969 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4970 reset_list); 4971 amdgpu_reset_reg_dumps(tmp_adev); 4972 4973 reset_context->reset_device_list = device_list_handle; 4974 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4975 /* If reset handler not implemented, continue; otherwise return */ 4976 if (r == -EOPNOTSUPP) 4977 r = 0; 4978 else 4979 return r; 4980 4981 /* Reset handler not implemented, use the default method */ 4982 need_full_reset = 4983 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4984 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4985 4986 gpu_reset_for_dev_remove = 4987 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4988 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4989 4990 /* 4991 * ASIC reset has to be done on all XGMI hive nodes ASAP 4992 * to allow proper links negotiation in FW (within 1 sec) 4993 */ 4994 if (!skip_hw_reset && need_full_reset) { 4995 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4996 /* For XGMI run all resets in parallel to speed up the process */ 4997 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4998 tmp_adev->gmc.xgmi.pending_reset = false; 4999 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5000 r = -EALREADY; 5001 } else 5002 r = amdgpu_asic_reset(tmp_adev); 5003 5004 if (r) { 5005 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5006 r, adev_to_drm(tmp_adev)->unique); 5007 break; 5008 } 5009 } 5010 5011 /* For XGMI wait for all resets to complete before proceed */ 5012 if (!r) { 5013 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5014 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5015 flush_work(&tmp_adev->xgmi_reset_work); 5016 r = tmp_adev->asic_reset_res; 5017 if (r) 5018 break; 5019 } 5020 } 5021 } 5022 } 5023 5024 if (!r && amdgpu_ras_intr_triggered()) { 5025 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5026 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5027 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5028 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5029 } 5030 5031 amdgpu_ras_intr_cleared(); 5032 } 5033 5034 /* Since the mode1 reset affects base ip blocks, the 5035 * phase1 ip blocks need to be resumed. Otherwise there 5036 * will be a BIOS signature error and the psp bootloader 5037 * can't load kdb on the next amdgpu install. 
5038 */ 5039 if (gpu_reset_for_dev_remove) { 5040 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5041 amdgpu_device_ip_resume_phase1(tmp_adev); 5042 5043 goto end; 5044 } 5045 5046 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5047 if (need_full_reset) { 5048 /* post card */ 5049 r = amdgpu_device_asic_init(tmp_adev); 5050 if (r) { 5051 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5052 } else { 5053 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5054 5055 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5056 if (r) 5057 goto out; 5058 5059 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5060 #ifdef CONFIG_DEV_COREDUMP 5061 tmp_adev->reset_vram_lost = vram_lost; 5062 memset(&tmp_adev->reset_task_info, 0, 5063 sizeof(tmp_adev->reset_task_info)); 5064 if (reset_context->job && reset_context->job->vm) 5065 tmp_adev->reset_task_info = 5066 reset_context->job->vm->task_info; 5067 amdgpu_reset_capture_coredumpm(tmp_adev); 5068 #endif 5069 if (vram_lost) { 5070 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5071 amdgpu_inc_vram_lost(tmp_adev); 5072 } 5073 5074 r = amdgpu_device_fw_loading(tmp_adev); 5075 if (r) 5076 return r; 5077 5078 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5079 if (r) 5080 goto out; 5081 5082 if (vram_lost) 5083 amdgpu_device_fill_reset_magic(tmp_adev); 5084 5085 /* 5086 * Add this ASIC as tracked as reset was already 5087 * complete successfully. 5088 */ 5089 amdgpu_register_gpu_instance(tmp_adev); 5090 5091 if (!reset_context->hive && 5092 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5093 amdgpu_xgmi_add_device(tmp_adev); 5094 5095 r = amdgpu_device_ip_late_init(tmp_adev); 5096 if (r) 5097 goto out; 5098 5099 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5100 5101 /* 5102 * The GPU enters bad state once faulty pages 5103 * by ECC has reached the threshold, and ras 5104 * recovery is scheduled next. So add one check 5105 * here to break recovery if it indeed exceeds 5106 * bad page threshold, and remind user to 5107 * retire this GPU or setting one bigger 5108 * bad_page_threshold value to fix this once 5109 * probing driver again. 5110 */ 5111 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5112 /* must succeed. 
*/ 5113 amdgpu_ras_resume(tmp_adev); 5114 } else { 5115 r = -EINVAL; 5116 goto out; 5117 } 5118 5119 /* Update PSP FW topology after reset */ 5120 if (reset_context->hive && 5121 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5122 r = amdgpu_xgmi_update_topology( 5123 reset_context->hive, tmp_adev); 5124 } 5125 } 5126 5127 out: 5128 if (!r) { 5129 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5130 r = amdgpu_ib_ring_tests(tmp_adev); 5131 if (r) { 5132 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5133 need_full_reset = true; 5134 r = -EAGAIN; 5135 goto end; 5136 } 5137 } 5138 5139 if (!r) 5140 r = amdgpu_device_recover_vram(tmp_adev); 5141 else 5142 tmp_adev->asic_reset_res = r; 5143 } 5144 5145 end: 5146 if (need_full_reset) 5147 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5148 else 5149 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5150 return r; 5151 } 5152 5153 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5154 { 5155 5156 switch (amdgpu_asic_reset_method(adev)) { 5157 case AMD_RESET_METHOD_MODE1: 5158 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5159 break; 5160 case AMD_RESET_METHOD_MODE2: 5161 adev->mp1_state = PP_MP1_STATE_RESET; 5162 break; 5163 default: 5164 adev->mp1_state = PP_MP1_STATE_NONE; 5165 break; 5166 } 5167 5168 5169 } 5170 5171 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5172 { 5173 amdgpu_vf_error_trans_all(adev); 5174 adev->mp1_state = PP_MP1_STATE_NONE; 5175 } 5176 5177 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5178 { 5179 STUB(); 5180 #ifdef notyet 5181 struct pci_dev *p = NULL; 5182 5183 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5184 adev->pdev->bus->number, 1); 5185 if (p) { 5186 pm_runtime_enable(&(p->dev)); 5187 pm_runtime_resume(&(p->dev)); 5188 } 5189 #endif 5190 } 5191 5192 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5193 { 5194 enum amd_reset_method reset_method; 5195 struct pci_dev *p = NULL; 5196 u64 expires; 5197 5198 /* 5199 * For now, only BACO and mode1 reset are confirmed 5200 * to suffer the audio issue when the audio device is not properly suspended. 5201 */ 5202 reset_method = amdgpu_asic_reset_method(adev); 5203 if ((reset_method != AMD_RESET_METHOD_BACO) && 5204 (reset_method != AMD_RESET_METHOD_MODE1)) 5205 return -EINVAL; 5206 5207 STUB(); 5208 return -ENOSYS; 5209 #ifdef notyet 5210 5211 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5212 adev->pdev->bus->number, 1); 5213 if (!p) 5214 return -ENODEV; 5215 5216 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5217 if (!expires) 5218 /* 5219 * If we cannot get the audio device autosuspend delay, 5220 * a fixed 4S interval will be used. Since 3S is 5221 * the audio controller's default autosuspend delay setting, 5222 * the 4S used here is guaranteed to cover it. 5223 */ 5224 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5225 5226 while (!pm_runtime_status_suspended(&(p->dev))) { 5227 if (!pm_runtime_suspend(&(p->dev))) 5228 break; 5229 5230 if (expires < ktime_get_mono_fast_ns()) { 5231 dev_warn(adev->dev, "failed to suspend display audio\n"); 5232 pci_dev_put(p); 5233 /* TODO: abort the succeeding gpu reset?
*/ 5234 return -ETIMEDOUT; 5235 } 5236 } 5237 5238 pm_runtime_disable(&(p->dev)); 5239 5240 pci_dev_put(p); 5241 return 0; 5242 #endif 5243 } 5244 5245 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5246 { 5247 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5248 5249 #if defined(CONFIG_DEBUG_FS) 5250 if (!amdgpu_sriov_vf(adev)) 5251 cancel_work(&adev->reset_work); 5252 #endif 5253 5254 if (adev->kfd.dev) 5255 cancel_work(&adev->kfd.reset_work); 5256 5257 if (amdgpu_sriov_vf(adev)) 5258 cancel_work(&adev->virt.flr_work); 5259 5260 if (con && adev->ras_enabled) 5261 cancel_work(&con->recovery_work); 5262 5263 } 5264 5265 /** 5266 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5267 * 5268 * @adev: amdgpu_device pointer 5269 * @job: which job trigger hang 5270 * @reset_context: amdgpu reset context pointer 5271 * 5272 * Attempt to reset the GPU if it has hung (all asics). 5273 * Attempt to do soft-reset or full-reset and reinitialize Asic 5274 * Returns 0 for success or an error on failure. 5275 */ 5276 5277 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5278 struct amdgpu_job *job, 5279 struct amdgpu_reset_context *reset_context) 5280 { 5281 struct list_head device_list, *device_list_handle = NULL; 5282 bool job_signaled = false; 5283 struct amdgpu_hive_info *hive = NULL; 5284 struct amdgpu_device *tmp_adev = NULL; 5285 int i, r = 0; 5286 bool need_emergency_restart = false; 5287 bool audio_suspended = false; 5288 bool gpu_reset_for_dev_remove = false; 5289 5290 gpu_reset_for_dev_remove = 5291 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5292 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5293 5294 /* 5295 * Special case: RAS triggered and full reset isn't supported 5296 */ 5297 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5298 5299 /* 5300 * Flush RAM to disk so that after reboot 5301 * the user can read log and see why the system rebooted. 5302 */ 5303 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5304 amdgpu_ras_get_context(adev)->reboot) { 5305 DRM_WARN("Emergency reboot."); 5306 5307 #ifdef notyet 5308 ksys_sync_helper(); 5309 emergency_restart(); 5310 #else 5311 panic("emergency_restart"); 5312 #endif 5313 } 5314 5315 dev_info(adev->dev, "GPU %s begin!\n", 5316 need_emergency_restart ? "jobs stop":"reset"); 5317 5318 if (!amdgpu_sriov_vf(adev)) 5319 hive = amdgpu_get_xgmi_hive(adev); 5320 if (hive) 5321 mutex_lock(&hive->hive_lock); 5322 5323 reset_context->job = job; 5324 reset_context->hive = hive; 5325 /* 5326 * Build list of devices to reset. 5327 * In case we are in XGMI hive mode, resort the device list 5328 * to put adev in the 1st position. 
5329 */ 5330 INIT_LIST_HEAD(&device_list); 5331 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5332 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5333 list_add_tail(&tmp_adev->reset_list, &device_list); 5334 if (gpu_reset_for_dev_remove && adev->shutdown) 5335 tmp_adev->shutdown = true; 5336 } 5337 if (!list_is_first(&adev->reset_list, &device_list)) 5338 list_rotate_to_front(&adev->reset_list, &device_list); 5339 device_list_handle = &device_list; 5340 } else { 5341 list_add_tail(&adev->reset_list, &device_list); 5342 device_list_handle = &device_list; 5343 } 5344 5345 /* We need to lock reset domain only once both for XGMI and single device */ 5346 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5347 reset_list); 5348 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5349 5350 /* block all schedulers and reset given job's ring */ 5351 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5352 5353 amdgpu_device_set_mp1_state(tmp_adev); 5354 5355 /* 5356 * Try to put the audio codec into suspend state 5357 * before gpu reset started. 5358 * 5359 * The power domain of the graphics device 5360 * is shared with the AZ power domain. Without this, 5361 * we may change the audio hardware from behind 5362 * the audio driver's back. That will trigger 5363 * some audio codec errors. 5364 */ 5365 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5366 audio_suspended = true; 5367 5368 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5369 5370 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5371 5372 if (!amdgpu_sriov_vf(tmp_adev)) 5373 amdgpu_amdkfd_pre_reset(tmp_adev); 5374 5375 /* 5376 * Mark these ASICs, which are about to be reset, as untracked first 5377 * and add them back after the reset completes. 5378 */ 5379 amdgpu_unregister_gpu_instance(tmp_adev); 5380 5381 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5382 5383 /* disable ras on ALL IPs */ 5384 if (!need_emergency_restart && 5385 amdgpu_device_ip_need_full_reset(tmp_adev)) 5386 amdgpu_ras_suspend(tmp_adev); 5387 5388 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5389 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5390 5391 if (!ring || !ring->sched.thread) 5392 continue; 5393 5394 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5395 5396 if (need_emergency_restart) 5397 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5398 } 5399 atomic_inc(&tmp_adev->gpu_reset_counter); 5400 } 5401 5402 if (need_emergency_restart) 5403 goto skip_sched_resume; 5404 5405 /* 5406 * Must check guilty signal here since after this point all old 5407 * HW fences are force signaled. 5408 * 5409 * job->base holds a reference to parent fence 5410 */ 5411 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5412 job_signaled = true; 5413 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5414 goto skip_hw_reset; 5415 } 5416 5417 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5418 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5419 if (gpu_reset_for_dev_remove) { 5420 /* Workaround for ASICs that need to disable SMC first */ 5421 amdgpu_device_smu_fini_early(tmp_adev); 5422 } 5423 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5424 /* TODO: should we stop? */ 5425 if (r) { 5426 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5427 r, adev_to_drm(tmp_adev)->unique); 5428 tmp_adev->asic_reset_res = r; 5429 } 5430 5431 /* 5432 * Drop all pending non scheduler resets.
Scheduler resets 5433 * were already dropped during drm_sched_stop 5434 */ 5435 amdgpu_device_stop_pending_resets(tmp_adev); 5436 } 5437 5438 /* Actual ASIC resets if needed. */ 5439 /* Host driver will handle XGMI hive reset for SRIOV */ 5440 if (amdgpu_sriov_vf(adev)) { 5441 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5442 if (r) 5443 adev->asic_reset_res = r; 5444 5445 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so RAS needs to be resumed during reset */ 5446 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5447 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5448 amdgpu_ras_resume(adev); 5449 } else { 5450 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5451 if (r && r == -EAGAIN) 5452 goto retry; 5453 5454 if (!r && gpu_reset_for_dev_remove) 5455 goto recover_end; 5456 } 5457 5458 skip_hw_reset: 5459 5460 /* Post ASIC reset for all devs. */ 5461 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5462 5463 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5464 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5465 5466 if (!ring || !ring->sched.thread) 5467 continue; 5468 5469 drm_sched_start(&ring->sched, true); 5470 } 5471 5472 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5473 amdgpu_mes_self_test(tmp_adev); 5474 5475 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5476 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5477 5478 if (tmp_adev->asic_reset_res) 5479 r = tmp_adev->asic_reset_res; 5480 5481 tmp_adev->asic_reset_res = 0; 5482 5483 if (r) { 5484 /* bad news, how to tell it to userspace ? */ 5485 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5486 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5487 } else { 5488 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5489 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5490 DRM_WARN("smart shift update failed\n"); 5491 } 5492 } 5493 5494 skip_sched_resume: 5495 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5496 /* unlock kfd: SRIOV would do it separately */ 5497 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5498 amdgpu_amdkfd_post_reset(tmp_adev); 5499 5500 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5501 * so bring up kfd here if it has not been initialized before 5502 */ 5503 if (!adev->kfd.init_complete) 5504 amdgpu_amdkfd_device_init(adev); 5505 5506 if (audio_suspended) 5507 amdgpu_device_resume_display_audio(tmp_adev); 5508 5509 amdgpu_device_unset_mp1_state(tmp_adev); 5510 5511 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5512 } 5513 5514 recover_end: 5515 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5516 reset_list); 5517 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5518 5519 if (hive) { 5520 mutex_unlock(&hive->hive_lock); 5521 amdgpu_put_xgmi_hive(hive); 5522 } 5523 5524 if (r) 5525 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5526 5527 atomic_set(&adev->reset_domain->reset_res, r); 5528 return r; 5529 } 5530 5531 /** 5532 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5533 * 5534 * @adev: amdgpu_device pointer 5535 * 5536 * Fetches and stores in the driver the PCIE capabilities (gen speed 5537 * and lanes) of the slot the device is in. Handles APUs and 5538 * virtualized environments where PCIE config space may not be available.
5539 */ 5540 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5541 { 5542 struct pci_dev *pdev; 5543 enum pci_bus_speed speed_cap, platform_speed_cap; 5544 enum pcie_link_width platform_link_width; 5545 5546 if (amdgpu_pcie_gen_cap) 5547 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5548 5549 if (amdgpu_pcie_lane_cap) 5550 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5551 5552 /* covers APUs as well */ 5553 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5554 if (adev->pm.pcie_gen_mask == 0) 5555 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5556 if (adev->pm.pcie_mlw_mask == 0) 5557 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5558 return; 5559 } 5560 5561 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5562 return; 5563 5564 pcie_bandwidth_available(adev->pdev, NULL, 5565 &platform_speed_cap, &platform_link_width); 5566 5567 if (adev->pm.pcie_gen_mask == 0) { 5568 /* asic caps */ 5569 pdev = adev->pdev; 5570 speed_cap = pcie_get_speed_cap(pdev); 5571 if (speed_cap == PCI_SPEED_UNKNOWN) { 5572 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5573 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5574 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5575 } else { 5576 if (speed_cap == PCIE_SPEED_32_0GT) 5577 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5578 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5579 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5580 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5581 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5582 else if (speed_cap == PCIE_SPEED_16_0GT) 5583 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5584 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5585 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5586 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5587 else if (speed_cap == PCIE_SPEED_8_0GT) 5588 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5589 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5590 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5591 else if (speed_cap == PCIE_SPEED_5_0GT) 5592 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5593 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5594 else 5595 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5596 } 5597 /* platform caps */ 5598 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5599 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5600 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5601 } else { 5602 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5603 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5604 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5605 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5606 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5607 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5608 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5609 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5610 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5611 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5612 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5613 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5614 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5615 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5616 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5617 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5618 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5619 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5620 else 5621 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5622 5623 } 5624 } 5625 if (adev->pm.pcie_mlw_mask == 0) { 5626 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5627 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5628 } else { 5629 switch (platform_link_width) { 5630 case PCIE_LNK_X32: 5631 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5632 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5633 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5634 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5635 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5636 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5637 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5638 break; 5639 case PCIE_LNK_X16: 5640 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5641 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5642 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5643 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5644 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5645 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5646 break; 5647 case PCIE_LNK_X12: 5648 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5649 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5650 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5651 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5652 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5653 break; 5654 case PCIE_LNK_X8: 5655 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5656 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5657 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5658 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5659 break; 5660 case PCIE_LNK_X4: 5661 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5662 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5663 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5664 break; 5665 case PCIE_LNK_X2: 5666 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5667 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5668 break; 5669 case PCIE_LNK_X1: 5670 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5671 break; 5672 default: 5673 break; 5674 } 5675 } 5676 } 5677 } 5678 5679 /** 5680 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5681 * 5682 * @adev: amdgpu_device pointer 5683 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5684 * 5685 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5686 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5687 * @peer_adev. 5688 */ 5689 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5690 struct amdgpu_device *peer_adev) 5691 { 5692 #ifdef CONFIG_HSA_AMD_P2P 5693 uint64_t address_mask = peer_adev->dev->dma_mask ? 

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
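
/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Disables the doorbell interrupt when RAS is enabled and asks the DPM
 * code to put the ASIC into the BACO state.
 *
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code propagated from the DPM layer.
 */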
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}
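
/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Asks the DPM code to bring the ASIC out of the BACO state, re-enables
 * the doorbell interrupt when RAS is enabled and clears any pending
 * doorbell interrupt when running under passthrough.
 *
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code propagated from the DPM layer.
 */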
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore the PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
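
/*
 * The four callbacks above follow the kernel's PCI error-recovery flow.
 * A minimal sketch of how they are typically wired up, assuming a handler
 * table named amdgpu_pci_err_handler (the actual registration lives in the
 * PCI driver definition, not in this file):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * The struct pci_driver then points its .err_handler at this table.
 */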

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs;
 * compared to a simple hang, the system stays stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and see
 * what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through
 *    page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
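
/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register to read
 *
 * Reads a register in the PCIe port space through the NBIO index/data
 * pair: the byte address (@reg * 4) is written to the index register and
 * the value is then read back from the data register, all under
 * pcie_idx_lock.  The companion amdgpu_device_pcie_port_wreg() uses the
 * same protocol for writes.
 *
 * Returns the register value.
 */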
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
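
/*
 * Illustrative sketch only: a read-modify-write of a PCIe port register
 * using the two helpers above (the register offset and bit name are made
 * up for the example):
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, some_port_reg);
 *	tmp |= SOME_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, some_port_reg, tmp);
 *
 * Each call takes pcie_idx_lock internally, so the sequence as a whole is
 * not atomic with respect to other users of the index/data pair.
 */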

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
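
/*
 * Illustrative sketch only: polling a hypothetical status register with
 * amdgpu_device_wait_on_rreg() until its READY bit is set, giving up after
 * adev->usec_timeout microseconds without a change in the register value
 * (the register and mask names below are made up for the example):
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, mmSOME_STATUS, "SOME_STATUS",
 *				       SOME_STATUS__READY_MASK,
 *				       SOME_STATUS__READY_MASK))
 *		... handle the timeout ...
 */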