1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 39 #include <drm/drm_aperture.h> 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_probe_helper.h> 42 #include <drm/amdgpu_drm.h> 43 #include <linux/vgaarb.h> 44 #include <linux/vga_switcheroo.h> 45 #include <linux/efi.h> 46 #include "amdgpu.h" 47 #include "amdgpu_trace.h" 48 #include "amdgpu_i2c.h" 49 #include "atom.h" 50 #include "amdgpu_atombios.h" 51 #include "amdgpu_atomfirmware.h" 52 #include "amd_pcie.h" 53 #ifdef CONFIG_DRM_AMDGPU_SI 54 #include "si.h" 55 #endif 56 #ifdef CONFIG_DRM_AMDGPU_CIK 57 #include "cik.h" 58 #endif 59 #include "vi.h" 60 #include "soc15.h" 61 #include "nv.h" 62 #include "bif/bif_4_1_d.h" 63 #include <linux/firmware.h> 64 #include "amdgpu_vf_error.h" 65 66 #include "amdgpu_amdkfd.h" 67 #include "amdgpu_pm.h" 68 69 #include "amdgpu_xgmi.h" 70 #include "amdgpu_ras.h" 71 #include "amdgpu_pmu.h" 72 #include "amdgpu_fru_eeprom.h" 73 #include "amdgpu_reset.h" 74 75 #include <linux/suspend.h> 76 #include <drm/task_barrier.h> 77 #include <linux/pm_runtime.h> 78 79 #include <drm/drm_drv.h> 80 81 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 82 #include <asm/intel-family.h> 83 #endif 84 85 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 92 93 #define AMDGPU_RESUME_MS 2000 94 #define AMDGPU_MAX_RETRY_LIMIT 2 95 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 96 97 static const struct drm_driver amdgpu_kms_driver; 98 99 const char *amdgpu_asic_name[] = { 100 "TAHITI", 101 "PITCAIRN", 102 "VERDE", 103 "OLAND", 104 "HAINAN", 105 "BONAIRE", 106 "KAVERI", 107 "KABINI", 108 "HAWAII", 109 "MULLINS", 110 "TOPAZ", 111 "TONGA", 112 "FIJI", 113 
"CARRIZO", 114 "STONEY", 115 "POLARIS10", 116 "POLARIS11", 117 "POLARIS12", 118 "VEGAM", 119 "VEGA10", 120 "VEGA12", 121 "VEGA20", 122 "RAVEN", 123 "ARCTURUS", 124 "RENOIR", 125 "ALDEBARAN", 126 "NAVI10", 127 "CYAN_SKILLFISH", 128 "NAVI14", 129 "NAVI12", 130 "SIENNA_CICHLID", 131 "NAVY_FLOUNDER", 132 "VANGOGH", 133 "DIMGREY_CAVEFISH", 134 "BEIGE_GOBY", 135 "YELLOW_CARP", 136 "IP DISCOVERY", 137 "LAST", 138 }; 139 140 /** 141 * DOC: pcie_replay_count 142 * 143 * The amdgpu driver provides a sysfs API for reporting the total number 144 * of PCIe replays (NAKs) 145 * The file pcie_replay_count is used for this and returns the total 146 * number of replays as a sum of the NAKs generated and NAKs received 147 */ 148 149 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 150 struct device_attribute *attr, char *buf) 151 { 152 struct drm_device *ddev = dev_get_drvdata(dev); 153 struct amdgpu_device *adev = drm_to_adev(ddev); 154 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 155 156 return sysfs_emit(buf, "%llu\n", cnt); 157 } 158 159 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 160 amdgpu_device_get_pcie_replay_count, NULL); 161 162 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 163 164 /** 165 * DOC: product_name 166 * 167 * The amdgpu driver provides a sysfs API for reporting the product name 168 * for the device 169 * The file serial_number is used for this and returns the product name 170 * as returned from the FRU. 171 * NOTE: This is only available for certain server cards 172 */ 173 174 static ssize_t amdgpu_device_get_product_name(struct device *dev, 175 struct device_attribute *attr, char *buf) 176 { 177 struct drm_device *ddev = dev_get_drvdata(dev); 178 struct amdgpu_device *adev = drm_to_adev(ddev); 179 180 return sysfs_emit(buf, "%s\n", adev->product_name); 181 } 182 183 static DEVICE_ATTR(product_name, S_IRUGO, 184 amdgpu_device_get_product_name, NULL); 185 186 /** 187 * DOC: product_number 188 * 189 * The amdgpu driver provides a sysfs API for reporting the part number 190 * for the device 191 * The file serial_number is used for this and returns the part number 192 * as returned from the FRU. 193 * NOTE: This is only available for certain server cards 194 */ 195 196 static ssize_t amdgpu_device_get_product_number(struct device *dev, 197 struct device_attribute *attr, char *buf) 198 { 199 struct drm_device *ddev = dev_get_drvdata(dev); 200 struct amdgpu_device *adev = drm_to_adev(ddev); 201 202 return sysfs_emit(buf, "%s\n", adev->product_number); 203 } 204 205 static DEVICE_ATTR(product_number, S_IRUGO, 206 amdgpu_device_get_product_number, NULL); 207 208 /** 209 * DOC: serial_number 210 * 211 * The amdgpu driver provides a sysfs API for reporting the serial number 212 * for the device 213 * The file serial_number is used for this and returns the serial number 214 * as returned from the FRU. 
215 * NOTE: This is only available for certain server cards 216 */ 217 218 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 219 struct device_attribute *attr, char *buf) 220 { 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 224 return sysfs_emit(buf, "%s\n", adev->serial); 225 } 226 227 static DEVICE_ATTR(serial_number, S_IRUGO, 228 amdgpu_device_get_serial_number, NULL); 229 230 /** 231 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 232 * 233 * @dev: drm_device pointer 234 * 235 * Returns true if the device is a dGPU with ATPX power control, 236 * otherwise return false. 237 */ 238 bool amdgpu_device_supports_px(struct drm_device *dev) 239 { 240 struct amdgpu_device *adev = drm_to_adev(dev); 241 242 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 243 return true; 244 return false; 245 } 246 247 /** 248 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 249 * 250 * @dev: drm_device pointer 251 * 252 * Returns true if the device is a dGPU with ACPI power control, 253 * otherwise return false. 254 */ 255 bool amdgpu_device_supports_boco(struct drm_device *dev) 256 { 257 struct amdgpu_device *adev = drm_to_adev(dev); 258 259 if (adev->has_pr3 || 260 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 261 return true; 262 return false; 263 } 264 265 /** 266 * amdgpu_device_supports_baco - Does the device support BACO 267 * 268 * @dev: drm_device pointer 269 * 270 * Returns true if the device supporte BACO, 271 * otherwise return false. 272 */ 273 bool amdgpu_device_supports_baco(struct drm_device *dev) 274 { 275 struct amdgpu_device *adev = drm_to_adev(dev); 276 277 return amdgpu_asic_supports_baco(adev); 278 } 279 280 /** 281 * amdgpu_device_supports_smart_shift - Is the device dGPU with 282 * smart shift support 283 * 284 * @dev: drm_device pointer 285 * 286 * Returns true if the device is a dGPU with Smart Shift support, 287 * otherwise returns false. 
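 *
 * Minimal illustrative check (a sketch, not code from this file):
 *
 *   bool ss = amdgpu_device_supports_smart_shift(adev_to_drm(adev));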
288 */ 289 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 290 { 291 return (amdgpu_device_supports_boco(dev) && 292 amdgpu_acpi_is_power_shift_control_supported()); 293 } 294 295 /* 296 * VRAM access helper functions 297 */ 298 299 /** 300 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 301 * 302 * @adev: amdgpu_device pointer 303 * @pos: offset of the buffer in vram 304 * @buf: virtual address of the buffer in system memory 305 * @size: read/write size, sizeof(@buf) must > @size 306 * @write: true - write to vram, otherwise - read from vram 307 */ 308 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 309 void *buf, size_t size, bool write) 310 { 311 unsigned long flags; 312 uint32_t hi = ~0, tmp = 0; 313 uint32_t *data = buf; 314 uint64_t last; 315 int idx; 316 317 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 318 return; 319 320 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 321 322 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 323 for (last = pos + size; pos < last; pos += 4) { 324 tmp = pos >> 31; 325 326 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 327 if (tmp != hi) { 328 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 329 hi = tmp; 330 } 331 if (write) 332 WREG32_NO_KIQ(mmMM_DATA, *data++); 333 else 334 *data++ = RREG32_NO_KIQ(mmMM_DATA); 335 } 336 337 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 338 drm_dev_exit(idx); 339 } 340 341 /** 342 * amdgpu_device_aper_access - access vram by vram aperature 343 * 344 * @adev: amdgpu_device pointer 345 * @pos: offset of the buffer in vram 346 * @buf: virtual address of the buffer in system memory 347 * @size: read/write size, sizeof(@buf) must > @size 348 * @write: true - write to vram, otherwise - read from vram 349 * 350 * The return value means how many bytes have been transferred. 351 */ 352 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 353 void *buf, size_t size, bool write) 354 { 355 #ifdef CONFIG_64BIT 356 void __iomem *addr; 357 size_t count = 0; 358 uint64_t last; 359 360 if (!adev->mman.aper_base_kaddr) 361 return 0; 362 363 last = min(pos + size, adev->gmc.visible_vram_size); 364 if (last > pos) { 365 addr = adev->mman.aper_base_kaddr + pos; 366 count = last - pos; 367 368 if (write) { 369 memcpy_toio(addr, buf, count); 370 mb(); 371 amdgpu_device_flush_hdp(adev, NULL); 372 } else { 373 amdgpu_device_invalidate_hdp(adev, NULL); 374 mb(); 375 memcpy_fromio(buf, addr, count); 376 } 377 378 } 379 380 return count; 381 #else 382 return 0; 383 #endif 384 } 385 386 /** 387 * amdgpu_device_vram_access - read/write a buffer in vram 388 * 389 * @adev: amdgpu_device pointer 390 * @pos: offset of the buffer in vram 391 * @buf: virtual address of the buffer in system memory 392 * @size: read/write size, sizeof(@buf) must > @size 393 * @write: true - write to vram, otherwise - read from vram 394 */ 395 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 396 void *buf, size_t size, bool write) 397 { 398 size_t count; 399 400 /* try to using vram apreature to access vram first */ 401 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 402 size -= count; 403 if (size) { 404 /* using MM to access rest vram */ 405 pos += count; 406 buf += count; 407 amdgpu_device_mm_access(adev, pos, buf, size, write); 408 } 409 } 410 411 /* 412 * register access helper functions. 
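 *
 * For reference, an illustrative use of the VRAM access helpers defined
 * above (a sketch only; "vram_pos" is a hypothetical dword-aligned
 * offset in VRAM):
 *
 *   u32 data[4] = {};
 *   amdgpu_device_vram_access(adev, vram_pos, data, sizeof(data), false);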
413 */ 414 415 /* Check if hw access should be skipped because of hotplug or device error */ 416 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 417 { 418 if (adev->no_hw_access) 419 return true; 420 421 #ifdef CONFIG_LOCKDEP 422 /* 423 * This is a bit complicated to understand, so worth a comment. What we assert 424 * here is that the GPU reset is not running on another thread in parallel. 425 * 426 * For this we trylock the read side of the reset semaphore, if that succeeds 427 * we know that the reset is not running in paralell. 428 * 429 * If the trylock fails we assert that we are either already holding the read 430 * side of the lock or are the reset thread itself and hold the write side of 431 * the lock. 432 */ 433 if (in_task()) { 434 if (down_read_trylock(&adev->reset_domain->sem)) 435 up_read(&adev->reset_domain->sem); 436 else 437 lockdep_assert_held(&adev->reset_domain->sem); 438 } 439 #endif 440 return false; 441 } 442 443 /** 444 * amdgpu_device_rreg - read a memory mapped IO or indirect register 445 * 446 * @adev: amdgpu_device pointer 447 * @reg: dword aligned register offset 448 * @acc_flags: access flags which require special behavior 449 * 450 * Returns the 32 bit value from the offset specified. 451 */ 452 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 453 uint32_t reg, uint32_t acc_flags) 454 { 455 uint32_t ret; 456 457 if (amdgpu_device_skip_hw_access(adev)) 458 return 0; 459 460 if ((reg * 4) < adev->rmmio_size) { 461 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 462 amdgpu_sriov_runtime(adev) && 463 down_read_trylock(&adev->reset_domain->sem)) { 464 ret = amdgpu_kiq_rreg(adev, reg); 465 up_read(&adev->reset_domain->sem); 466 } else { 467 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 468 } 469 } else { 470 ret = adev->pcie_rreg(adev, reg * 4); 471 } 472 473 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 474 475 return ret; 476 } 477 478 /* 479 * MMIO register read with bytes helper functions 480 * @offset:bytes offset from MMIO start 481 * 482 */ 483 484 /** 485 * amdgpu_mm_rreg8 - read a memory mapped IO register 486 * 487 * @adev: amdgpu_device pointer 488 * @offset: byte aligned register offset 489 * 490 * Returns the 8 bit value from the offset specified. 491 */ 492 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 493 { 494 if (amdgpu_device_skip_hw_access(adev)) 495 return 0; 496 497 if (offset < adev->rmmio_size) 498 return (readb(adev->rmmio + offset)); 499 BUG(); 500 } 501 502 /* 503 * MMIO register write with bytes helper functions 504 * @offset:bytes offset from MMIO start 505 * @value: the value want to be written to the register 506 * 507 */ 508 /** 509 * amdgpu_mm_wreg8 - read a memory mapped IO register 510 * 511 * @adev: amdgpu_device pointer 512 * @offset: byte aligned register offset 513 * @value: 8 bit value to write 514 * 515 * Writes the value specified to the offset specified. 516 */ 517 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 518 { 519 if (amdgpu_device_skip_hw_access(adev)) 520 return; 521 522 if (offset < adev->rmmio_size) 523 writeb(value, adev->rmmio + offset); 524 else 525 BUG(); 526 } 527 528 /** 529 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 530 * 531 * @adev: amdgpu_device pointer 532 * @reg: dword aligned register offset 533 * @v: 32 bit value to write to the register 534 * @acc_flags: access flags which require special behavior 535 * 536 * Writes the value specified to the offset specified. 
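 *
 * Illustrative read-modify-write sketch pairing this helper with
 * amdgpu_device_rreg() above (the register offset and bit mask are
 * hypothetical):
 *
 *   u32 tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *   tmp |= SOME_ENABLE_BIT;
 *   amdgpu_device_wreg(adev, reg_offset, tmp, 0);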
537 */ 538 void amdgpu_device_wreg(struct amdgpu_device *adev, 539 uint32_t reg, uint32_t v, 540 uint32_t acc_flags) 541 { 542 if (amdgpu_device_skip_hw_access(adev)) 543 return; 544 545 if ((reg * 4) < adev->rmmio_size) { 546 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 547 amdgpu_sriov_runtime(adev) && 548 down_read_trylock(&adev->reset_domain->sem)) { 549 amdgpu_kiq_wreg(adev, reg, v); 550 up_read(&adev->reset_domain->sem); 551 } else { 552 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 553 } 554 } else { 555 adev->pcie_wreg(adev, reg * 4, v); 556 } 557 558 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 559 } 560 561 /** 562 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 563 * 564 * @adev: amdgpu_device pointer 565 * @reg: mmio/rlc register 566 * @v: value to write 567 * 568 * this function is invoked only for the debugfs register access 569 */ 570 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 571 uint32_t reg, uint32_t v) 572 { 573 if (amdgpu_device_skip_hw_access(adev)) 574 return; 575 576 if (amdgpu_sriov_fullaccess(adev) && 577 adev->gfx.rlc.funcs && 578 adev->gfx.rlc.funcs->is_rlcg_access_range) { 579 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 580 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 581 } else if ((reg * 4) >= adev->rmmio_size) { 582 adev->pcie_wreg(adev, reg * 4, v); 583 } else { 584 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 585 } 586 } 587 588 /** 589 * amdgpu_mm_rdoorbell - read a doorbell dword 590 * 591 * @adev: amdgpu_device pointer 592 * @index: doorbell index 593 * 594 * Returns the value in the doorbell aperture at the 595 * requested doorbell index (CIK). 596 */ 597 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 598 { 599 if (amdgpu_device_skip_hw_access(adev)) 600 return 0; 601 602 if (index < adev->doorbell.num_doorbells) { 603 return readl(adev->doorbell.ptr + index); 604 } else { 605 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 606 return 0; 607 } 608 } 609 610 /** 611 * amdgpu_mm_wdoorbell - write a doorbell dword 612 * 613 * @adev: amdgpu_device pointer 614 * @index: doorbell index 615 * @v: value to write 616 * 617 * Writes @v to the doorbell aperture at the 618 * requested doorbell index (CIK). 619 */ 620 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 621 { 622 if (amdgpu_device_skip_hw_access(adev)) 623 return; 624 625 if (index < adev->doorbell.num_doorbells) { 626 writel(v, adev->doorbell.ptr + index); 627 } else { 628 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 629 } 630 } 631 632 /** 633 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 634 * 635 * @adev: amdgpu_device pointer 636 * @index: doorbell index 637 * 638 * Returns the value in the doorbell aperture at the 639 * requested doorbell index (VEGA10+). 640 */ 641 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 642 { 643 if (amdgpu_device_skip_hw_access(adev)) 644 return 0; 645 646 if (index < adev->doorbell.num_doorbells) { 647 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 648 } else { 649 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 650 return 0; 651 } 652 } 653 654 /** 655 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 656 * 657 * @adev: amdgpu_device pointer 658 * @index: doorbell index 659 * @v: value to write 660 * 661 * Writes @v to the doorbell aperture at the 662 * requested doorbell index (VEGA10+). 
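 *
 * Illustrative sketch (the ring fields shown are assumptions, not taken
 * from this file):
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);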
663 */ 664 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 665 { 666 if (amdgpu_device_skip_hw_access(adev)) 667 return; 668 669 if (index < adev->doorbell.num_doorbells) { 670 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 671 } else { 672 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 673 } 674 } 675 676 /** 677 * amdgpu_device_indirect_rreg - read an indirect register 678 * 679 * @adev: amdgpu_device pointer 680 * @pcie_index: mmio register offset 681 * @pcie_data: mmio register offset 682 * @reg_addr: indirect register address to read from 683 * 684 * Returns the value of indirect register @reg_addr 685 */ 686 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 687 u32 pcie_index, u32 pcie_data, 688 u32 reg_addr) 689 { 690 unsigned long flags; 691 u32 r; 692 void __iomem *pcie_index_offset; 693 void __iomem *pcie_data_offset; 694 695 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 696 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 697 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 698 699 writel(reg_addr, pcie_index_offset); 700 readl(pcie_index_offset); 701 r = readl(pcie_data_offset); 702 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 703 704 return r; 705 } 706 707 /** 708 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 709 * 710 * @adev: amdgpu_device pointer 711 * @pcie_index: mmio register offset 712 * @pcie_data: mmio register offset 713 * @reg_addr: indirect register address to read from 714 * 715 * Returns the value of indirect register @reg_addr 716 */ 717 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 718 u32 pcie_index, u32 pcie_data, 719 u32 reg_addr) 720 { 721 unsigned long flags; 722 u64 r; 723 void __iomem *pcie_index_offset; 724 void __iomem *pcie_data_offset; 725 726 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 727 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 728 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 729 730 /* read low 32 bits */ 731 writel(reg_addr, pcie_index_offset); 732 readl(pcie_index_offset); 733 r = readl(pcie_data_offset); 734 /* read high 32 bits */ 735 writel(reg_addr + 4, pcie_index_offset); 736 readl(pcie_index_offset); 737 r |= ((u64)readl(pcie_data_offset) << 32); 738 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 739 740 return r; 741 } 742 743 /** 744 * amdgpu_device_indirect_wreg - write an indirect register address 745 * 746 * @adev: amdgpu_device pointer 747 * @pcie_index: mmio register offset 748 * @pcie_data: mmio register offset 749 * @reg_addr: indirect register offset 750 * @reg_data: indirect register data 751 * 752 */ 753 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 754 u32 pcie_index, u32 pcie_data, 755 u32 reg_addr, u32 reg_data) 756 { 757 unsigned long flags; 758 void __iomem *pcie_index_offset; 759 void __iomem *pcie_data_offset; 760 761 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 762 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 763 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 764 765 writel(reg_addr, pcie_index_offset); 766 readl(pcie_index_offset); 767 writel(reg_data, pcie_data_offset); 768 readl(pcie_data_offset); 769 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 770 } 771 772 /** 773 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 774 * 775 * @adev: amdgpu_device pointer 776 * @pcie_index: mmio register offset 777 * @pcie_data: mmio register 
offset 778 * @reg_addr: indirect register offset 779 * @reg_data: indirect register data 780 * 781 */ 782 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 783 u32 pcie_index, u32 pcie_data, 784 u32 reg_addr, u64 reg_data) 785 { 786 unsigned long flags; 787 void __iomem *pcie_index_offset; 788 void __iomem *pcie_data_offset; 789 790 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 791 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 792 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 793 794 /* write low 32 bits */ 795 writel(reg_addr, pcie_index_offset); 796 readl(pcie_index_offset); 797 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 798 readl(pcie_data_offset); 799 /* write high 32 bits */ 800 writel(reg_addr + 4, pcie_index_offset); 801 readl(pcie_index_offset); 802 writel((u32)(reg_data >> 32), pcie_data_offset); 803 readl(pcie_data_offset); 804 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 805 } 806 807 /** 808 * amdgpu_invalid_rreg - dummy reg read function 809 * 810 * @adev: amdgpu_device pointer 811 * @reg: offset of register 812 * 813 * Dummy register read function. Used for register blocks 814 * that certain asics don't have (all asics). 815 * Returns the value in the register. 816 */ 817 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 818 { 819 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 820 BUG(); 821 return 0; 822 } 823 824 /** 825 * amdgpu_invalid_wreg - dummy reg write function 826 * 827 * @adev: amdgpu_device pointer 828 * @reg: offset of register 829 * @v: value to write to the register 830 * 831 * Dummy register read function. Used for register blocks 832 * that certain asics don't have (all asics). 833 */ 834 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 835 { 836 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 837 reg, v); 838 BUG(); 839 } 840 841 /** 842 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 843 * 844 * @adev: amdgpu_device pointer 845 * @reg: offset of register 846 * 847 * Dummy register read function. Used for register blocks 848 * that certain asics don't have (all asics). 849 * Returns the value in the register. 850 */ 851 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 852 { 853 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 854 BUG(); 855 return 0; 856 } 857 858 /** 859 * amdgpu_invalid_wreg64 - dummy reg write function 860 * 861 * @adev: amdgpu_device pointer 862 * @reg: offset of register 863 * @v: value to write to the register 864 * 865 * Dummy register read function. Used for register blocks 866 * that certain asics don't have (all asics). 867 */ 868 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 869 { 870 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 871 reg, v); 872 BUG(); 873 } 874 875 /** 876 * amdgpu_block_invalid_rreg - dummy reg read function 877 * 878 * @adev: amdgpu_device pointer 879 * @block: offset of instance 880 * @reg: offset of register 881 * 882 * Dummy register read function. Used for register blocks 883 * that certain asics don't have (all asics). 884 * Returns the value in the register. 
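 *
 * For contrast, the functional indirect access path is provided by
 * amdgpu_device_indirect_rreg64()/amdgpu_device_indirect_wreg64() above;
 * an illustrative sketch, with hypothetical index/data offsets:
 *
 *   u64 v = amdgpu_device_indirect_rreg64(adev, pcie_index, pcie_data, reg);
 *   amdgpu_device_indirect_wreg64(adev, pcie_index, pcie_data, reg, v | bit);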
885 */ 886 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 887 uint32_t block, uint32_t reg) 888 { 889 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 890 reg, block); 891 BUG(); 892 return 0; 893 } 894 895 /** 896 * amdgpu_block_invalid_wreg - dummy reg write function 897 * 898 * @adev: amdgpu_device pointer 899 * @block: offset of instance 900 * @reg: offset of register 901 * @v: value to write to the register 902 * 903 * Dummy register read function. Used for register blocks 904 * that certain asics don't have (all asics). 905 */ 906 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 907 uint32_t block, 908 uint32_t reg, uint32_t v) 909 { 910 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 911 reg, block, v); 912 BUG(); 913 } 914 915 /** 916 * amdgpu_device_asic_init - Wrapper for atom asic_init 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Does any asic specific work and then calls atom asic init. 921 */ 922 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 923 { 924 amdgpu_asic_pre_asic_init(adev); 925 926 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 927 return amdgpu_atomfirmware_asic_init(adev, true); 928 else 929 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 930 } 931 932 /** 933 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 934 * 935 * @adev: amdgpu_device pointer 936 * 937 * Allocates a scratch page of VRAM for use by various things in the 938 * driver. 939 */ 940 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 941 { 942 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 943 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 944 &adev->vram_scratch.robj, 945 &adev->vram_scratch.gpu_addr, 946 (void **)&adev->vram_scratch.ptr); 947 } 948 949 /** 950 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 951 * 952 * @adev: amdgpu_device pointer 953 * 954 * Frees the VRAM scratch page. 955 */ 956 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 957 { 958 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 959 } 960 961 /** 962 * amdgpu_device_program_register_sequence - program an array of registers. 963 * 964 * @adev: amdgpu_device pointer 965 * @registers: pointer to the register array 966 * @array_size: size of the register array 967 * 968 * Programs an array or registers with and and or masks. 969 * This is a helper for setting golden registers. 970 */ 971 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 972 const u32 *registers, 973 const u32 array_size) 974 { 975 u32 tmp, reg, and_mask, or_mask; 976 int i; 977 978 if (array_size % 3) 979 return; 980 981 for (i = 0; i < array_size; i +=3) { 982 reg = registers[i + 0]; 983 and_mask = registers[i + 1]; 984 or_mask = registers[i + 2]; 985 986 if (and_mask == 0xffffffff) { 987 tmp = or_mask; 988 } else { 989 tmp = RREG32(reg); 990 tmp &= ~and_mask; 991 if (adev->family >= AMDGPU_FAMILY_AI) 992 tmp |= (or_mask & and_mask); 993 else 994 tmp |= or_mask; 995 } 996 WREG32(reg, tmp); 997 } 998 } 999 1000 /** 1001 * amdgpu_device_pci_config_reset - reset the GPU 1002 * 1003 * @adev: amdgpu_device pointer 1004 * 1005 * Resets the GPU using the pci config reset sequence. 1006 * Only applicable to asics prior to vega10. 
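 *
 * As a side note, an illustrative golden-register table for
 * amdgpu_device_program_register_sequence() above (offsets and masks are
 * made up); each triple is register, AND mask, OR mask:
 *
 *   static const u32 golden_settings_example[] = {
 *           0x263e, 0x0000000f, 0x00000002,
 *           0x9834, 0xf00fffff, 0x00000400,
 *   };
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));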
1007 */ 1008 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1009 { 1010 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1011 } 1012 1013 /** 1014 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1015 * 1016 * @adev: amdgpu_device pointer 1017 * 1018 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1019 */ 1020 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1021 { 1022 STUB(); 1023 return -ENOSYS; 1024 #ifdef notyet 1025 return pci_reset_function(adev->pdev); 1026 #endif 1027 } 1028 1029 /* 1030 * GPU doorbell aperture helpers function. 1031 */ 1032 /** 1033 * amdgpu_device_doorbell_init - Init doorbell driver information. 1034 * 1035 * @adev: amdgpu_device pointer 1036 * 1037 * Init doorbell driver information (CIK) 1038 * Returns 0 on success, error on failure. 1039 */ 1040 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1041 { 1042 1043 /* No doorbell on SI hardware generation */ 1044 if (adev->asic_type < CHIP_BONAIRE) { 1045 adev->doorbell.base = 0; 1046 adev->doorbell.size = 0; 1047 adev->doorbell.num_doorbells = 0; 1048 adev->doorbell.ptr = NULL; 1049 return 0; 1050 } 1051 1052 #ifdef __linux__ 1053 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1054 return -EINVAL; 1055 #endif 1056 1057 amdgpu_asic_init_doorbell_index(adev); 1058 1059 /* doorbell bar mapping */ 1060 #ifdef __linux__ 1061 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1062 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1063 #endif 1064 1065 if (adev->enable_mes) { 1066 adev->doorbell.num_doorbells = 1067 adev->doorbell.size / sizeof(u32); 1068 } else { 1069 adev->doorbell.num_doorbells = 1070 min_t(u32, adev->doorbell.size / sizeof(u32), 1071 adev->doorbell_index.max_assignment+1); 1072 if (adev->doorbell.num_doorbells == 0) 1073 return -EINVAL; 1074 1075 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1076 * paging queue doorbell use the second page. The 1077 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1078 * doorbells are in the first page. So with paging queue enabled, 1079 * the max num_doorbells should + 1 page (0x400 in dword) 1080 */ 1081 if (adev->asic_type >= CHIP_VEGA10) 1082 adev->doorbell.num_doorbells += 0x400; 1083 } 1084 1085 #ifdef __linux__ 1086 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1087 adev->doorbell.num_doorbells * 1088 sizeof(u32)); 1089 if (adev->doorbell.ptr == NULL) 1090 return -ENOMEM; 1091 #endif 1092 1093 return 0; 1094 } 1095 1096 /** 1097 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1098 * 1099 * @adev: amdgpu_device pointer 1100 * 1101 * Tear down doorbell driver information (CIK) 1102 */ 1103 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1104 { 1105 #ifdef __linux__ 1106 iounmap(adev->doorbell.ptr); 1107 #else 1108 if (adev->doorbell.size > 0) 1109 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1110 adev->doorbell.size); 1111 #endif 1112 adev->doorbell.ptr = NULL; 1113 } 1114 1115 1116 1117 /* 1118 * amdgpu_device_wb_*() 1119 * Writeback is the method by which the GPU updates special pages in memory 1120 * with the status of certain GPU events (fences, ring pointers,etc.). 1121 */ 1122 1123 /** 1124 * amdgpu_device_wb_fini - Disable Writeback and free memory 1125 * 1126 * @adev: amdgpu_device pointer 1127 * 1128 * Disables Writeback and frees the Writeback memory (all asics). 1129 * Used at driver shutdown. 
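 *
 * Typical lifecycle of a writeback slot, using amdgpu_device_wb_get()
 * and amdgpu_device_wb_free() below (an illustrative sketch; consumers
 * such as rings follow this pattern):
 *
 *   u32 wb;
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + wb * 4;
 *           u32 *wb_cpu_addr = &adev->wb.wb[wb];
 *           (hand wb_gpu_addr to the engine, then poll *wb_cpu_addr)
 *           amdgpu_device_wb_free(adev, wb);
 *   }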
1130 */ 1131 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1132 { 1133 if (adev->wb.wb_obj) { 1134 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1135 &adev->wb.gpu_addr, 1136 (void **)&adev->wb.wb); 1137 adev->wb.wb_obj = NULL; 1138 } 1139 } 1140 1141 /** 1142 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1143 * 1144 * @adev: amdgpu_device pointer 1145 * 1146 * Initializes writeback and allocates writeback memory (all asics). 1147 * Used at driver startup. 1148 * Returns 0 on success or an -error on failure. 1149 */ 1150 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1151 { 1152 int r; 1153 1154 if (adev->wb.wb_obj == NULL) { 1155 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1156 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1157 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1158 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1159 (void **)&adev->wb.wb); 1160 if (r) { 1161 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1162 return r; 1163 } 1164 1165 adev->wb.num_wb = AMDGPU_MAX_WB; 1166 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1167 1168 /* clear wb memory */ 1169 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1170 } 1171 1172 return 0; 1173 } 1174 1175 /** 1176 * amdgpu_device_wb_get - Allocate a wb entry 1177 * 1178 * @adev: amdgpu_device pointer 1179 * @wb: wb index 1180 * 1181 * Allocate a wb slot for use by the driver (all asics). 1182 * Returns 0 on success or -EINVAL on failure. 1183 */ 1184 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1185 { 1186 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1187 1188 if (offset < adev->wb.num_wb) { 1189 __set_bit(offset, adev->wb.used); 1190 *wb = offset << 3; /* convert to dw offset */ 1191 return 0; 1192 } else { 1193 return -EINVAL; 1194 } 1195 } 1196 1197 /** 1198 * amdgpu_device_wb_free - Free a wb entry 1199 * 1200 * @adev: amdgpu_device pointer 1201 * @wb: wb index 1202 * 1203 * Free a wb slot allocated for use by the driver (all asics) 1204 */ 1205 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1206 { 1207 wb >>= 3; 1208 if (wb < adev->wb.num_wb) 1209 __clear_bit(wb, adev->wb.used); 1210 } 1211 1212 /** 1213 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1214 * 1215 * @adev: amdgpu_device pointer 1216 * 1217 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1218 * to fail, but if any of the BARs is not accessible after the size we abort 1219 * driver loading by returning -ENODEV. 
1220 */ 1221 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1222 { 1223 #ifdef __linux__ 1224 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1225 struct pci_bus *root; 1226 struct resource *res; 1227 unsigned i; 1228 u16 cmd; 1229 int r; 1230 1231 /* Bypass for VF */ 1232 if (amdgpu_sriov_vf(adev)) 1233 return 0; 1234 1235 /* skip if the bios has already enabled large BAR */ 1236 if (adev->gmc.real_vram_size && 1237 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1238 return 0; 1239 1240 /* Check if the root BUS has 64bit memory resources */ 1241 root = adev->pdev->bus; 1242 while (root->parent) 1243 root = root->parent; 1244 1245 pci_bus_for_each_resource(root, res, i) { 1246 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1247 res->start > 0x100000000ull) 1248 break; 1249 } 1250 1251 /* Trying to resize is pointless without a root hub window above 4GB */ 1252 if (!res) 1253 return 0; 1254 1255 /* Limit the BAR size to what is available */ 1256 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1257 rbar_size); 1258 1259 /* Disable memory decoding while we change the BAR addresses and size */ 1260 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1261 pci_write_config_word(adev->pdev, PCI_COMMAND, 1262 cmd & ~PCI_COMMAND_MEMORY); 1263 1264 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1265 amdgpu_device_doorbell_fini(adev); 1266 if (adev->asic_type >= CHIP_BONAIRE) 1267 pci_release_resource(adev->pdev, 2); 1268 1269 pci_release_resource(adev->pdev, 0); 1270 1271 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1272 if (r == -ENOSPC) 1273 DRM_INFO("Not enough PCI address space for a large BAR."); 1274 else if (r && r != -ENOTSUPP) 1275 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1276 1277 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1278 1279 /* When the doorbell or fb BAR isn't available we have no chance of 1280 * using the device. 1281 */ 1282 r = amdgpu_device_doorbell_init(adev); 1283 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1284 return -ENODEV; 1285 1286 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1287 #endif /* __linux__ */ 1288 1289 return 0; 1290 } 1291 1292 /* 1293 * GPU helpers function. 1294 */ 1295 /** 1296 * amdgpu_device_need_post - check if the hw need post or not 1297 * 1298 * @adev: amdgpu_device pointer 1299 * 1300 * Check if the asic has been initialized (all asics) at driver startup 1301 * or post is needed if hw reset is performed. 1302 * Returns true if need or false if not. 
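 *
 * Illustrative caller-side sketch (the surrounding init flow is an
 * assumption, not taken from this section):
 *
 *   if (amdgpu_device_need_post(adev))
 *           r = amdgpu_device_asic_init(adev);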
1303 */ 1304 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1305 { 1306 uint32_t reg; 1307 1308 if (amdgpu_sriov_vf(adev)) 1309 return false; 1310 1311 if (amdgpu_passthrough(adev)) { 1312 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1313 * some old smc fw still need driver do vPost otherwise gpu hang, while 1314 * those smc fw version above 22.15 doesn't have this flaw, so we force 1315 * vpost executed for smc version below 22.15 1316 */ 1317 if (adev->asic_type == CHIP_FIJI) { 1318 int err; 1319 uint32_t fw_ver; 1320 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1321 /* force vPost if error occured */ 1322 if (err) 1323 return true; 1324 1325 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1326 if (fw_ver < 0x00160e00) 1327 return true; 1328 } 1329 } 1330 1331 /* Don't post if we need to reset whole hive on init */ 1332 if (adev->gmc.xgmi.pending_reset) 1333 return false; 1334 1335 if (adev->has_hw_reset) { 1336 adev->has_hw_reset = false; 1337 return true; 1338 } 1339 1340 /* bios scratch used on CIK+ */ 1341 if (adev->asic_type >= CHIP_BONAIRE) 1342 return amdgpu_atombios_scratch_need_asic_init(adev); 1343 1344 /* check MEM_SIZE for older asics */ 1345 reg = amdgpu_asic_get_config_memsize(adev); 1346 1347 if ((reg != 0) && (reg != 0xffffffff)) 1348 return false; 1349 1350 return true; 1351 } 1352 1353 /** 1354 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1355 * 1356 * @adev: amdgpu_device pointer 1357 * 1358 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1359 * be set for this device. 1360 * 1361 * Returns true if it should be used or false if not. 1362 */ 1363 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1364 { 1365 switch (amdgpu_aspm) { 1366 case -1: 1367 break; 1368 case 0: 1369 return false; 1370 case 1: 1371 return true; 1372 default: 1373 return false; 1374 } 1375 return pcie_aspm_enabled(adev->pdev); 1376 } 1377 1378 bool amdgpu_device_aspm_support_quirk(void) 1379 { 1380 #if IS_ENABLED(CONFIG_X86) 1381 struct cpu_info *ci = curcpu(); 1382 1383 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1384 #else 1385 return true; 1386 #endif 1387 } 1388 1389 /* if we get transitioned to only one device, take VGA back */ 1390 /** 1391 * amdgpu_device_vga_set_decode - enable/disable vga decode 1392 * 1393 * @pdev: PCI device pointer 1394 * @state: enable/disable vga decode 1395 * 1396 * Enable/disable vga decode (all asics). 1397 * Returns VGA resource flags. 1398 */ 1399 #ifdef notyet 1400 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1401 bool state) 1402 { 1403 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1404 amdgpu_asic_set_vga_state(adev, state); 1405 if (state) 1406 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1407 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1408 else 1409 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1410 } 1411 #endif 1412 1413 /** 1414 * amdgpu_device_check_block_size - validate the vm block size 1415 * 1416 * @adev: amdgpu_device pointer 1417 * 1418 * Validates the vm block size specified via module parameter. 1419 * The vm block size defines number of bits in page table versus page directory, 1420 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1421 * page table and the remaining bits are in the page directory. 
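 *
 * Worked example: with 4 KiB pages (12 offset bits) and
 * amdgpu_vm_block_size = 9, each page table holds 2^9 = 512 entries and
 * therefore spans 512 * 4 KiB = 2 MiB of GPU virtual address space.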
1422 */ 1423 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1424 { 1425 /* defines number of bits in page table versus page directory, 1426 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1427 * page table and the remaining bits are in the page directory */ 1428 if (amdgpu_vm_block_size == -1) 1429 return; 1430 1431 if (amdgpu_vm_block_size < 9) { 1432 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1433 amdgpu_vm_block_size); 1434 amdgpu_vm_block_size = -1; 1435 } 1436 } 1437 1438 /** 1439 * amdgpu_device_check_vm_size - validate the vm size 1440 * 1441 * @adev: amdgpu_device pointer 1442 * 1443 * Validates the vm size in GB specified via module parameter. 1444 * The VM size is the size of the GPU virtual memory space in GB. 1445 */ 1446 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1447 { 1448 /* no need to check the default value */ 1449 if (amdgpu_vm_size == -1) 1450 return; 1451 1452 if (amdgpu_vm_size < 1) { 1453 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1454 amdgpu_vm_size); 1455 amdgpu_vm_size = -1; 1456 } 1457 } 1458 1459 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1460 { 1461 #ifdef __linux__ 1462 struct sysinfo si; 1463 #endif 1464 bool is_os_64 = (sizeof(void *) == 8); 1465 uint64_t total_memory; 1466 uint64_t dram_size_seven_GB = 0x1B8000000; 1467 uint64_t dram_size_three_GB = 0xB8000000; 1468 1469 if (amdgpu_smu_memory_pool_size == 0) 1470 return; 1471 1472 if (!is_os_64) { 1473 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1474 goto def_value; 1475 } 1476 #ifdef __linux__ 1477 si_meminfo(&si); 1478 total_memory = (uint64_t)si.totalram * si.mem_unit; 1479 #else 1480 total_memory = ptoa(physmem); 1481 #endif 1482 1483 if ((amdgpu_smu_memory_pool_size == 1) || 1484 (amdgpu_smu_memory_pool_size == 2)) { 1485 if (total_memory < dram_size_three_GB) 1486 goto def_value1; 1487 } else if ((amdgpu_smu_memory_pool_size == 4) || 1488 (amdgpu_smu_memory_pool_size == 8)) { 1489 if (total_memory < dram_size_seven_GB) 1490 goto def_value1; 1491 } else { 1492 DRM_WARN("Smu memory pool size not supported\n"); 1493 goto def_value; 1494 } 1495 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1496 1497 return; 1498 1499 def_value1: 1500 DRM_WARN("No enough system memory\n"); 1501 def_value: 1502 adev->pm.smu_prv_buffer_size = 0; 1503 } 1504 1505 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1506 { 1507 if (!(adev->flags & AMD_IS_APU) || 1508 adev->asic_type < CHIP_RAVEN) 1509 return 0; 1510 1511 switch (adev->asic_type) { 1512 case CHIP_RAVEN: 1513 if (adev->pdev->device == 0x15dd) 1514 adev->apu_flags |= AMD_APU_IS_RAVEN; 1515 if (adev->pdev->device == 0x15d8) 1516 adev->apu_flags |= AMD_APU_IS_PICASSO; 1517 break; 1518 case CHIP_RENOIR: 1519 if ((adev->pdev->device == 0x1636) || 1520 (adev->pdev->device == 0x164c)) 1521 adev->apu_flags |= AMD_APU_IS_RENOIR; 1522 else 1523 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1524 break; 1525 case CHIP_VANGOGH: 1526 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1527 break; 1528 case CHIP_YELLOW_CARP: 1529 break; 1530 case CHIP_CYAN_SKILLFISH: 1531 if ((adev->pdev->device == 0x13FE) || 1532 (adev->pdev->device == 0x143F)) 1533 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1534 break; 1535 default: 1536 break; 1537 } 1538 1539 return 0; 1540 } 1541 1542 /** 1543 * amdgpu_device_check_arguments - validate module params 1544 * 1545 * @adev: amdgpu_device pointer 1546 * 1547 * Validates certain 
module parameters and updates 1548 * the associated values used by the driver (all asics). 1549 */ 1550 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1551 { 1552 if (amdgpu_sched_jobs < 4) { 1553 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1554 amdgpu_sched_jobs); 1555 amdgpu_sched_jobs = 4; 1556 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1557 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1558 amdgpu_sched_jobs); 1559 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1560 } 1561 1562 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1563 /* gart size must be greater or equal to 32M */ 1564 dev_warn(adev->dev, "gart size (%d) too small\n", 1565 amdgpu_gart_size); 1566 amdgpu_gart_size = -1; 1567 } 1568 1569 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1570 /* gtt size must be greater or equal to 32M */ 1571 dev_warn(adev->dev, "gtt size (%d) too small\n", 1572 amdgpu_gtt_size); 1573 amdgpu_gtt_size = -1; 1574 } 1575 1576 /* valid range is between 4 and 9 inclusive */ 1577 if (amdgpu_vm_fragment_size != -1 && 1578 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1579 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1580 amdgpu_vm_fragment_size = -1; 1581 } 1582 1583 if (amdgpu_sched_hw_submission < 2) { 1584 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1585 amdgpu_sched_hw_submission); 1586 amdgpu_sched_hw_submission = 2; 1587 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1588 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1589 amdgpu_sched_hw_submission); 1590 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1591 } 1592 1593 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1594 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1595 amdgpu_reset_method = -1; 1596 } 1597 1598 amdgpu_device_check_smu_prv_buffer_size(adev); 1599 1600 amdgpu_device_check_vm_size(adev); 1601 1602 amdgpu_device_check_block_size(adev); 1603 1604 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1605 1606 return 0; 1607 } 1608 1609 #ifdef __linux__ 1610 /** 1611 * amdgpu_switcheroo_set_state - set switcheroo state 1612 * 1613 * @pdev: pci dev pointer 1614 * @state: vga_switcheroo state 1615 * 1616 * Callback for the switcheroo driver. Suspends or resumes the 1617 * the asics before or after it is powered up using ACPI methods. 
1618 */ 1619 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1620 enum vga_switcheroo_state state) 1621 { 1622 struct drm_device *dev = pci_get_drvdata(pdev); 1623 int r; 1624 1625 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1626 return; 1627 1628 if (state == VGA_SWITCHEROO_ON) { 1629 pr_info("switched on\n"); 1630 /* don't suspend or resume card normally */ 1631 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1632 1633 pci_set_power_state(pdev, PCI_D0); 1634 amdgpu_device_load_pci_state(pdev); 1635 r = pci_enable_device(pdev); 1636 if (r) 1637 DRM_WARN("pci_enable_device failed (%d)\n", r); 1638 amdgpu_device_resume(dev, true); 1639 1640 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1641 } else { 1642 pr_info("switched off\n"); 1643 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1644 amdgpu_device_suspend(dev, true); 1645 amdgpu_device_cache_pci_state(pdev); 1646 /* Shut down the device */ 1647 pci_disable_device(pdev); 1648 pci_set_power_state(pdev, PCI_D3cold); 1649 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1650 } 1651 } 1652 1653 /** 1654 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1655 * 1656 * @pdev: pci dev pointer 1657 * 1658 * Callback for the switcheroo driver. Check of the switcheroo 1659 * state can be changed. 1660 * Returns true if the state can be changed, false if not. 1661 */ 1662 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1663 { 1664 struct drm_device *dev = pci_get_drvdata(pdev); 1665 1666 /* 1667 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1668 * locking inversion with the driver load path. And the access here is 1669 * completely racy anyway. So don't bother with locking for now. 1670 */ 1671 return atomic_read(&dev->open_count) == 0; 1672 } 1673 #endif /* __linux__ */ 1674 1675 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1676 #ifdef notyet 1677 .set_gpu_state = amdgpu_switcheroo_set_state, 1678 .reprobe = NULL, 1679 .can_switch = amdgpu_switcheroo_can_switch, 1680 #endif 1681 }; 1682 1683 /** 1684 * amdgpu_device_ip_set_clockgating_state - set the CG state 1685 * 1686 * @dev: amdgpu_device pointer 1687 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1688 * @state: clockgating state (gate or ungate) 1689 * 1690 * Sets the requested clockgating state for all instances of 1691 * the hardware IP specified. 1692 * Returns the error code from the last instance. 1693 */ 1694 int amdgpu_device_ip_set_clockgating_state(void *dev, 1695 enum amd_ip_block_type block_type, 1696 enum amd_clockgating_state state) 1697 { 1698 struct amdgpu_device *adev = dev; 1699 int i, r = 0; 1700 1701 for (i = 0; i < adev->num_ip_blocks; i++) { 1702 if (!adev->ip_blocks[i].status.valid) 1703 continue; 1704 if (adev->ip_blocks[i].version->type != block_type) 1705 continue; 1706 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1707 continue; 1708 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1709 (void *)adev, state); 1710 if (r) 1711 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1712 adev->ip_blocks[i].version->funcs->name, r); 1713 } 1714 return r; 1715 } 1716 1717 /** 1718 * amdgpu_device_ip_set_powergating_state - set the PG state 1719 * 1720 * @dev: amdgpu_device pointer 1721 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1722 * @state: powergating state (gate or ungate) 1723 * 1724 * Sets the requested powergating state for all instances of 1725 * the hardware IP specified. 1726 * Returns the error code from the last instance. 1727 */ 1728 int amdgpu_device_ip_set_powergating_state(void *dev, 1729 enum amd_ip_block_type block_type, 1730 enum amd_powergating_state state) 1731 { 1732 struct amdgpu_device *adev = dev; 1733 int i, r = 0; 1734 1735 for (i = 0; i < adev->num_ip_blocks; i++) { 1736 if (!adev->ip_blocks[i].status.valid) 1737 continue; 1738 if (adev->ip_blocks[i].version->type != block_type) 1739 continue; 1740 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1741 continue; 1742 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1743 (void *)adev, state); 1744 if (r) 1745 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1746 adev->ip_blocks[i].version->funcs->name, r); 1747 } 1748 return r; 1749 } 1750 1751 /** 1752 * amdgpu_device_ip_get_clockgating_state - get the CG state 1753 * 1754 * @adev: amdgpu_device pointer 1755 * @flags: clockgating feature flags 1756 * 1757 * Walks the list of IPs on the device and updates the clockgating 1758 * flags for each IP. 1759 * Updates @flags with the feature flags for each hardware IP where 1760 * clockgating is enabled. 1761 */ 1762 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1763 u64 *flags) 1764 { 1765 int i; 1766 1767 for (i = 0; i < adev->num_ip_blocks; i++) { 1768 if (!adev->ip_blocks[i].status.valid) 1769 continue; 1770 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1771 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1772 } 1773 } 1774 1775 /** 1776 * amdgpu_device_ip_wait_for_idle - wait for idle 1777 * 1778 * @adev: amdgpu_device pointer 1779 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1780 * 1781 * Waits for the request hardware IP to be idle. 1782 * Returns 0 for success or a negative error code on failure. 1783 */ 1784 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1785 enum amd_ip_block_type block_type) 1786 { 1787 int i, r; 1788 1789 for (i = 0; i < adev->num_ip_blocks; i++) { 1790 if (!adev->ip_blocks[i].status.valid) 1791 continue; 1792 if (adev->ip_blocks[i].version->type == block_type) { 1793 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1794 if (r) 1795 return r; 1796 break; 1797 } 1798 } 1799 return 0; 1800 1801 } 1802 1803 /** 1804 * amdgpu_device_ip_is_idle - is the hardware IP idle 1805 * 1806 * @adev: amdgpu_device pointer 1807 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1808 * 1809 * Check if the hardware IP is idle or not. 1810 * Returns true if it the IP is idle, false if not. 1811 */ 1812 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1813 enum amd_ip_block_type block_type) 1814 { 1815 int i; 1816 1817 for (i = 0; i < adev->num_ip_blocks; i++) { 1818 if (!adev->ip_blocks[i].status.valid) 1819 continue; 1820 if (adev->ip_blocks[i].version->type == block_type) 1821 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1822 } 1823 return true; 1824 1825 } 1826 1827 /** 1828 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1829 * 1830 * @adev: amdgpu_device pointer 1831 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1832 * 1833 * Returns a pointer to the hardware IP block structure 1834 * if it exists for the asic, otherwise NULL. 
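 *
 * Illustrative lookup (sketch):
 *
 *   struct amdgpu_ip_block *gfx =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (gfx)
 *           DRM_INFO("GFX IP v%d.%d\n", gfx->version->major,
 *                    gfx->version->minor);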
1835 */ 1836 struct amdgpu_ip_block * 1837 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1838 enum amd_ip_block_type type) 1839 { 1840 int i; 1841 1842 for (i = 0; i < adev->num_ip_blocks; i++) 1843 if (adev->ip_blocks[i].version->type == type) 1844 return &adev->ip_blocks[i]; 1845 1846 return NULL; 1847 } 1848 1849 /** 1850 * amdgpu_device_ip_block_version_cmp 1851 * 1852 * @adev: amdgpu_device pointer 1853 * @type: enum amd_ip_block_type 1854 * @major: major version 1855 * @minor: minor version 1856 * 1857 * return 0 if equal or greater 1858 * return 1 if smaller or the ip_block doesn't exist 1859 */ 1860 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1861 enum amd_ip_block_type type, 1862 u32 major, u32 minor) 1863 { 1864 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1865 1866 if (ip_block && ((ip_block->version->major > major) || 1867 ((ip_block->version->major == major) && 1868 (ip_block->version->minor >= minor)))) 1869 return 0; 1870 1871 return 1; 1872 } 1873 1874 /** 1875 * amdgpu_device_ip_block_add 1876 * 1877 * @adev: amdgpu_device pointer 1878 * @ip_block_version: pointer to the IP to add 1879 * 1880 * Adds the IP block driver information to the collection of IPs 1881 * on the asic. 1882 */ 1883 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1884 const struct amdgpu_ip_block_version *ip_block_version) 1885 { 1886 if (!ip_block_version) 1887 return -EINVAL; 1888 1889 switch (ip_block_version->type) { 1890 case AMD_IP_BLOCK_TYPE_VCN: 1891 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1892 return 0; 1893 break; 1894 case AMD_IP_BLOCK_TYPE_JPEG: 1895 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1896 return 0; 1897 break; 1898 default: 1899 break; 1900 } 1901 1902 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1903 ip_block_version->funcs->name); 1904 1905 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1906 1907 return 0; 1908 } 1909 1910 /** 1911 * amdgpu_device_enable_virtual_display - enable virtual display feature 1912 * 1913 * @adev: amdgpu_device pointer 1914 * 1915 * Enabled the virtual display feature if the user has enabled it via 1916 * the module parameter virtual_display. This feature provides a virtual 1917 * display hardware on headless boards or in virtualized environments. 1918 * This function parses and validates the configuration string specified by 1919 * the user and configues the virtual display configuration (number of 1920 * virtual connectors, crtcs, etc.) specified. 
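 *
 * Example parameter strings accepted by the parser below (the PCI
 * address shown is illustrative):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2   (two virtual crtcs on that GPU)
 *   amdgpu.virtual_display=all,1            (one virtual crtc on every GPU)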
1921 */ 1922 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1923 { 1924 adev->enable_virtual_display = false; 1925 1926 #ifdef notyet 1927 if (amdgpu_virtual_display) { 1928 const char *pci_address_name = pci_name(adev->pdev); 1929 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1930 1931 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1932 pciaddstr_tmp = pciaddstr; 1933 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1934 pciaddname = strsep(&pciaddname_tmp, ","); 1935 if (!strcmp("all", pciaddname) 1936 || !strcmp(pci_address_name, pciaddname)) { 1937 long num_crtc; 1938 int res = -1; 1939 1940 adev->enable_virtual_display = true; 1941 1942 if (pciaddname_tmp) 1943 res = kstrtol(pciaddname_tmp, 10, 1944 &num_crtc); 1945 1946 if (!res) { 1947 if (num_crtc < 1) 1948 num_crtc = 1; 1949 if (num_crtc > 6) 1950 num_crtc = 6; 1951 adev->mode_info.num_crtc = num_crtc; 1952 } else { 1953 adev->mode_info.num_crtc = 1; 1954 } 1955 break; 1956 } 1957 } 1958 1959 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1960 amdgpu_virtual_display, pci_address_name, 1961 adev->enable_virtual_display, adev->mode_info.num_crtc); 1962 1963 kfree(pciaddstr); 1964 } 1965 #endif 1966 } 1967 1968 /** 1969 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1970 * 1971 * @adev: amdgpu_device pointer 1972 * 1973 * Parses the asic configuration parameters specified in the gpu info 1974 * firmware and makes them availale to the driver for use in configuring 1975 * the asic. 1976 * Returns 0 on success, -EINVAL on failure. 1977 */ 1978 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1979 { 1980 const char *chip_name; 1981 char fw_name[40]; 1982 int err; 1983 const struct gpu_info_firmware_header_v1_0 *hdr; 1984 1985 adev->firmware.gpu_info_fw = NULL; 1986 1987 if (adev->mman.discovery_bin) { 1988 /* 1989 * FIXME: The bounding box is still needed by Navi12, so 1990 * temporarily read it from gpu_info firmware. Should be dropped 1991 * when DAL no longer needs it. 
1992 */ 1993 if (adev->asic_type != CHIP_NAVI12) 1994 return 0; 1995 } 1996 1997 switch (adev->asic_type) { 1998 default: 1999 return 0; 2000 case CHIP_VEGA10: 2001 chip_name = "vega10"; 2002 break; 2003 case CHIP_VEGA12: 2004 chip_name = "vega12"; 2005 break; 2006 case CHIP_RAVEN: 2007 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2008 chip_name = "raven2"; 2009 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2010 chip_name = "picasso"; 2011 else 2012 chip_name = "raven"; 2013 break; 2014 case CHIP_ARCTURUS: 2015 chip_name = "arcturus"; 2016 break; 2017 case CHIP_NAVI12: 2018 chip_name = "navi12"; 2019 break; 2020 } 2021 2022 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2023 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2024 if (err) { 2025 dev_err(adev->dev, 2026 "Failed to load gpu_info firmware \"%s\"\n", 2027 fw_name); 2028 goto out; 2029 } 2030 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2031 if (err) { 2032 dev_err(adev->dev, 2033 "Failed to validate gpu_info firmware \"%s\"\n", 2034 fw_name); 2035 goto out; 2036 } 2037 2038 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2039 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2040 2041 switch (hdr->version_major) { 2042 case 1: 2043 { 2044 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2045 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2046 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2047 2048 /* 2049 * Should be droped when DAL no longer needs it. 2050 */ 2051 if (adev->asic_type == CHIP_NAVI12) 2052 goto parse_soc_bounding_box; 2053 2054 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2055 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2056 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2057 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2058 adev->gfx.config.max_texture_channel_caches = 2059 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2060 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2061 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2062 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2063 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2064 adev->gfx.config.double_offchip_lds_buf = 2065 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2066 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2067 adev->gfx.cu_info.max_waves_per_simd = 2068 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2069 adev->gfx.cu_info.max_scratch_slots_per_cu = 2070 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2071 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2072 if (hdr->version_minor >= 1) { 2073 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2074 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2075 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2076 adev->gfx.config.num_sc_per_sh = 2077 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2078 adev->gfx.config.num_packer_per_sc = 2079 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2080 } 2081 2082 parse_soc_bounding_box: 2083 /* 2084 * soc bounding box info is not integrated in disocovery table, 2085 * we always need to parse it from gpu info firmware if needed. 
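 *
 * Only the v1.2 layout of the gpu_info firmware carries the bounding box,
 * hence the version_minor check that follows.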
2086 */ 2087 if (hdr->version_minor == 2) { 2088 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2089 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2090 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2091 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2092 } 2093 break; 2094 } 2095 default: 2096 dev_err(adev->dev, 2097 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2098 err = -EINVAL; 2099 goto out; 2100 } 2101 out: 2102 return err; 2103 } 2104 2105 /** 2106 * amdgpu_device_ip_early_init - run early init for hardware IPs 2107 * 2108 * @adev: amdgpu_device pointer 2109 * 2110 * Early initialization pass for hardware IPs. The hardware IPs that make 2111 * up each asic are discovered each IP's early_init callback is run. This 2112 * is the first stage in initializing the asic. 2113 * Returns 0 on success, negative error code on failure. 2114 */ 2115 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2116 { 2117 struct drm_device *dev = adev_to_drm(adev); 2118 struct pci_dev *parent; 2119 int i, r; 2120 2121 amdgpu_device_enable_virtual_display(adev); 2122 2123 if (amdgpu_sriov_vf(adev)) { 2124 r = amdgpu_virt_request_full_gpu(adev, true); 2125 if (r) 2126 return r; 2127 } 2128 2129 switch (adev->asic_type) { 2130 #ifdef CONFIG_DRM_AMDGPU_SI 2131 case CHIP_VERDE: 2132 case CHIP_TAHITI: 2133 case CHIP_PITCAIRN: 2134 case CHIP_OLAND: 2135 case CHIP_HAINAN: 2136 adev->family = AMDGPU_FAMILY_SI; 2137 r = si_set_ip_blocks(adev); 2138 if (r) 2139 return r; 2140 break; 2141 #endif 2142 #ifdef CONFIG_DRM_AMDGPU_CIK 2143 case CHIP_BONAIRE: 2144 case CHIP_HAWAII: 2145 case CHIP_KAVERI: 2146 case CHIP_KABINI: 2147 case CHIP_MULLINS: 2148 if (adev->flags & AMD_IS_APU) 2149 adev->family = AMDGPU_FAMILY_KV; 2150 else 2151 adev->family = AMDGPU_FAMILY_CI; 2152 2153 r = cik_set_ip_blocks(adev); 2154 if (r) 2155 return r; 2156 break; 2157 #endif 2158 case CHIP_TOPAZ: 2159 case CHIP_TONGA: 2160 case CHIP_FIJI: 2161 case CHIP_POLARIS10: 2162 case CHIP_POLARIS11: 2163 case CHIP_POLARIS12: 2164 case CHIP_VEGAM: 2165 case CHIP_CARRIZO: 2166 case CHIP_STONEY: 2167 if (adev->flags & AMD_IS_APU) 2168 adev->family = AMDGPU_FAMILY_CZ; 2169 else 2170 adev->family = AMDGPU_FAMILY_VI; 2171 2172 r = vi_set_ip_blocks(adev); 2173 if (r) 2174 return r; 2175 break; 2176 default: 2177 r = amdgpu_discovery_set_ip_blocks(adev); 2178 if (r) 2179 return r; 2180 break; 2181 } 2182 2183 if (amdgpu_has_atpx() && 2184 (amdgpu_is_atpx_hybrid() || 2185 amdgpu_has_atpx_dgpu_power_cntl()) && 2186 ((adev->flags & AMD_IS_APU) == 0) && 2187 !pci_is_thunderbolt_attached(dev->pdev)) 2188 adev->flags |= AMD_IS_PX; 2189 2190 if (!(adev->flags & AMD_IS_APU)) { 2191 parent = pci_upstream_bridge(adev->pdev); 2192 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2193 } 2194 2195 amdgpu_amdkfd_device_probe(adev); 2196 2197 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2198 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2199 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2200 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2201 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2202 2203 for (i = 0; i < adev->num_ip_blocks; i++) { 2204 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2205 DRM_ERROR("disabled ip block: %d <%s>\n", 2206 i, adev->ip_blocks[i].version->funcs->name); 2207 adev->ip_blocks[i].status.valid = false; 2208 } else { 2209 if (adev->ip_blocks[i].version->funcs->early_init) { 2210 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2211 if (r == -ENOENT) { 2212 adev->ip_blocks[i].status.valid = false; 2213 } else if (r) { 2214 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2215 adev->ip_blocks[i].version->funcs->name, r); 2216 return r; 2217 } else { 2218 adev->ip_blocks[i].status.valid = true; 2219 } 2220 } else { 2221 adev->ip_blocks[i].status.valid = true; 2222 } 2223 } 2224 /* get the vbios after the asic_funcs are set up */ 2225 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2226 r = amdgpu_device_parse_gpu_info_fw(adev); 2227 if (r) 2228 return r; 2229 2230 /* Read BIOS */ 2231 if (!amdgpu_get_bios(adev)) 2232 return -EINVAL; 2233 2234 r = amdgpu_atombios_init(adev); 2235 if (r) { 2236 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2237 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2238 return r; 2239 } 2240 2241 /*get pf2vf msg info at it's earliest time*/ 2242 if (amdgpu_sriov_vf(adev)) 2243 amdgpu_virt_init_data_exchange(adev); 2244 2245 } 2246 } 2247 2248 adev->cg_flags &= amdgpu_cg_mask; 2249 adev->pg_flags &= amdgpu_pg_mask; 2250 2251 return 0; 2252 } 2253 2254 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2255 { 2256 int i, r; 2257 2258 for (i = 0; i < adev->num_ip_blocks; i++) { 2259 if (!adev->ip_blocks[i].status.sw) 2260 continue; 2261 if (adev->ip_blocks[i].status.hw) 2262 continue; 2263 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2264 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2265 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2266 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2267 if (r) { 2268 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2269 adev->ip_blocks[i].version->funcs->name, r); 2270 return r; 2271 } 2272 adev->ip_blocks[i].status.hw = true; 2273 } 2274 } 2275 2276 return 0; 2277 } 2278 2279 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2280 { 2281 int i, r; 2282 2283 for (i = 0; i < adev->num_ip_blocks; i++) { 2284 if (!adev->ip_blocks[i].status.sw) 2285 continue; 2286 if (adev->ip_blocks[i].status.hw) 2287 continue; 2288 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2289 if (r) { 2290 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 adev->ip_blocks[i].status.hw = true; 2295 } 2296 2297 return 0; 2298 } 2299 2300 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2301 { 2302 int r = 0; 2303 int i; 2304 uint32_t smu_version; 2305 2306 if (adev->asic_type >= CHIP_VEGA10) { 2307 for (i = 0; i < adev->num_ip_blocks; i++) { 2308 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2309 continue; 2310 2311 if 
(!adev->ip_blocks[i].status.sw) 2312 continue; 2313 2314 /* no need to do the fw loading again if already done*/ 2315 if (adev->ip_blocks[i].status.hw == true) 2316 break; 2317 2318 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2319 r = adev->ip_blocks[i].version->funcs->resume(adev); 2320 if (r) { 2321 DRM_ERROR("resume of IP block <%s> failed %d\n", 2322 adev->ip_blocks[i].version->funcs->name, r); 2323 return r; 2324 } 2325 } else { 2326 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2327 if (r) { 2328 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2329 adev->ip_blocks[i].version->funcs->name, r); 2330 return r; 2331 } 2332 } 2333 2334 adev->ip_blocks[i].status.hw = true; 2335 break; 2336 } 2337 } 2338 2339 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2340 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2341 2342 return r; 2343 } 2344 2345 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2346 { 2347 long timeout; 2348 int r, i; 2349 2350 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2351 struct amdgpu_ring *ring = adev->rings[i]; 2352 2353 /* No need to setup the GPU scheduler for rings that don't need it */ 2354 if (!ring || ring->no_scheduler) 2355 continue; 2356 2357 switch (ring->funcs->type) { 2358 case AMDGPU_RING_TYPE_GFX: 2359 timeout = adev->gfx_timeout; 2360 break; 2361 case AMDGPU_RING_TYPE_COMPUTE: 2362 timeout = adev->compute_timeout; 2363 break; 2364 case AMDGPU_RING_TYPE_SDMA: 2365 timeout = adev->sdma_timeout; 2366 break; 2367 default: 2368 timeout = adev->video_timeout; 2369 break; 2370 } 2371 2372 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2373 ring->num_hw_submission, amdgpu_job_hang_limit, 2374 timeout, adev->reset_domain->wq, 2375 ring->sched_score, ring->name, 2376 adev->dev); 2377 if (r) { 2378 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2379 ring->name); 2380 return r; 2381 } 2382 } 2383 2384 return 0; 2385 } 2386 2387 2388 /** 2389 * amdgpu_device_ip_init - run init for hardware IPs 2390 * 2391 * @adev: amdgpu_device pointer 2392 * 2393 * Main initialization pass for hardware IPs. The list of all the hardware 2394 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2395 * are run. sw_init initializes the software state associated with each IP 2396 * and hw_init initializes the hardware associated with each IP. 2397 * Returns 0 on success, negative error code on failure. 
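 *
 * Roughly, the ordering implemented below is: sw_init for every valid
 * block (with COMMON and GMC additionally getting an early hw_init so GPU
 * memory can be allocated), IB pool and ucode BO creation, hw_init phase 1
 * (COMMON/IH, plus PSP under SR-IOV), firmware loading, and hw_init
 * phase 2 for the remaining blocks.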
2398 */ 2399 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2400 { 2401 int i, r; 2402 2403 r = amdgpu_ras_init(adev); 2404 if (r) 2405 return r; 2406 2407 for (i = 0; i < adev->num_ip_blocks; i++) { 2408 if (!adev->ip_blocks[i].status.valid) 2409 continue; 2410 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2411 if (r) { 2412 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2413 adev->ip_blocks[i].version->funcs->name, r); 2414 goto init_failed; 2415 } 2416 adev->ip_blocks[i].status.sw = true; 2417 2418 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2419 /* need to do common hw init early so everything is set up for gmc */ 2420 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2421 if (r) { 2422 DRM_ERROR("hw_init %d failed %d\n", i, r); 2423 goto init_failed; 2424 } 2425 adev->ip_blocks[i].status.hw = true; 2426 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2427 /* need to do gmc hw init early so we can allocate gpu mem */ 2428 /* Try to reserve bad pages early */ 2429 if (amdgpu_sriov_vf(adev)) 2430 amdgpu_virt_exchange_data(adev); 2431 2432 r = amdgpu_device_vram_scratch_init(adev); 2433 if (r) { 2434 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2435 goto init_failed; 2436 } 2437 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2438 if (r) { 2439 DRM_ERROR("hw_init %d failed %d\n", i, r); 2440 goto init_failed; 2441 } 2442 r = amdgpu_device_wb_init(adev); 2443 if (r) { 2444 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2445 goto init_failed; 2446 } 2447 adev->ip_blocks[i].status.hw = true; 2448 2449 /* right after GMC hw init, we create CSA */ 2450 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2451 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2452 AMDGPU_GEM_DOMAIN_VRAM, 2453 AMDGPU_CSA_SIZE); 2454 if (r) { 2455 DRM_ERROR("allocate CSA failed %d\n", r); 2456 goto init_failed; 2457 } 2458 } 2459 } 2460 } 2461 2462 if (amdgpu_sriov_vf(adev)) 2463 amdgpu_virt_init_data_exchange(adev); 2464 2465 r = amdgpu_ib_pool_init(adev); 2466 if (r) { 2467 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2468 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2469 goto init_failed; 2470 } 2471 2472 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2473 if (r) 2474 goto init_failed; 2475 2476 r = amdgpu_device_ip_hw_init_phase1(adev); 2477 if (r) 2478 goto init_failed; 2479 2480 r = amdgpu_device_fw_loading(adev); 2481 if (r) 2482 goto init_failed; 2483 2484 r = amdgpu_device_ip_hw_init_phase2(adev); 2485 if (r) 2486 goto init_failed; 2487 2488 /* 2489 * retired pages will be loaded from eeprom and reserved here, 2490 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2491 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2492 * for I2C communication which only true at this point. 2493 * 2494 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2495 * failure from bad gpu situation and stop amdgpu init process 2496 * accordingly. For other failed cases, it will still release all 2497 * the resource and print error message, rather than returning one 2498 * negative value to upper level. 
2499 * 2500 * Note: theoretically, this should be called before all vram allocations 2501 * to protect retired page from abusing 2502 */ 2503 r = amdgpu_ras_recovery_init(adev); 2504 if (r) 2505 goto init_failed; 2506 2507 /** 2508 * In case of XGMI grab extra reference for reset domain for this device 2509 */ 2510 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2511 if (amdgpu_xgmi_add_device(adev) == 0) { 2512 if (!amdgpu_sriov_vf(adev)) { 2513 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2514 2515 if (WARN_ON(!hive)) { 2516 r = -ENOENT; 2517 goto init_failed; 2518 } 2519 2520 if (!hive->reset_domain || 2521 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2522 r = -ENOENT; 2523 amdgpu_put_xgmi_hive(hive); 2524 goto init_failed; 2525 } 2526 2527 /* Drop the early temporary reset domain we created for device */ 2528 amdgpu_reset_put_reset_domain(adev->reset_domain); 2529 adev->reset_domain = hive->reset_domain; 2530 amdgpu_put_xgmi_hive(hive); 2531 } 2532 } 2533 } 2534 2535 r = amdgpu_device_init_schedulers(adev); 2536 if (r) 2537 goto init_failed; 2538 2539 /* Don't init kfd if whole hive need to be reset during init */ 2540 if (!adev->gmc.xgmi.pending_reset) 2541 amdgpu_amdkfd_device_init(adev); 2542 2543 amdgpu_fru_get_product_info(adev); 2544 2545 init_failed: 2546 if (amdgpu_sriov_vf(adev)) 2547 amdgpu_virt_release_full_gpu(adev, true); 2548 2549 return r; 2550 } 2551 2552 /** 2553 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2554 * 2555 * @adev: amdgpu_device pointer 2556 * 2557 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2558 * this function before a GPU reset. If the value is retained after a 2559 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2560 */ 2561 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2562 { 2563 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2564 } 2565 2566 /** 2567 * amdgpu_device_check_vram_lost - check if vram is valid 2568 * 2569 * @adev: amdgpu_device pointer 2570 * 2571 * Checks the reset magic value written to the gart pointer in VRAM. 2572 * The driver calls this after a GPU reset to see if the contents of 2573 * VRAM is lost or now. 2574 * returns true if vram is lost, false if not. 2575 */ 2576 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2577 { 2578 if (memcmp(adev->gart.ptr, adev->reset_magic, 2579 AMDGPU_RESET_MAGIC_NUM)) 2580 return true; 2581 2582 if (!amdgpu_in_reset(adev)) 2583 return false; 2584 2585 /* 2586 * For all ASICs with baco/mode1 reset, the VRAM is 2587 * always assumed to be lost. 2588 */ 2589 switch (amdgpu_asic_reset_method(adev)) { 2590 case AMD_RESET_METHOD_BACO: 2591 case AMD_RESET_METHOD_MODE1: 2592 return true; 2593 default: 2594 return false; 2595 } 2596 } 2597 2598 /** 2599 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2600 * 2601 * @adev: amdgpu_device pointer 2602 * @state: clockgating state (gate or ungate) 2603 * 2604 * The list of all the hardware IPs that make up the asic is walked and the 2605 * set_clockgating_state callbacks are run. 2606 * Late initialization pass enabling clockgating for hardware IPs. 2607 * Fini or suspend, pass disabling clockgating for hardware IPs. 2608 * Returns 0 on success, negative error code on failure. 
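 *
 * Note that the blocks are walked front to back when gating and back to
 * front when ungating, and that UVD/VCE/VCN/JPEG are skipped here because
 * their clockgating is handled separately.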
2609 */ 2610 2611 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2612 enum amd_clockgating_state state) 2613 { 2614 int i, j, r; 2615 2616 if (amdgpu_emu_mode == 1) 2617 return 0; 2618 2619 for (j = 0; j < adev->num_ip_blocks; j++) { 2620 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2621 if (!adev->ip_blocks[i].status.late_initialized) 2622 continue; 2623 /* skip CG for GFX on S0ix */ 2624 if (adev->in_s0ix && 2625 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2626 continue; 2627 /* skip CG for VCE/UVD, it's handled specially */ 2628 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2629 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2632 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2633 /* enable clockgating to save power */ 2634 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2635 state); 2636 if (r) { 2637 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2638 adev->ip_blocks[i].version->funcs->name, r); 2639 return r; 2640 } 2641 } 2642 } 2643 2644 return 0; 2645 } 2646 2647 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2648 enum amd_powergating_state state) 2649 { 2650 int i, j, r; 2651 2652 if (amdgpu_emu_mode == 1) 2653 return 0; 2654 2655 for (j = 0; j < adev->num_ip_blocks; j++) { 2656 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2657 if (!adev->ip_blocks[i].status.late_initialized) 2658 continue; 2659 /* skip PG for GFX on S0ix */ 2660 if (adev->in_s0ix && 2661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2662 continue; 2663 /* skip CG for VCE/UVD, it's handled specially */ 2664 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2665 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2666 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2667 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2668 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2669 /* enable powergating to save power */ 2670 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2671 state); 2672 if (r) { 2673 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2674 adev->ip_blocks[i].version->funcs->name, r); 2675 return r; 2676 } 2677 } 2678 } 2679 return 0; 2680 } 2681 2682 static int amdgpu_device_enable_mgpu_fan_boost(void) 2683 { 2684 struct amdgpu_gpu_instance *gpu_ins; 2685 struct amdgpu_device *adev; 2686 int i, ret = 0; 2687 2688 mutex_lock(&mgpu_info.mutex); 2689 2690 /* 2691 * MGPU fan boost feature should be enabled 2692 * only when there are two or more dGPUs in 2693 * the system 2694 */ 2695 if (mgpu_info.num_dgpu < 2) 2696 goto out; 2697 2698 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2699 gpu_ins = &(mgpu_info.gpu_ins[i]); 2700 adev = gpu_ins->adev; 2701 if (!(adev->flags & AMD_IS_APU) && 2702 !gpu_ins->mgpu_fan_enabled) { 2703 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2704 if (ret) 2705 break; 2706 2707 gpu_ins->mgpu_fan_enabled = 1; 2708 } 2709 } 2710 2711 out: 2712 mutex_unlock(&mgpu_info.mutex); 2713 2714 return ret; 2715 } 2716 2717 /** 2718 * amdgpu_device_ip_late_init - run late init for hardware IPs 2719 * 2720 * @adev: amdgpu_device pointer 2721 * 2722 * Late initialization pass for hardware IPs. 
The list of all the hardware 2723 * IPs that make up the asic is walked and the late_init callbacks are run. 2724 * late_init covers any special initialization that an IP requires 2725 * after all of the IPs have been initialized or something that needs to happen 2726 * late in the init process. 2727 * Returns 0 on success, negative error code on failure. 2728 */ 2729 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2730 { 2731 struct amdgpu_gpu_instance *gpu_instance; 2732 int i = 0, r; 2733 2734 for (i = 0; i < adev->num_ip_blocks; i++) { 2735 if (!adev->ip_blocks[i].status.hw) 2736 continue; 2737 if (adev->ip_blocks[i].version->funcs->late_init) { 2738 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2739 if (r) { 2740 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2741 adev->ip_blocks[i].version->funcs->name, r); 2742 return r; 2743 } 2744 } 2745 adev->ip_blocks[i].status.late_initialized = true; 2746 } 2747 2748 r = amdgpu_ras_late_init(adev); 2749 if (r) { 2750 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2751 return r; 2752 } 2753 2754 amdgpu_ras_set_error_query_ready(adev, true); 2755 2756 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2757 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2758 2759 amdgpu_device_fill_reset_magic(adev); 2760 2761 r = amdgpu_device_enable_mgpu_fan_boost(); 2762 if (r) 2763 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2764 2765 /* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */ 2766 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2767 adev->asic_type == CHIP_ALDEBARAN)) 2768 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2769 2770 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2771 mutex_lock(&mgpu_info.mutex); 2772 2773 /* 2774 * Reset the device p-state to low, as this was booted with high. 2775 * 2776 * This should be performed only after all devices from the same 2777 * hive get initialized. 2778 * 2779 * However, the number of devices in the hive is not known in advance; 2780 * it is only counted one by one as the devices initialize. 2781 * 2782 * So we wait until all XGMI interlinked devices have initialized. 2783 * This may bring some delays as those devices may come from 2784 * different hives. But that should be OK.
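 *
 * Concretely, the pstate request below is only issued once the number of
 * registered dGPUs (mgpu_info.num_dgpu) matches this device's
 * gmc.xgmi.num_physical_nodes.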
2785 */ 2786 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2787 for (i = 0; i < mgpu_info.num_gpu; i++) { 2788 gpu_instance = &(mgpu_info.gpu_ins[i]); 2789 if (gpu_instance->adev->flags & AMD_IS_APU) 2790 continue; 2791 2792 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2793 AMDGPU_XGMI_PSTATE_MIN); 2794 if (r) { 2795 DRM_ERROR("pstate setting failed (%d).\n", r); 2796 break; 2797 } 2798 } 2799 } 2800 2801 mutex_unlock(&mgpu_info.mutex); 2802 } 2803 2804 return 0; 2805 } 2806 2807 /** 2808 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2809 * 2810 * @adev: amdgpu_device pointer 2811 * 2812 * For ASICs need to disable SMC first 2813 */ 2814 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2815 { 2816 int i, r; 2817 2818 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2819 return; 2820 2821 for (i = 0; i < adev->num_ip_blocks; i++) { 2822 if (!adev->ip_blocks[i].status.hw) 2823 continue; 2824 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2825 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2826 /* XXX handle errors */ 2827 if (r) { 2828 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2829 adev->ip_blocks[i].version->funcs->name, r); 2830 } 2831 adev->ip_blocks[i].status.hw = false; 2832 break; 2833 } 2834 } 2835 } 2836 2837 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2838 { 2839 int i, r; 2840 2841 for (i = 0; i < adev->num_ip_blocks; i++) { 2842 if (!adev->ip_blocks[i].version->funcs->early_fini) 2843 continue; 2844 2845 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2846 if (r) { 2847 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2848 adev->ip_blocks[i].version->funcs->name, r); 2849 } 2850 } 2851 2852 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2853 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2854 2855 amdgpu_amdkfd_suspend(adev, false); 2856 2857 /* Workaroud for ASICs need to disable SMC first */ 2858 amdgpu_device_smu_fini_early(adev); 2859 2860 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2861 if (!adev->ip_blocks[i].status.hw) 2862 continue; 2863 2864 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2865 /* XXX handle errors */ 2866 if (r) { 2867 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2868 adev->ip_blocks[i].version->funcs->name, r); 2869 } 2870 2871 adev->ip_blocks[i].status.hw = false; 2872 } 2873 2874 if (amdgpu_sriov_vf(adev)) { 2875 if (amdgpu_virt_release_full_gpu(adev, false)) 2876 DRM_ERROR("failed to release exclusive mode on fini\n"); 2877 } 2878 2879 return 0; 2880 } 2881 2882 /** 2883 * amdgpu_device_ip_fini - run fini for hardware IPs 2884 * 2885 * @adev: amdgpu_device pointer 2886 * 2887 * Main teardown pass for hardware IPs. The list of all the hardware 2888 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2889 * are run. hw_fini tears down the hardware associated with each IP 2890 * and sw_fini tears down any software state associated with each IP. 2891 * Returns 0 on success, negative error code on failure. 
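 *
 * Blocks are torn down in the reverse of their init order; when the GMC
 * block is reached, the ucode BO, static CSA, writeback memory, VRAM
 * scratch and IB pool that were created around its init are freed as
 * well.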
2892 */ 2893 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2894 { 2895 int i, r; 2896 2897 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2898 amdgpu_virt_release_ras_err_handler_data(adev); 2899 2900 if (adev->gmc.xgmi.num_physical_nodes > 1) 2901 amdgpu_xgmi_remove_device(adev); 2902 2903 amdgpu_amdkfd_device_fini_sw(adev); 2904 2905 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2906 if (!adev->ip_blocks[i].status.sw) 2907 continue; 2908 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2910 amdgpu_ucode_free_bo(adev); 2911 amdgpu_free_static_csa(&adev->virt.csa_obj); 2912 amdgpu_device_wb_fini(adev); 2913 amdgpu_device_vram_scratch_fini(adev); 2914 amdgpu_ib_pool_fini(adev); 2915 } 2916 2917 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2918 /* XXX handle errors */ 2919 if (r) { 2920 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2921 adev->ip_blocks[i].version->funcs->name, r); 2922 } 2923 adev->ip_blocks[i].status.sw = false; 2924 adev->ip_blocks[i].status.valid = false; 2925 } 2926 2927 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2928 if (!adev->ip_blocks[i].status.late_initialized) 2929 continue; 2930 if (adev->ip_blocks[i].version->funcs->late_fini) 2931 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2932 adev->ip_blocks[i].status.late_initialized = false; 2933 } 2934 2935 amdgpu_ras_fini(adev); 2936 2937 return 0; 2938 } 2939 2940 /** 2941 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2942 * 2943 * @work: work_struct. 2944 */ 2945 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2946 { 2947 struct amdgpu_device *adev = 2948 container_of(work, struct amdgpu_device, delayed_init_work.work); 2949 int r; 2950 2951 r = amdgpu_ib_ring_tests(adev); 2952 if (r) 2953 DRM_ERROR("ib ring test failed (%d).\n", r); 2954 } 2955 2956 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2957 { 2958 struct amdgpu_device *adev = 2959 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2960 2961 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2962 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2963 2964 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2965 adev->gfx.gfx_off_state = true; 2966 } 2967 2968 /** 2969 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2970 * 2971 * @adev: amdgpu_device pointer 2972 * 2973 * Main suspend function for hardware IPs. The list of all the hardware 2974 * IPs that make up the asic is walked, clockgating is disabled and the 2975 * suspend callbacks are run. suspend puts the hardware and software state 2976 * in each IP into a state suitable for suspend. 2977 * Returns 0 on success, negative error code on failure. 2978 */ 2979 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2980 { 2981 int i, r; 2982 2983 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2984 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2985 2986 /* 2987 * Per PMFW team's suggestion, driver needs to handle gfxoff 2988 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2989 * scenario. Add the missing df cstate disablement here. 
2990 */ 2991 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2992 dev_warn(adev->dev, "Failed to disallow df cstate"); 2993 2994 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2995 if (!adev->ip_blocks[i].status.valid) 2996 continue; 2997 2998 /* displays are handled separately */ 2999 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3000 continue; 3001 3002 /* XXX handle errors */ 3003 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3004 /* XXX handle errors */ 3005 if (r) { 3006 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3007 adev->ip_blocks[i].version->funcs->name, r); 3008 return r; 3009 } 3010 3011 adev->ip_blocks[i].status.hw = false; 3012 } 3013 3014 return 0; 3015 } 3016 3017 /** 3018 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3019 * 3020 * @adev: amdgpu_device pointer 3021 * 3022 * Main suspend function for hardware IPs. The list of all the hardware 3023 * IPs that make up the asic is walked, clockgating is disabled and the 3024 * suspend callbacks are run. suspend puts the hardware and software state 3025 * in each IP into a state suitable for suspend. 3026 * Returns 0 on success, negative error code on failure. 3027 */ 3028 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3029 { 3030 int i, r; 3031 3032 if (adev->in_s0ix) 3033 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3034 3035 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3036 if (!adev->ip_blocks[i].status.valid) 3037 continue; 3038 /* displays are handled in phase1 */ 3039 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3040 continue; 3041 /* PSP lost connection when err_event_athub occurs */ 3042 if (amdgpu_ras_intr_triggered() && 3043 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3044 adev->ip_blocks[i].status.hw = false; 3045 continue; 3046 } 3047 3048 /* skip unnecessary suspend if we do not initialize them yet */ 3049 if (adev->gmc.xgmi.pending_reset && 3050 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3051 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3053 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3054 adev->ip_blocks[i].status.hw = false; 3055 continue; 3056 } 3057 3058 /* skip suspend of gfx/mes and psp for S0ix 3059 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3060 * like at runtime. PSP is also part of the always on hardware 3061 * so no need to suspend it. 
3062 */ 3063 if (adev->in_s0ix && 3064 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3065 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3066 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3067 continue; 3068 3069 /* XXX handle errors */ 3070 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3071 /* XXX handle errors */ 3072 if (r) { 3073 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3074 adev->ip_blocks[i].version->funcs->name, r); 3075 } 3076 adev->ip_blocks[i].status.hw = false; 3077 /* handle putting the SMC in the appropriate state */ 3078 if(!amdgpu_sriov_vf(adev)){ 3079 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3080 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3081 if (r) { 3082 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3083 adev->mp1_state, r); 3084 return r; 3085 } 3086 } 3087 } 3088 } 3089 3090 return 0; 3091 } 3092 3093 /** 3094 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3095 * 3096 * @adev: amdgpu_device pointer 3097 * 3098 * Main suspend function for hardware IPs. The list of all the hardware 3099 * IPs that make up the asic is walked, clockgating is disabled and the 3100 * suspend callbacks are run. suspend puts the hardware and software state 3101 * in each IP into a state suitable for suspend. 3102 * Returns 0 on success, negative error code on failure. 3103 */ 3104 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3105 { 3106 int r; 3107 3108 if (amdgpu_sriov_vf(adev)) { 3109 amdgpu_virt_fini_data_exchange(adev); 3110 amdgpu_virt_request_full_gpu(adev, false); 3111 } 3112 3113 r = amdgpu_device_ip_suspend_phase1(adev); 3114 if (r) 3115 return r; 3116 r = amdgpu_device_ip_suspend_phase2(adev); 3117 3118 if (amdgpu_sriov_vf(adev)) 3119 amdgpu_virt_release_full_gpu(adev, false); 3120 3121 return r; 3122 } 3123 3124 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3125 { 3126 int i, r; 3127 3128 static enum amd_ip_block_type ip_order[] = { 3129 AMD_IP_BLOCK_TYPE_COMMON, 3130 AMD_IP_BLOCK_TYPE_GMC, 3131 AMD_IP_BLOCK_TYPE_PSP, 3132 AMD_IP_BLOCK_TYPE_IH, 3133 }; 3134 3135 for (i = 0; i < adev->num_ip_blocks; i++) { 3136 int j; 3137 struct amdgpu_ip_block *block; 3138 3139 block = &adev->ip_blocks[i]; 3140 block->status.hw = false; 3141 3142 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3143 3144 if (block->version->type != ip_order[j] || 3145 !block->status.valid) 3146 continue; 3147 3148 r = block->version->funcs->hw_init(adev); 3149 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3150 if (r) 3151 return r; 3152 block->status.hw = true; 3153 } 3154 } 3155 3156 return 0; 3157 } 3158 3159 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3160 { 3161 int i, r; 3162 3163 static enum amd_ip_block_type ip_order[] = { 3164 AMD_IP_BLOCK_TYPE_SMC, 3165 AMD_IP_BLOCK_TYPE_DCE, 3166 AMD_IP_BLOCK_TYPE_GFX, 3167 AMD_IP_BLOCK_TYPE_SDMA, 3168 AMD_IP_BLOCK_TYPE_UVD, 3169 AMD_IP_BLOCK_TYPE_VCE, 3170 AMD_IP_BLOCK_TYPE_VCN 3171 }; 3172 3173 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3174 int j; 3175 struct amdgpu_ip_block *block; 3176 3177 for (j = 0; j < adev->num_ip_blocks; j++) { 3178 block = &adev->ip_blocks[j]; 3179 3180 if (block->version->type != ip_order[i] || 3181 !block->status.valid || 3182 block->status.hw) 3183 continue; 3184 3185 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3186 r = block->version->funcs->resume(adev); 3187 else 3188 r = block->version->funcs->hw_init(adev); 
3189 3190 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3191 if (r) 3192 return r; 3193 block->status.hw = true; 3194 } 3195 } 3196 3197 return 0; 3198 } 3199 3200 /** 3201 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3202 * 3203 * @adev: amdgpu_device pointer 3204 * 3205 * First resume function for hardware IPs. The list of all the hardware 3206 * IPs that make up the asic is walked and the resume callbacks are run for 3207 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3208 * after a suspend and updates the software state as necessary. This 3209 * function is also used for restoring the GPU after a GPU reset. 3210 * Returns 0 on success, negative error code on failure. 3211 */ 3212 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3213 { 3214 int i, r; 3215 3216 for (i = 0; i < adev->num_ip_blocks; i++) { 3217 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3218 continue; 3219 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3222 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3223 3224 r = adev->ip_blocks[i].version->funcs->resume(adev); 3225 if (r) { 3226 DRM_ERROR("resume of IP block <%s> failed %d\n", 3227 adev->ip_blocks[i].version->funcs->name, r); 3228 return r; 3229 } 3230 adev->ip_blocks[i].status.hw = true; 3231 } 3232 } 3233 3234 return 0; 3235 } 3236 3237 /** 3238 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3239 * 3240 * @adev: amdgpu_device pointer 3241 * 3242 * First resume function for hardware IPs. The list of all the hardware 3243 * IPs that make up the asic is walked and the resume callbacks are run for 3244 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3245 * functional state after a suspend and updates the software state as 3246 * necessary. This function is also used for restoring the GPU after a GPU 3247 * reset. 3248 * Returns 0 on success, negative error code on failure. 3249 */ 3250 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3251 { 3252 int i, r; 3253 3254 for (i = 0; i < adev->num_ip_blocks; i++) { 3255 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3256 continue; 3257 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3258 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3259 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3260 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3261 continue; 3262 r = adev->ip_blocks[i].version->funcs->resume(adev); 3263 if (r) { 3264 DRM_ERROR("resume of IP block <%s> failed %d\n", 3265 adev->ip_blocks[i].version->funcs->name, r); 3266 return r; 3267 } 3268 adev->ip_blocks[i].status.hw = true; 3269 3270 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3271 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3272 * amdgpu_device_resume() after IP resume. 3273 */ 3274 amdgpu_gfx_off_ctrl(adev, false); 3275 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3276 } 3277 3278 } 3279 3280 return 0; 3281 } 3282 3283 /** 3284 * amdgpu_device_ip_resume - run resume for hardware IPs 3285 * 3286 * @adev: amdgpu_device pointer 3287 * 3288 * Main resume function for hardware IPs. 
The hardware IPs 3289 * are split into two resume functions because they are 3290 * are also used in in recovering from a GPU reset and some additional 3291 * steps need to be take between them. In this case (S3/S4) they are 3292 * run sequentially. 3293 * Returns 0 on success, negative error code on failure. 3294 */ 3295 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3296 { 3297 int r; 3298 3299 r = amdgpu_amdkfd_resume_iommu(adev); 3300 if (r) 3301 return r; 3302 3303 r = amdgpu_device_ip_resume_phase1(adev); 3304 if (r) 3305 return r; 3306 3307 r = amdgpu_device_fw_loading(adev); 3308 if (r) 3309 return r; 3310 3311 r = amdgpu_device_ip_resume_phase2(adev); 3312 3313 return r; 3314 } 3315 3316 /** 3317 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3318 * 3319 * @adev: amdgpu_device pointer 3320 * 3321 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3322 */ 3323 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3324 { 3325 if (amdgpu_sriov_vf(adev)) { 3326 if (adev->is_atom_fw) { 3327 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3328 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3329 } else { 3330 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3331 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3332 } 3333 3334 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3335 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3336 } 3337 } 3338 3339 /** 3340 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3341 * 3342 * @asic_type: AMD asic type 3343 * 3344 * Check if there is DC (new modesetting infrastructre) support for an asic. 3345 * returns true if DC has support, false if not. 3346 */ 3347 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3348 { 3349 switch (asic_type) { 3350 #ifdef CONFIG_DRM_AMDGPU_SI 3351 case CHIP_HAINAN: 3352 #endif 3353 case CHIP_TOPAZ: 3354 /* chips with no display hardware */ 3355 return false; 3356 #if defined(CONFIG_DRM_AMD_DC) 3357 case CHIP_TAHITI: 3358 case CHIP_PITCAIRN: 3359 case CHIP_VERDE: 3360 case CHIP_OLAND: 3361 /* 3362 * We have systems in the wild with these ASICs that require 3363 * LVDS and VGA support which is not supported with DC. 3364 * 3365 * Fallback to the non-DC driver here by default so as not to 3366 * cause regressions. 3367 */ 3368 #if defined(CONFIG_DRM_AMD_DC_SI) 3369 return amdgpu_dc > 0; 3370 #else 3371 return false; 3372 #endif 3373 case CHIP_BONAIRE: 3374 case CHIP_KAVERI: 3375 case CHIP_KABINI: 3376 case CHIP_MULLINS: 3377 /* 3378 * We have systems in the wild with these ASICs that require 3379 * VGA support which is not supported with DC. 3380 * 3381 * Fallback to the non-DC driver here by default so as not to 3382 * cause regressions. 
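 *
 * Users can still opt in explicitly by setting the dc module parameter
 * to 1, which is what the amdgpu_dc > 0 check below reflects.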
3383 */ 3384 return amdgpu_dc > 0; 3385 default: 3386 return amdgpu_dc != 0; 3387 #else 3388 default: 3389 if (amdgpu_dc > 0) 3390 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3391 "but isn't supported by ASIC, ignoring\n"); 3392 return false; 3393 #endif 3394 } 3395 } 3396 3397 /** 3398 * amdgpu_device_has_dc_support - check if dc is supported 3399 * 3400 * @adev: amdgpu_device pointer 3401 * 3402 * Returns true for supported, false for not supported 3403 */ 3404 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3405 { 3406 if (amdgpu_sriov_vf(adev) || 3407 adev->enable_virtual_display || 3408 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3409 return false; 3410 3411 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3412 } 3413 3414 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3415 { 3416 struct amdgpu_device *adev = 3417 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3418 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3419 3420 /* It's a bug to not have a hive within this function */ 3421 if (WARN_ON(!hive)) 3422 return; 3423 3424 /* 3425 * Use task barrier to synchronize all xgmi reset works across the 3426 * hive. task_barrier_enter and task_barrier_exit will block 3427 * until all the threads running the xgmi reset works reach 3428 * those points. task_barrier_full will do both blocks. 3429 */ 3430 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3431 3432 task_barrier_enter(&hive->tb); 3433 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3434 3435 if (adev->asic_reset_res) 3436 goto fail; 3437 3438 task_barrier_exit(&hive->tb); 3439 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3440 3441 if (adev->asic_reset_res) 3442 goto fail; 3443 3444 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3445 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3446 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3447 } else { 3448 3449 task_barrier_full(&hive->tb); 3450 adev->asic_reset_res = amdgpu_asic_reset(adev); 3451 } 3452 3453 fail: 3454 if (adev->asic_reset_res) 3455 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3456 adev->asic_reset_res, adev_to_drm(adev)->unique); 3457 amdgpu_put_xgmi_hive(hive); 3458 } 3459 3460 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3461 { 3462 char *input = amdgpu_lockup_timeout; 3463 char *timeout_setting = NULL; 3464 int index = 0; 3465 long timeout; 3466 int ret = 0; 3467 3468 /* 3469 * By default timeout for non compute jobs is 10000 3470 * and 60000 for compute jobs. 3471 * In SR-IOV or passthrough mode, timeout for compute 3472 * jobs are 60000 by default. 3473 */ 3474 adev->gfx_timeout = msecs_to_jiffies(10000); 3475 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3476 if (amdgpu_sriov_vf(adev)) 3477 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3478 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3479 else 3480 adev->compute_timeout = msecs_to_jiffies(60000); 3481 3482 #ifdef notyet 3483 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3484 while ((timeout_setting = strsep(&input, ",")) && 3485 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3486 ret = kstrtol(timeout_setting, 0, &timeout); 3487 if (ret) 3488 return ret; 3489 3490 if (timeout == 0) { 3491 index++; 3492 continue; 3493 } else if (timeout < 0) { 3494 timeout = MAX_SCHEDULE_TIMEOUT; 3495 dev_warn(adev->dev, "lockup timeout disabled"); 3496 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3497 } else { 3498 timeout = msecs_to_jiffies(timeout); 3499 } 3500 3501 switch (index++) { 3502 case 0: 3503 adev->gfx_timeout = timeout; 3504 break; 3505 case 1: 3506 adev->compute_timeout = timeout; 3507 break; 3508 case 2: 3509 adev->sdma_timeout = timeout; 3510 break; 3511 case 3: 3512 adev->video_timeout = timeout; 3513 break; 3514 default: 3515 break; 3516 } 3517 } 3518 /* 3519 * There is only one value specified and 3520 * it should apply to all non-compute jobs. 3521 */ 3522 if (index == 1) { 3523 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3524 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3525 adev->compute_timeout = adev->gfx_timeout; 3526 } 3527 } 3528 #endif 3529 3530 return ret; 3531 } 3532 3533 /** 3534 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3535 * 3536 * @adev: amdgpu_device pointer 3537 * 3538 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3539 */ 3540 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3541 { 3542 #ifdef notyet 3543 struct iommu_domain *domain; 3544 3545 domain = iommu_get_domain_for_dev(adev->dev); 3546 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3547 #endif 3548 adev->ram_is_direct_mapped = true; 3549 } 3550 3551 static const struct attribute *amdgpu_dev_attributes[] = { 3552 &dev_attr_product_name.attr, 3553 &dev_attr_product_number.attr, 3554 &dev_attr_serial_number.attr, 3555 &dev_attr_pcie_replay_count.attr, 3556 NULL 3557 }; 3558 3559 /** 3560 * amdgpu_device_init - initialize the driver 3561 * 3562 * @adev: amdgpu_device pointer 3563 * @flags: driver flags 3564 * 3565 * Initializes the driver info and hw (all asics). 3566 * Returns 0 for success or an error on failure. 3567 * Called at driver startup. 
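 *
 * Very roughly, the sequence below is: install the default register
 * accessors, initialize locks and work items, map the MMIO BAR, create the
 * per-device reset domain, run IP early init, post the card and read clock
 * info from the vBIOS if needed, run the main IP init, then register the
 * pm/ucode/psp sysfs interfaces, run IP late init and schedule the delayed
 * IB-test work.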
3568 */ 3569 int amdgpu_device_init(struct amdgpu_device *adev, 3570 uint32_t flags) 3571 { 3572 struct drm_device *ddev = adev_to_drm(adev); 3573 struct pci_dev *pdev = adev->pdev; 3574 int r, i; 3575 bool px = false; 3576 u32 max_MBps; 3577 3578 adev->shutdown = false; 3579 adev->flags = flags; 3580 3581 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3582 adev->asic_type = amdgpu_force_asic_type; 3583 else 3584 adev->asic_type = flags & AMD_ASIC_MASK; 3585 3586 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3587 if (amdgpu_emu_mode == 1) 3588 adev->usec_timeout *= 10; 3589 adev->gmc.gart_size = 512 * 1024 * 1024; 3590 adev->accel_working = false; 3591 adev->num_rings = 0; 3592 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3593 adev->mman.buffer_funcs = NULL; 3594 adev->mman.buffer_funcs_ring = NULL; 3595 adev->vm_manager.vm_pte_funcs = NULL; 3596 adev->vm_manager.vm_pte_num_scheds = 0; 3597 adev->gmc.gmc_funcs = NULL; 3598 adev->harvest_ip_mask = 0x0; 3599 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3600 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3601 3602 adev->smc_rreg = &amdgpu_invalid_rreg; 3603 adev->smc_wreg = &amdgpu_invalid_wreg; 3604 adev->pcie_rreg = &amdgpu_invalid_rreg; 3605 adev->pcie_wreg = &amdgpu_invalid_wreg; 3606 adev->pciep_rreg = &amdgpu_invalid_rreg; 3607 adev->pciep_wreg = &amdgpu_invalid_wreg; 3608 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3609 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3610 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3611 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3612 adev->didt_rreg = &amdgpu_invalid_rreg; 3613 adev->didt_wreg = &amdgpu_invalid_wreg; 3614 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3615 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3616 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3617 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3618 3619 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3620 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3621 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3622 3623 /* mutex initialization are all done here so we 3624 * can recall function without having locking issues */ 3625 rw_init(&adev->firmware.mutex, "agfw"); 3626 rw_init(&adev->pm.mutex, "agpm"); 3627 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3628 rw_init(&adev->srbm_mutex, "srbm"); 3629 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3630 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3631 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3632 rw_init(&adev->mn_lock, "agpumn"); 3633 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3634 hash_init(adev->mn_hash); 3635 rw_init(&adev->psp.mutex, "agpsp"); 3636 rw_init(&adev->notifier_lock, "agnf"); 3637 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3638 rw_init(&adev->benchmark_mutex, "agbm"); 3639 3640 amdgpu_device_init_apu_flags(adev); 3641 3642 r = amdgpu_device_check_arguments(adev); 3643 if (r) 3644 return r; 3645 3646 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3647 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3648 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3649 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3650 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3651 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3652 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3653 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3654 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3655 3656 INIT_LIST_HEAD(&adev->shadow_list); 3657 rw_init(&adev->shadow_list_lock, 
"sdwlst"); 3658 3659 INIT_LIST_HEAD(&adev->reset_list); 3660 3661 INIT_LIST_HEAD(&adev->ras_list); 3662 3663 INIT_DELAYED_WORK(&adev->delayed_init_work, 3664 amdgpu_device_delayed_init_work_handler); 3665 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3666 amdgpu_device_delay_enable_gfx_off); 3667 3668 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3669 3670 adev->gfx.gfx_off_req_count = 1; 3671 adev->gfx.gfx_off_residency = 0; 3672 adev->gfx.gfx_off_entrycount = 0; 3673 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3674 3675 atomic_set(&adev->throttling_logging_enabled, 1); 3676 /* 3677 * If throttling continues, logging will be performed every minute 3678 * to avoid log flooding. "-1" is subtracted since the thermal 3679 * throttling interrupt comes every second. Thus, the total logging 3680 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3681 * for throttling interrupt) = 60 seconds. 3682 */ 3683 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3684 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3685 3686 #ifdef __linux__ 3687 /* Registers mapping */ 3688 /* TODO: block userspace mapping of io register */ 3689 if (adev->asic_type >= CHIP_BONAIRE) { 3690 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3691 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3692 } else { 3693 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3694 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3695 } 3696 3697 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3698 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3699 3700 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3701 if (adev->rmmio == NULL) { 3702 return -ENOMEM; 3703 } 3704 #endif 3705 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3706 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3707 3708 amdgpu_device_get_pcie_info(adev); 3709 3710 if (amdgpu_mcbp) 3711 DRM_INFO("MCBP is enabled\n"); 3712 3713 /* 3714 * Reset domain needs to be present early, before XGMI hive discovered 3715 * (if any) and intitialized to use reset sem and in_gpu reset flag 3716 * early on during init and before calling to RREG32. 
3717 */ 3718 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3719 if (!adev->reset_domain) 3720 return -ENOMEM; 3721 3722 /* detect hw virtualization here */ 3723 amdgpu_detect_virtualization(adev); 3724 3725 r = amdgpu_device_get_job_timeout_settings(adev); 3726 if (r) { 3727 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3728 return r; 3729 } 3730 3731 /* early init functions */ 3732 r = amdgpu_device_ip_early_init(adev); 3733 if (r) 3734 return r; 3735 3736 /* Get rid of things like offb */ 3737 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3738 if (r) 3739 return r; 3740 3741 /* Enable TMZ based on IP_VERSION */ 3742 amdgpu_gmc_tmz_set(adev); 3743 3744 amdgpu_gmc_noretry_set(adev); 3745 /* Need to get xgmi info early to decide the reset behavior*/ 3746 if (adev->gmc.xgmi.supported) { 3747 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3748 if (r) 3749 return r; 3750 } 3751 3752 /* enable PCIE atomic ops */ 3753 #ifdef notyet 3754 if (amdgpu_sriov_vf(adev)) 3755 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3756 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3757 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3758 else 3759 adev->have_atomics_support = 3760 !pci_enable_atomic_ops_to_root(adev->pdev, 3761 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3762 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3763 if (!adev->have_atomics_support) 3764 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3765 #else 3766 adev->have_atomics_support = false; 3767 #endif 3768 3769 /* doorbell bar mapping and doorbell index init*/ 3770 amdgpu_device_doorbell_init(adev); 3771 3772 if (amdgpu_emu_mode == 1) { 3773 /* post the asic on emulation mode */ 3774 emu_soc_asic_init(adev); 3775 goto fence_driver_init; 3776 } 3777 3778 amdgpu_reset_init(adev); 3779 3780 /* detect if we are with an SRIOV vbios */ 3781 amdgpu_device_detect_sriov_bios(adev); 3782 3783 /* check if we need to reset the asic 3784 * E.g., driver was not cleanly unloaded previously, etc. 
3785 */ 3786 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3787 if (adev->gmc.xgmi.num_physical_nodes) { 3788 dev_info(adev->dev, "Pending hive reset.\n"); 3789 adev->gmc.xgmi.pending_reset = true; 3790 /* Only need to init necessary block for SMU to handle the reset */ 3791 for (i = 0; i < adev->num_ip_blocks; i++) { 3792 if (!adev->ip_blocks[i].status.valid) 3793 continue; 3794 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3795 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3796 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3797 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3798 DRM_DEBUG("IP %s disabled for hw_init.\n", 3799 adev->ip_blocks[i].version->funcs->name); 3800 adev->ip_blocks[i].status.hw = true; 3801 } 3802 } 3803 } else { 3804 r = amdgpu_asic_reset(adev); 3805 if (r) { 3806 dev_err(adev->dev, "asic reset on init failed\n"); 3807 goto failed; 3808 } 3809 } 3810 } 3811 3812 pci_enable_pcie_error_reporting(adev->pdev); 3813 3814 /* Post card if necessary */ 3815 if (amdgpu_device_need_post(adev)) { 3816 if (!adev->bios) { 3817 dev_err(adev->dev, "no vBIOS found\n"); 3818 r = -EINVAL; 3819 goto failed; 3820 } 3821 DRM_INFO("GPU posting now...\n"); 3822 r = amdgpu_device_asic_init(adev); 3823 if (r) { 3824 dev_err(adev->dev, "gpu post error!\n"); 3825 goto failed; 3826 } 3827 } 3828 3829 if (adev->is_atom_fw) { 3830 /* Initialize clocks */ 3831 r = amdgpu_atomfirmware_get_clock_info(adev); 3832 if (r) { 3833 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3834 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3835 goto failed; 3836 } 3837 } else { 3838 /* Initialize clocks */ 3839 r = amdgpu_atombios_get_clock_info(adev); 3840 if (r) { 3841 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3842 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3843 goto failed; 3844 } 3845 /* init i2c buses */ 3846 if (!amdgpu_device_has_dc_support(adev)) 3847 amdgpu_atombios_i2c_init(adev); 3848 } 3849 3850 fence_driver_init: 3851 /* Fence driver */ 3852 r = amdgpu_fence_driver_sw_init(adev); 3853 if (r) { 3854 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3855 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3856 goto failed; 3857 } 3858 3859 /* init the mode config */ 3860 drm_mode_config_init(adev_to_drm(adev)); 3861 3862 r = amdgpu_device_ip_init(adev); 3863 if (r) { 3864 /* failed in exclusive mode due to timeout */ 3865 if (amdgpu_sriov_vf(adev) && 3866 !amdgpu_sriov_runtime(adev) && 3867 amdgpu_virt_mmio_blocked(adev) && 3868 !amdgpu_virt_wait_reset(adev)) { 3869 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3870 /* Don't send request since VF is inactive. 
*/ 3871 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3872 adev->virt.ops = NULL; 3873 r = -EAGAIN; 3874 goto release_ras_con; 3875 } 3876 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3877 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3878 goto release_ras_con; 3879 } 3880 3881 amdgpu_fence_driver_hw_init(adev); 3882 3883 dev_info(adev->dev, 3884 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3885 adev->gfx.config.max_shader_engines, 3886 adev->gfx.config.max_sh_per_se, 3887 adev->gfx.config.max_cu_per_sh, 3888 adev->gfx.cu_info.number); 3889 3890 #ifdef __OpenBSD__ 3891 { 3892 const char *chip_name; 3893 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3894 int maj, min, rev; 3895 3896 switch (adev->asic_type) { 3897 case CHIP_RAVEN: 3898 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3899 chip_name = "RAVEN2"; 3900 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3901 chip_name = "PICASSO"; 3902 else 3903 chip_name = "RAVEN"; 3904 break; 3905 case CHIP_RENOIR: 3906 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3907 chip_name = "RENOIR"; 3908 else 3909 chip_name = "GREEN_SARDINE"; 3910 break; 3911 default: 3912 chip_name = amdgpu_asic_name[adev->asic_type]; 3913 } 3914 3915 printf("%s: %s", adev->self.dv_xname, chip_name); 3916 /* show graphics/compute ip block version, not set on < GFX9 */ 3917 if (version) { 3918 maj = IP_VERSION_MAJ(version); 3919 min = IP_VERSION_MIN(version); 3920 rev = IP_VERSION_REV(version); 3921 printf(" GC %d.%d.%d", maj, min, rev); 3922 } 3923 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3924 } 3925 #endif 3926 3927 adev->accel_working = true; 3928 3929 amdgpu_vm_check_compute_bug(adev); 3930 3931 /* Initialize the buffer migration limit. */ 3932 if (amdgpu_moverate >= 0) 3933 max_MBps = amdgpu_moverate; 3934 else 3935 max_MBps = 8; /* Allow 8 MB/s. */ 3936 /* Get a log2 for easy divisions. */ 3937 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3938 3939 r = amdgpu_pm_sysfs_init(adev); 3940 if (r) { 3941 adev->pm_sysfs_en = false; 3942 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3943 } else 3944 adev->pm_sysfs_en = true; 3945 3946 r = amdgpu_ucode_sysfs_init(adev); 3947 if (r) { 3948 adev->ucode_sysfs_en = false; 3949 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3950 } else 3951 adev->ucode_sysfs_en = true; 3952 3953 r = amdgpu_psp_sysfs_init(adev); 3954 if (r) { 3955 adev->psp_sysfs_en = false; 3956 if (!amdgpu_sriov_vf(adev)) 3957 DRM_ERROR("Creating psp sysfs failed\n"); 3958 } else 3959 adev->psp_sysfs_en = true; 3960 3961 /* 3962 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3963 * Otherwise the mgpu fan boost feature will be skipped due to the 3964 * gpu instance is counted less. 3965 */ 3966 amdgpu_register_gpu_instance(adev); 3967 3968 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3969 * explicit gating rather than handling it automatically. 3970 */ 3971 if (!adev->gmc.xgmi.pending_reset) { 3972 r = amdgpu_device_ip_late_init(adev); 3973 if (r) { 3974 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3975 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3976 goto release_ras_con; 3977 } 3978 /* must succeed. 
*/ 3979 amdgpu_ras_resume(adev); 3980 queue_delayed_work(system_wq, &adev->delayed_init_work, 3981 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3982 } 3983 3984 if (amdgpu_sriov_vf(adev)) 3985 flush_delayed_work(&adev->delayed_init_work); 3986 3987 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3988 if (r) 3989 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3990 3991 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3992 r = amdgpu_pmu_init(adev); 3993 if (r) 3994 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3995 3996 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3997 if (amdgpu_device_cache_pci_state(adev->pdev)) 3998 pci_restore_state(pdev); 3999 4000 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4001 /* this will fail for cards that aren't VGA class devices, just 4002 * ignore it */ 4003 #ifdef notyet 4004 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4005 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4006 #endif 4007 4008 if (amdgpu_device_supports_px(ddev)) { 4009 px = true; 4010 vga_switcheroo_register_client(adev->pdev, 4011 &amdgpu_switcheroo_ops, px); 4012 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4013 } 4014 4015 if (adev->gmc.xgmi.pending_reset) 4016 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4017 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4018 4019 amdgpu_device_check_iommu_direct_map(adev); 4020 4021 return 0; 4022 4023 release_ras_con: 4024 amdgpu_release_ras_context(adev); 4025 4026 failed: 4027 amdgpu_vf_error_trans_all(adev); 4028 4029 return r; 4030 } 4031 4032 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4033 { 4034 STUB(); 4035 #ifdef notyet 4036 /* Clear all CPU mappings pointing to this device */ 4037 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4038 #endif 4039 4040 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4041 amdgpu_device_doorbell_fini(adev); 4042 4043 #ifdef __linux__ 4044 iounmap(adev->rmmio); 4045 adev->rmmio = NULL; 4046 if (adev->mman.aper_base_kaddr) 4047 iounmap(adev->mman.aper_base_kaddr); 4048 adev->mman.aper_base_kaddr = NULL; 4049 #else 4050 if (adev->rmmio_size > 0) 4051 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4052 adev->rmmio_size); 4053 adev->rmmio_size = 0; 4054 adev->rmmio = NULL; 4055 if (adev->mman.aper_base_kaddr) 4056 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4057 adev->gmc.visible_vram_size); 4058 adev->mman.aper_base_kaddr = NULL; 4059 #endif 4060 4061 /* Memory manager related */ 4062 if (!adev->gmc.xgmi.connected_to_cpu) { 4063 #ifdef __linux__ 4064 arch_phys_wc_del(adev->gmc.vram_mtrr); 4065 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4066 #else 4067 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4068 #endif 4069 } 4070 } 4071 4072 /** 4073 * amdgpu_device_fini_hw - tear down the driver 4074 * 4075 * @adev: amdgpu_device pointer 4076 * 4077 * Tear down the driver info (all asics). 4078 * Called at driver shutdown. 
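*
* The teardown is intentionally split in two halves: this function quiesces
* the hardware (interrupts, fences, sysfs, and, once unplugged, the MMIO
* mappings) while the device is still reachable, and amdgpu_device_fini_sw()
* later frees the remaining software state. A rough sketch of the expected
* ordering in a remove path (illustrative only; the real unload/release hooks
* live elsewhere in the driver):
*
*	drm_dev_unplug(adev_to_drm(adev));
*	amdgpu_device_fini_hw(adev);
*	...
*	amdgpu_device_fini_sw(adev);	// from the drm release callback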
4079 */ 4080 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4081 { 4082 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4083 flush_delayed_work(&adev->delayed_init_work); 4084 adev->shutdown = true; 4085 4086 /* make sure IB test finished before entering exclusive mode 4087 * to avoid preemption on IB test 4088 * */ 4089 if (amdgpu_sriov_vf(adev)) { 4090 amdgpu_virt_request_full_gpu(adev, false); 4091 amdgpu_virt_fini_data_exchange(adev); 4092 } 4093 4094 /* disable all interrupts */ 4095 amdgpu_irq_disable_all(adev); 4096 if (adev->mode_info.mode_config_initialized){ 4097 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4098 drm_helper_force_disable_all(adev_to_drm(adev)); 4099 else 4100 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4101 } 4102 amdgpu_fence_driver_hw_fini(adev); 4103 4104 if (adev->mman.initialized) { 4105 flush_delayed_work(&adev->mman.bdev.wq); 4106 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4107 } 4108 4109 if (adev->pm_sysfs_en) 4110 amdgpu_pm_sysfs_fini(adev); 4111 if (adev->ucode_sysfs_en) 4112 amdgpu_ucode_sysfs_fini(adev); 4113 if (adev->psp_sysfs_en) 4114 amdgpu_psp_sysfs_fini(adev); 4115 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4116 4117 /* disable ras feature must before hw fini */ 4118 amdgpu_ras_pre_fini(adev); 4119 4120 amdgpu_device_ip_fini_early(adev); 4121 4122 amdgpu_irq_fini_hw(adev); 4123 4124 if (adev->mman.initialized) 4125 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4126 4127 amdgpu_gart_dummy_page_fini(adev); 4128 4129 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4130 amdgpu_device_unmap_mmio(adev); 4131 4132 } 4133 4134 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4135 { 4136 int idx; 4137 4138 amdgpu_fence_driver_sw_fini(adev); 4139 amdgpu_device_ip_fini(adev); 4140 release_firmware(adev->firmware.gpu_info_fw); 4141 adev->firmware.gpu_info_fw = NULL; 4142 adev->accel_working = false; 4143 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4144 4145 amdgpu_reset_fini(adev); 4146 4147 /* free i2c buses */ 4148 if (!amdgpu_device_has_dc_support(adev)) 4149 amdgpu_i2c_fini(adev); 4150 4151 if (amdgpu_emu_mode != 1) 4152 amdgpu_atombios_fini(adev); 4153 4154 kfree(adev->bios); 4155 adev->bios = NULL; 4156 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4157 vga_switcheroo_unregister_client(adev->pdev); 4158 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4159 } 4160 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4161 vga_client_unregister(adev->pdev); 4162 4163 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4164 #ifdef __linux__ 4165 iounmap(adev->rmmio); 4166 adev->rmmio = NULL; 4167 #else 4168 if (adev->rmmio_size > 0) 4169 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4170 adev->rmmio_size); 4171 adev->rmmio_size = 0; 4172 adev->rmmio = NULL; 4173 #endif 4174 amdgpu_device_doorbell_fini(adev); 4175 drm_dev_exit(idx); 4176 } 4177 4178 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4179 amdgpu_pmu_fini(adev); 4180 if (adev->mman.discovery_bin) 4181 amdgpu_discovery_fini(adev); 4182 4183 amdgpu_reset_put_reset_domain(adev->reset_domain); 4184 adev->reset_domain = NULL; 4185 4186 kfree(adev->pci_state); 4187 4188 } 4189 4190 /** 4191 * amdgpu_device_evict_resources - evict device resources 4192 * @adev: amdgpu device object 4193 * 4194 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4195 * of the vram memory type. Mainly used for evicting device resources 4196 * at suspend time. 
4197 * 4198 */ 4199 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4200 { 4201 int ret; 4202 4203 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4204 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4205 return 0; 4206 4207 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4208 if (ret) 4209 DRM_WARN("evicting device resources failed\n"); 4210 return ret; 4211 } 4212 4213 /* 4214 * Suspend & resume. 4215 */ 4216 /** 4217 * amdgpu_device_suspend - initiate device suspend 4218 * 4219 * @dev: drm dev pointer 4220 * @fbcon : notify the fbdev of suspend 4221 * 4222 * Puts the hw in the suspend state (all asics). 4223 * Returns 0 for success or an error on failure. 4224 * Called at driver suspend. 4225 */ 4226 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4227 { 4228 struct amdgpu_device *adev = drm_to_adev(dev); 4229 int r = 0; 4230 4231 if (adev->shutdown) 4232 return 0; 4233 4234 #ifdef notyet 4235 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4236 return 0; 4237 #endif 4238 4239 adev->in_suspend = true; 4240 4241 if (amdgpu_sriov_vf(adev)) { 4242 amdgpu_virt_fini_data_exchange(adev); 4243 r = amdgpu_virt_request_full_gpu(adev, false); 4244 if (r) 4245 return r; 4246 } 4247 4248 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4249 DRM_WARN("smart shift update failed\n"); 4250 4251 drm_kms_helper_poll_disable(dev); 4252 4253 if (fbcon) 4254 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4255 4256 cancel_delayed_work_sync(&adev->delayed_init_work); 4257 4258 amdgpu_ras_suspend(adev); 4259 4260 amdgpu_device_ip_suspend_phase1(adev); 4261 4262 if (!adev->in_s0ix) 4263 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4264 4265 r = amdgpu_device_evict_resources(adev); 4266 if (r) 4267 return r; 4268 4269 amdgpu_fence_driver_hw_fini(adev); 4270 4271 amdgpu_device_ip_suspend_phase2(adev); 4272 4273 if (amdgpu_sriov_vf(adev)) 4274 amdgpu_virt_release_full_gpu(adev, false); 4275 4276 return 0; 4277 } 4278 4279 /** 4280 * amdgpu_device_resume - initiate device resume 4281 * 4282 * @dev: drm dev pointer 4283 * @fbcon : notify the fbdev of resume 4284 * 4285 * Bring the hw back to operating state (all asics). 4286 * Returns 0 for success or an error on failure. 4287 * Called at driver resume. 
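*
* A minimal sketch of how a system PM callback pair might drive
* amdgpu_device_suspend() and amdgpu_device_resume() (hypothetical wrappers;
* the driver's real dev_pm_ops live in the drm driver code and also handle
* s0ix detection):
*
*	static int example_pmops_suspend(struct device *dev)
*	{
*		struct drm_device *drm_dev = dev_get_drvdata(dev);
*
*		drm_to_adev(drm_dev)->in_s3 = true;
*		return amdgpu_device_suspend(drm_dev, true);
*	}
*
*	static int example_pmops_resume(struct device *dev)
*	{
*		struct drm_device *drm_dev = dev_get_drvdata(dev);
*		int r = amdgpu_device_resume(drm_dev, true);
*
*		drm_to_adev(drm_dev)->in_s3 = false;
*		return r;
*	}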
4288 */ 4289 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4290 { 4291 struct amdgpu_device *adev = drm_to_adev(dev); 4292 int r = 0; 4293 4294 if (amdgpu_sriov_vf(adev)) { 4295 r = amdgpu_virt_request_full_gpu(adev, true); 4296 if (r) 4297 return r; 4298 } 4299 4300 #ifdef notyet 4301 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4302 return 0; 4303 #endif 4304 4305 if (adev->in_s0ix) 4306 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4307 4308 /* post card */ 4309 if (amdgpu_device_need_post(adev)) { 4310 r = amdgpu_device_asic_init(adev); 4311 if (r) 4312 dev_err(adev->dev, "amdgpu asic init failed\n"); 4313 } 4314 4315 r = amdgpu_device_ip_resume(adev); 4316 4317 /* no matter what r is, always need to properly release full GPU */ 4318 if (amdgpu_sriov_vf(adev)) { 4319 amdgpu_virt_init_data_exchange(adev); 4320 amdgpu_virt_release_full_gpu(adev, true); 4321 } 4322 4323 if (r) { 4324 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4325 return r; 4326 } 4327 amdgpu_fence_driver_hw_init(adev); 4328 4329 r = amdgpu_device_ip_late_init(adev); 4330 if (r) 4331 return r; 4332 4333 queue_delayed_work(system_wq, &adev->delayed_init_work, 4334 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4335 4336 if (!adev->in_s0ix) { 4337 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4338 if (r) 4339 return r; 4340 } 4341 4342 /* Make sure IB tests flushed */ 4343 flush_delayed_work(&adev->delayed_init_work); 4344 4345 if (adev->in_s0ix) { 4346 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4347 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 4348 */ 4349 amdgpu_gfx_off_ctrl(adev, true); 4350 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4351 } 4352 if (fbcon) 4353 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4354 4355 drm_kms_helper_poll_enable(dev); 4356 4357 amdgpu_ras_resume(adev); 4358 4359 /* 4360 * Most of the connector probing functions try to acquire runtime pm 4361 * refs to ensure that the GPU is powered on when connector polling is 4362 * performed. Since we're calling this from a runtime PM callback, 4363 * trying to acquire rpm refs will cause us to deadlock. 4364 * 4365 * Since we're guaranteed to be holding the rpm lock, it's safe to 4366 * temporarily disable the rpm helpers so this doesn't deadlock us. 4367 */ 4368 #if defined(CONFIG_PM) && defined(__linux__) 4369 dev->dev->power.disable_depth++; 4370 #endif 4371 if (!amdgpu_device_has_dc_support(adev)) 4372 drm_helper_hpd_irq_event(dev); 4373 else 4374 drm_kms_helper_hotplug_event(dev); 4375 #if defined(CONFIG_PM) && defined(__linux__) 4376 dev->dev->power.disable_depth--; 4377 #endif 4378 adev->in_suspend = false; 4379 4380 if (adev->enable_mes) 4381 amdgpu_mes_self_test(adev); 4382 4383 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4384 DRM_WARN("smart shift update failed\n"); 4385 4386 return 0; 4387 } 4388 4389 /** 4390 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4391 * 4392 * @adev: amdgpu_device pointer 4393 * 4394 * The list of all the hardware IPs that make up the asic is walked and 4395 * the check_soft_reset callbacks are run. check_soft_reset determines 4396 * if the asic is still hung or not. 4397 * Returns true if any of the IPs are still in a hung state, false if not. 
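*
* The hook itself is supplied per IP block via struct amd_ip_funcs. A
* hypothetical block would implement it roughly as follows (register and
* field names are made up for illustration):
*
*	static bool example_ip_check_soft_reset(void *handle)
*	{
*		struct amdgpu_device *adev = (struct amdgpu_device *)handle;
*		u32 status = RREG32(mmEXAMPLE_STATUS);
*
*		return REG_GET_FIELD(status, EXAMPLE_STATUS, GUI_ACTIVE) != 0;
*	}
*
* and would point .check_soft_reset at it in its amd_ip_funcs table.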
4398 */ 4399 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4400 { 4401 int i; 4402 bool asic_hang = false; 4403 4404 if (amdgpu_sriov_vf(adev)) 4405 return true; 4406 4407 if (amdgpu_asic_need_full_reset(adev)) 4408 return true; 4409 4410 for (i = 0; i < adev->num_ip_blocks; i++) { 4411 if (!adev->ip_blocks[i].status.valid) 4412 continue; 4413 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4414 adev->ip_blocks[i].status.hang = 4415 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4416 if (adev->ip_blocks[i].status.hang) { 4417 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4418 asic_hang = true; 4419 } 4420 } 4421 return asic_hang; 4422 } 4423 4424 /** 4425 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4426 * 4427 * @adev: amdgpu_device pointer 4428 * 4429 * The list of all the hardware IPs that make up the asic is walked and the 4430 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4431 * handles any IP specific hardware or software state changes that are 4432 * necessary for a soft reset to succeed. 4433 * Returns 0 on success, negative error code on failure. 4434 */ 4435 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4436 { 4437 int i, r = 0; 4438 4439 for (i = 0; i < adev->num_ip_blocks; i++) { 4440 if (!adev->ip_blocks[i].status.valid) 4441 continue; 4442 if (adev->ip_blocks[i].status.hang && 4443 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4444 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4445 if (r) 4446 return r; 4447 } 4448 } 4449 4450 return 0; 4451 } 4452 4453 /** 4454 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4455 * 4456 * @adev: amdgpu_device pointer 4457 * 4458 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4459 * reset is necessary to recover. 4460 * Returns true if a full asic reset is required, false if not. 4461 */ 4462 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4463 { 4464 int i; 4465 4466 if (amdgpu_asic_need_full_reset(adev)) 4467 return true; 4468 4469 for (i = 0; i < adev->num_ip_blocks; i++) { 4470 if (!adev->ip_blocks[i].status.valid) 4471 continue; 4472 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4473 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4474 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4475 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4476 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4477 if (adev->ip_blocks[i].status.hang) { 4478 dev_info(adev->dev, "Some block need full reset!\n"); 4479 return true; 4480 } 4481 } 4482 } 4483 return false; 4484 } 4485 4486 /** 4487 * amdgpu_device_ip_soft_reset - do a soft reset 4488 * 4489 * @adev: amdgpu_device pointer 4490 * 4491 * The list of all the hardware IPs that make up the asic is walked and the 4492 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4493 * IP specific hardware or software state changes that are necessary to soft 4494 * reset the IP. 4495 * Returns 0 on success, negative error code on failure. 
4496 */ 4497 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4498 { 4499 int i, r = 0; 4500 4501 for (i = 0; i < adev->num_ip_blocks; i++) { 4502 if (!adev->ip_blocks[i].status.valid) 4503 continue; 4504 if (adev->ip_blocks[i].status.hang && 4505 adev->ip_blocks[i].version->funcs->soft_reset) { 4506 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4507 if (r) 4508 return r; 4509 } 4510 } 4511 4512 return 0; 4513 } 4514 4515 /** 4516 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4517 * 4518 * @adev: amdgpu_device pointer 4519 * 4520 * The list of all the hardware IPs that make up the asic is walked and the 4521 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4522 * handles any IP specific hardware or software state changes that are 4523 * necessary after the IP has been soft reset. 4524 * Returns 0 on success, negative error code on failure. 4525 */ 4526 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4527 { 4528 int i, r = 0; 4529 4530 for (i = 0; i < adev->num_ip_blocks; i++) { 4531 if (!adev->ip_blocks[i].status.valid) 4532 continue; 4533 if (adev->ip_blocks[i].status.hang && 4534 adev->ip_blocks[i].version->funcs->post_soft_reset) 4535 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4536 if (r) 4537 return r; 4538 } 4539 4540 return 0; 4541 } 4542 4543 /** 4544 * amdgpu_device_recover_vram - Recover some VRAM contents 4545 * 4546 * @adev: amdgpu_device pointer 4547 * 4548 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4549 * restore things like GPUVM page tables after a GPU reset where 4550 * the contents of VRAM might be lost. 4551 * 4552 * Returns: 4553 * 0 on success, negative error code on failure. 4554 */ 4555 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4556 { 4557 struct dma_fence *fence = NULL, *next = NULL; 4558 struct amdgpu_bo *shadow; 4559 struct amdgpu_bo_vm *vmbo; 4560 long r = 1, tmo; 4561 4562 if (amdgpu_sriov_runtime(adev)) 4563 tmo = msecs_to_jiffies(8000); 4564 else 4565 tmo = msecs_to_jiffies(100); 4566 4567 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4568 mutex_lock(&adev->shadow_list_lock); 4569 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4570 shadow = &vmbo->bo; 4571 /* No need to recover an evicted BO */ 4572 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4573 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4574 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4575 continue; 4576 4577 r = amdgpu_bo_restore_shadow(shadow, &next); 4578 if (r) 4579 break; 4580 4581 if (fence) { 4582 tmo = dma_fence_wait_timeout(fence, false, tmo); 4583 dma_fence_put(fence); 4584 fence = next; 4585 if (tmo == 0) { 4586 r = -ETIMEDOUT; 4587 break; 4588 } else if (tmo < 0) { 4589 r = tmo; 4590 break; 4591 } 4592 } else { 4593 fence = next; 4594 } 4595 } 4596 mutex_unlock(&adev->shadow_list_lock); 4597 4598 if (fence) 4599 tmo = dma_fence_wait_timeout(fence, false, tmo); 4600 dma_fence_put(fence); 4601 4602 if (r < 0 || tmo <= 0) { 4603 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4604 return -EIO; 4605 } 4606 4607 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4608 return 0; 4609 } 4610 4611 4612 /** 4613 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4614 * 4615 * @adev: amdgpu_device pointer 4616 * @from_hypervisor: request from hypervisor 4617 * 4618 * do VF FLR and reinitialize Asic 4619 * return 0 means succeeded 
otherwise failed 4620 */ 4621 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4622 bool from_hypervisor) 4623 { 4624 int r; 4625 struct amdgpu_hive_info *hive = NULL; 4626 int retry_limit = 0; 4627 4628 retry: 4629 amdgpu_amdkfd_pre_reset(adev); 4630 4631 if (from_hypervisor) 4632 r = amdgpu_virt_request_full_gpu(adev, true); 4633 else 4634 r = amdgpu_virt_reset_gpu(adev); 4635 if (r) 4636 return r; 4637 4638 /* Resume IP prior to SMC */ 4639 r = amdgpu_device_ip_reinit_early_sriov(adev); 4640 if (r) 4641 goto error; 4642 4643 amdgpu_virt_init_data_exchange(adev); 4644 4645 r = amdgpu_device_fw_loading(adev); 4646 if (r) 4647 return r; 4648 4649 /* now we are okay to resume SMC/CP/SDMA */ 4650 r = amdgpu_device_ip_reinit_late_sriov(adev); 4651 if (r) 4652 goto error; 4653 4654 hive = amdgpu_get_xgmi_hive(adev); 4655 /* Update PSP FW topology after reset */ 4656 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4657 r = amdgpu_xgmi_update_topology(hive, adev); 4658 4659 if (hive) 4660 amdgpu_put_xgmi_hive(hive); 4661 4662 if (!r) { 4663 amdgpu_irq_gpu_reset_resume_helper(adev); 4664 r = amdgpu_ib_ring_tests(adev); 4665 4666 amdgpu_amdkfd_post_reset(adev); 4667 } 4668 4669 error: 4670 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4671 amdgpu_inc_vram_lost(adev); 4672 r = amdgpu_device_recover_vram(adev); 4673 } 4674 amdgpu_virt_release_full_gpu(adev, true); 4675 4676 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4677 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4678 retry_limit++; 4679 goto retry; 4680 } else 4681 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4682 } 4683 4684 return r; 4685 } 4686 4687 /** 4688 * amdgpu_device_has_job_running - check if there is any job in mirror list 4689 * 4690 * @adev: amdgpu_device pointer 4691 * 4692 * check if there is any job in mirror list 4693 */ 4694 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4695 { 4696 int i; 4697 struct drm_sched_job *job; 4698 4699 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4700 struct amdgpu_ring *ring = adev->rings[i]; 4701 4702 if (!ring || !ring->sched.thread) 4703 continue; 4704 4705 spin_lock(&ring->sched.job_list_lock); 4706 job = list_first_entry_or_null(&ring->sched.pending_list, 4707 struct drm_sched_job, list); 4708 spin_unlock(&ring->sched.job_list_lock); 4709 if (job) 4710 return true; 4711 } 4712 return false; 4713 } 4714 4715 /** 4716 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4717 * 4718 * @adev: amdgpu_device pointer 4719 * 4720 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4721 * a hung GPU. 
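*
* The amdgpu_gpu_recovery module parameter encodes the policy: 0 disables
* recovery, 1 enables it, -1 (the default) lets the driver decide per ASIC,
* and 2 additionally enables the guilty-job recheck in
* amdgpu_device_gpu_recover(). A typical caller is a job timeout handler,
* roughly (sketch only; 'ring' and 'job' come from the timeout context):
*
*	if (amdgpu_device_should_recover_gpu(ring->adev)) {
*		struct amdgpu_reset_context reset_context;
*
*		memset(&reset_context, 0, sizeof(reset_context));
*		reset_context.method = AMD_RESET_METHOD_NONE;
*		reset_context.reset_req_dev = ring->adev;
*		reset_context.job = job;
*		amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
*	}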
4722 */ 4723 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4724 { 4725 4726 if (amdgpu_gpu_recovery == 0) 4727 goto disabled; 4728 4729 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4730 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4731 return false; 4732 } 4733 4734 if (amdgpu_sriov_vf(adev)) 4735 return true; 4736 4737 if (amdgpu_gpu_recovery == -1) { 4738 switch (adev->asic_type) { 4739 #ifdef CONFIG_DRM_AMDGPU_SI 4740 case CHIP_VERDE: 4741 case CHIP_TAHITI: 4742 case CHIP_PITCAIRN: 4743 case CHIP_OLAND: 4744 case CHIP_HAINAN: 4745 #endif 4746 #ifdef CONFIG_DRM_AMDGPU_CIK 4747 case CHIP_KAVERI: 4748 case CHIP_KABINI: 4749 case CHIP_MULLINS: 4750 #endif 4751 case CHIP_CARRIZO: 4752 case CHIP_STONEY: 4753 case CHIP_CYAN_SKILLFISH: 4754 goto disabled; 4755 default: 4756 break; 4757 } 4758 } 4759 4760 return true; 4761 4762 disabled: 4763 dev_info(adev->dev, "GPU recovery disabled.\n"); 4764 return false; 4765 } 4766 4767 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4768 { 4769 u32 i; 4770 int ret = 0; 4771 4772 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4773 4774 dev_info(adev->dev, "GPU mode1 reset\n"); 4775 4776 /* disable BM */ 4777 pci_clear_master(adev->pdev); 4778 4779 amdgpu_device_cache_pci_state(adev->pdev); 4780 4781 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4782 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4783 ret = amdgpu_dpm_mode1_reset(adev); 4784 } else { 4785 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4786 ret = psp_gpu_reset(adev); 4787 } 4788 4789 if (ret) 4790 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4791 4792 amdgpu_device_load_pci_state(adev->pdev); 4793 4794 /* wait for asic to come out of reset */ 4795 for (i = 0; i < adev->usec_timeout; i++) { 4796 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4797 4798 if (memsize != 0xffffffff) 4799 break; 4800 udelay(1); 4801 } 4802 4803 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4804 return ret; 4805 } 4806 4807 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4808 struct amdgpu_reset_context *reset_context) 4809 { 4810 int i, r = 0; 4811 struct amdgpu_job *job = NULL; 4812 bool need_full_reset = 4813 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4814 4815 if (reset_context->reset_req_dev == adev) 4816 job = reset_context->job; 4817 4818 if (amdgpu_sriov_vf(adev)) { 4819 /* stop the data exchange thread */ 4820 amdgpu_virt_fini_data_exchange(adev); 4821 } 4822 4823 amdgpu_fence_driver_isr_toggle(adev, true); 4824 4825 /* block all schedulers and reset given job's ring */ 4826 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4827 struct amdgpu_ring *ring = adev->rings[i]; 4828 4829 if (!ring || !ring->sched.thread) 4830 continue; 4831 4832 /*clear job fence from fence drv to avoid force_completion 4833 *leave NULL and vm flush fence in fence drv */ 4834 amdgpu_fence_driver_clear_job_fences(ring); 4835 4836 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4837 amdgpu_fence_driver_force_completion(ring); 4838 } 4839 4840 amdgpu_fence_driver_isr_toggle(adev, false); 4841 4842 if (job && job->vm) 4843 drm_sched_increase_karma(&job->base); 4844 4845 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4846 /* If reset handler not implemented, continue; otherwise return */ 4847 if (r == -ENOSYS) 4848 r = 0; 4849 else 4850 return r; 4851 4852 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4853 if (!amdgpu_sriov_vf(adev)) { 4854 4855 if (!need_full_reset) 4856 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4857 4858 if (!need_full_reset && amdgpu_gpu_recovery) { 4859 amdgpu_device_ip_pre_soft_reset(adev); 4860 r = amdgpu_device_ip_soft_reset(adev); 4861 amdgpu_device_ip_post_soft_reset(adev); 4862 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4863 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4864 need_full_reset = true; 4865 } 4866 } 4867 4868 if (need_full_reset) 4869 r = amdgpu_device_ip_suspend(adev); 4870 if (need_full_reset) 4871 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4872 else 4873 clear_bit(AMDGPU_NEED_FULL_RESET, 4874 &reset_context->flags); 4875 } 4876 4877 return r; 4878 } 4879 4880 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4881 { 4882 int i; 4883 4884 lockdep_assert_held(&adev->reset_domain->sem); 4885 4886 for (i = 0; i < adev->num_regs; i++) { 4887 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4888 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4889 adev->reset_dump_reg_value[i]); 4890 } 4891 4892 return 0; 4893 } 4894 4895 #ifdef CONFIG_DEV_COREDUMP 4896 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4897 size_t count, void *data, size_t datalen) 4898 { 4899 struct drm_printer p; 4900 struct amdgpu_device *adev = data; 4901 struct drm_print_iterator iter; 4902 int i; 4903 4904 iter.data = buffer; 4905 iter.offset = 0; 4906 iter.start = offset; 4907 iter.remain = count; 4908 4909 p = drm_coredump_printer(&iter); 4910 4911 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4912 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4913 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4914 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4915 if (adev->reset_task_info.pid) 4916 drm_printf(&p, "process_name: %s PID: %d\n", 4917 adev->reset_task_info.process_name, 4918 adev->reset_task_info.pid); 4919 4920 if (adev->reset_vram_lost) 4921 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4922 if (adev->num_regs) { 4923 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4924 4925 for (i = 0; i < adev->num_regs; i++) 4926 drm_printf(&p, "0x%08x: 0x%08x\n", 4927 adev->reset_dump_reg_list[i], 4928 adev->reset_dump_reg_value[i]); 4929 } 4930 4931 return count - iter.remain; 4932 } 4933 4934 static void amdgpu_devcoredump_free(void *data) 4935 { 4936 } 4937 4938 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4939 { 4940 struct drm_device *dev = adev_to_drm(adev); 4941 4942 ktime_get_ts64(&adev->reset_time); 4943 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4944 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4945 } 4946 #endif 4947 4948 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4949 struct amdgpu_reset_context *reset_context) 4950 { 4951 struct amdgpu_device *tmp_adev = NULL; 4952 bool need_full_reset, skip_hw_reset, vram_lost = false; 4953 int r = 0; 4954 bool gpu_reset_for_dev_remove = 0; 4955 4956 /* Try reset handler method first */ 4957 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4958 reset_list); 4959 amdgpu_reset_reg_dumps(tmp_adev); 4960 4961 reset_context->reset_device_list = device_list_handle; 4962 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4963 /* If reset handler not implemented, continue; otherwise return */ 4964 if (r == -ENOSYS) 4965 r = 0; 4966 else 4967 return r; 4968 4969 /* Reset handler not implemented, use the default method */ 4970 
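/*
 * Note on the register dump taken above by amdgpu_reset_reg_dumps(): the
 * sampled offsets come from adev->reset_dump_reg_list, which is normally
 * populated from user space (debugfs) before a hang is reproduced, and the
 * captured values are emitted in the devcoredump produced by
 * amdgpu_devcoredump_read(). A hypothetical in-kernel equivalent of that
 * setup (illustrative only; the offsets are made up):
 *
 *	static const u32 example_regs[] = { 0x98f0, 0x8010 };
 *
 *	adev->reset_dump_reg_list = kmemdup(example_regs, sizeof(example_regs),
 *					    GFP_KERNEL);
 *	adev->reset_dump_reg_value = kcalloc(ARRAY_SIZE(example_regs),
 *					     sizeof(u32), GFP_KERNEL);
 *	if (adev->reset_dump_reg_list && adev->reset_dump_reg_value)
 *		adev->num_regs = ARRAY_SIZE(example_regs);
 */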
need_full_reset = 4971 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4972 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4973 4974 gpu_reset_for_dev_remove = 4975 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4976 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4977 4978 /* 4979 * ASIC reset has to be done on all XGMI hive nodes ASAP 4980 * to allow proper links negotiation in FW (within 1 sec) 4981 */ 4982 if (!skip_hw_reset && need_full_reset) { 4983 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4984 /* For XGMI run all resets in parallel to speed up the process */ 4985 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4986 tmp_adev->gmc.xgmi.pending_reset = false; 4987 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4988 r = -EALREADY; 4989 } else 4990 r = amdgpu_asic_reset(tmp_adev); 4991 4992 if (r) { 4993 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4994 r, adev_to_drm(tmp_adev)->unique); 4995 break; 4996 } 4997 } 4998 4999 /* For XGMI wait for all resets to complete before proceed */ 5000 if (!r) { 5001 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5002 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5003 flush_work(&tmp_adev->xgmi_reset_work); 5004 r = tmp_adev->asic_reset_res; 5005 if (r) 5006 break; 5007 } 5008 } 5009 } 5010 } 5011 5012 if (!r && amdgpu_ras_intr_triggered()) { 5013 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5014 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5015 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5016 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5017 } 5018 5019 amdgpu_ras_intr_cleared(); 5020 } 5021 5022 /* Since the mode1 reset affects base ip blocks, the 5023 * phase1 ip blocks need to be resumed. Otherwise there 5024 * will be a BIOS signature error and the psp bootloader 5025 * can't load kdb on the next amdgpu install. 5026 */ 5027 if (gpu_reset_for_dev_remove) { 5028 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5029 amdgpu_device_ip_resume_phase1(tmp_adev); 5030 5031 goto end; 5032 } 5033 5034 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5035 if (need_full_reset) { 5036 /* post card */ 5037 r = amdgpu_device_asic_init(tmp_adev); 5038 if (r) { 5039 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5040 } else { 5041 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5042 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5043 if (r) 5044 goto out; 5045 5046 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5047 if (r) 5048 goto out; 5049 5050 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5051 #ifdef CONFIG_DEV_COREDUMP 5052 tmp_adev->reset_vram_lost = vram_lost; 5053 memset(&tmp_adev->reset_task_info, 0, 5054 sizeof(tmp_adev->reset_task_info)); 5055 if (reset_context->job && reset_context->job->vm) 5056 tmp_adev->reset_task_info = 5057 reset_context->job->vm->task_info; 5058 amdgpu_reset_capture_coredumpm(tmp_adev); 5059 #endif 5060 if (vram_lost) { 5061 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5062 amdgpu_inc_vram_lost(tmp_adev); 5063 } 5064 5065 r = amdgpu_device_fw_loading(tmp_adev); 5066 if (r) 5067 return r; 5068 5069 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5070 if (r) 5071 goto out; 5072 5073 if (vram_lost) 5074 amdgpu_device_fill_reset_magic(tmp_adev); 5075 5076 /* 5077 * Add this ASIC as tracked as reset was already 5078 * complete successfully. 
5079 */ 5080 amdgpu_register_gpu_instance(tmp_adev); 5081 5082 if (!reset_context->hive && 5083 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5084 amdgpu_xgmi_add_device(tmp_adev); 5085 5086 r = amdgpu_device_ip_late_init(tmp_adev); 5087 if (r) 5088 goto out; 5089 5090 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5091 5092 /* 5093 * The GPU enters a bad state once the faulty pages 5094 * reported by ECC have reached the threshold, and RAS 5095 * recovery is scheduled next. So check here whether the 5096 * bad page threshold has indeed been exceeded and, if so, 5097 * break recovery and remind the user to retire this GPU 5098 * or to set a bigger bad_page_threshold value before 5099 * probing the driver again. 5100 * 5101 */
5102 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5103 /* must succeed. */ 5104 amdgpu_ras_resume(tmp_adev); 5105 } else { 5106 r = -EINVAL; 5107 goto out; 5108 } 5109 5110 /* Update PSP FW topology after reset */ 5111 if (reset_context->hive && 5112 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5113 r = amdgpu_xgmi_update_topology( 5114 reset_context->hive, tmp_adev); 5115 } 5116 } 5117
5118 out: 5119 if (!r) { 5120 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5121 r = amdgpu_ib_ring_tests(tmp_adev); 5122 if (r) { 5123 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5124 need_full_reset = true; 5125 r = -EAGAIN; 5126 goto end; 5127 } 5128 } 5129 5130 if (!r) 5131 r = amdgpu_device_recover_vram(tmp_adev); 5132 else 5133 tmp_adev->asic_reset_res = r; 5134 } 5135
5136 end: 5137 if (need_full_reset) 5138 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5139 else 5140 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5141 return r; 5142 } 5143
5144 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5145 { 5146 5147 switch (amdgpu_asic_reset_method(adev)) { 5148 case AMD_RESET_METHOD_MODE1: 5149 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5150 break; 5151 case AMD_RESET_METHOD_MODE2: 5152 adev->mp1_state = PP_MP1_STATE_RESET; 5153 break; 5154 default: 5155 adev->mp1_state = PP_MP1_STATE_NONE; 5156 break; 5157 } 5158 5159 5160 } 5161
5162 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5163 { 5164 amdgpu_vf_error_trans_all(adev); 5165 adev->mp1_state = PP_MP1_STATE_NONE; 5166 } 5167
5168 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5169 { 5170 STUB(); 5171 #ifdef notyet 5172 struct pci_dev *p = NULL; 5173 5174 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5175 adev->pdev->bus->number, 1); 5176 if (p) { 5177 pm_runtime_enable(&(p->dev)); 5178 pm_runtime_resume(&(p->dev)); 5179 } 5180 #endif 5181 } 5182
5183 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5184 { 5185 enum amd_reset_method reset_method; 5186 struct pci_dev *p = NULL; 5187 u64 expires; 5188 5189 /* 5190 * For now, only BACO and mode1 reset are confirmed 5191 * to suffer the audio issue if the audio device is not properly suspended.
5192 */ 5193 reset_method = amdgpu_asic_reset_method(adev); 5194 if ((reset_method != AMD_RESET_METHOD_BACO) && 5195 (reset_method != AMD_RESET_METHOD_MODE1)) 5196 return -EINVAL; 5197 5198 STUB(); 5199 return -ENOSYS; 5200 #ifdef notyet 5201 5202 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5203 adev->pdev->bus->number, 1); 5204 if (!p) 5205 return -ENODEV; 5206 5207 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5208 if (!expires) 5209 /* 5210 * If we cannot get the audio device autosuspend delay, 5211 * a fixed 4S interval will be used. Considering 3S is 5212 * the audio controller default autosuspend delay setting. 5213 * 4S used here is guaranteed to cover that. 5214 */ 5215 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5216 5217 while (!pm_runtime_status_suspended(&(p->dev))) { 5218 if (!pm_runtime_suspend(&(p->dev))) 5219 break; 5220 5221 if (expires < ktime_get_mono_fast_ns()) { 5222 dev_warn(adev->dev, "failed to suspend display audio\n"); 5223 pci_dev_put(p); 5224 /* TODO: abort the succeeding gpu reset? */ 5225 return -ETIMEDOUT; 5226 } 5227 } 5228 5229 pm_runtime_disable(&(p->dev)); 5230 5231 pci_dev_put(p); 5232 return 0; 5233 #endif 5234 } 5235 5236 static void amdgpu_device_recheck_guilty_jobs( 5237 struct amdgpu_device *adev, struct list_head *device_list_handle, 5238 struct amdgpu_reset_context *reset_context) 5239 { 5240 int i, r = 0; 5241 5242 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5243 struct amdgpu_ring *ring = adev->rings[i]; 5244 int ret = 0; 5245 struct drm_sched_job *s_job; 5246 5247 if (!ring || !ring->sched.thread) 5248 continue; 5249 5250 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5251 struct drm_sched_job, list); 5252 if (s_job == NULL) 5253 continue; 5254 5255 /* clear job's guilty and depend the folowing step to decide the real one */ 5256 drm_sched_reset_karma(s_job); 5257 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5258 5259 if (!s_job->s_fence->parent) { 5260 DRM_WARN("Failed to get a HW fence for job!"); 5261 continue; 5262 } 5263 5264 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5265 if (ret == 0) { /* timeout */ 5266 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5267 ring->sched.name, s_job->id); 5268 5269 5270 amdgpu_fence_driver_isr_toggle(adev, true); 5271 5272 /* Clear this failed job from fence array */ 5273 amdgpu_fence_driver_clear_job_fences(ring); 5274 5275 amdgpu_fence_driver_isr_toggle(adev, false); 5276 5277 /* Since the job won't signal and we go for 5278 * another resubmit drop this parent pointer 5279 */ 5280 dma_fence_put(s_job->s_fence->parent); 5281 s_job->s_fence->parent = NULL; 5282 5283 /* set guilty */ 5284 drm_sched_increase_karma(s_job); 5285 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5286 retry: 5287 /* do hw reset */ 5288 if (amdgpu_sriov_vf(adev)) { 5289 amdgpu_virt_fini_data_exchange(adev); 5290 r = amdgpu_device_reset_sriov(adev, false); 5291 if (r) 5292 adev->asic_reset_res = r; 5293 } else { 5294 clear_bit(AMDGPU_SKIP_HW_RESET, 5295 &reset_context->flags); 5296 r = amdgpu_do_asic_reset(device_list_handle, 5297 reset_context); 5298 if (r && r == -EAGAIN) 5299 goto retry; 5300 } 5301 5302 /* 5303 * add reset counter so that the following 5304 * resubmitted job could flush vmid 5305 */ 5306 atomic_inc(&adev->gpu_reset_counter); 5307 continue; 5308 } 5309 5310 /* got the hw fence, signal finished fence */ 5311 atomic_dec(ring->sched.score); 5312 dma_fence_get(&s_job->s_fence->finished); 5313 dma_fence_signal(&s_job->s_fence->finished); 5314 dma_fence_put(&s_job->s_fence->finished); 5315 5316 /* remove node from list and free the job */ 5317 spin_lock(&ring->sched.job_list_lock); 5318 list_del_init(&s_job->list); 5319 spin_unlock(&ring->sched.job_list_lock); 5320 ring->sched.ops->free_job(s_job); 5321 } 5322 } 5323 5324 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5325 { 5326 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5327 5328 #if defined(CONFIG_DEBUG_FS) 5329 if (!amdgpu_sriov_vf(adev)) 5330 cancel_work(&adev->reset_work); 5331 #endif 5332 5333 if (adev->kfd.dev) 5334 cancel_work(&adev->kfd.reset_work); 5335 5336 if (amdgpu_sriov_vf(adev)) 5337 cancel_work(&adev->virt.flr_work); 5338 5339 if (con && adev->ras_enabled) 5340 cancel_work(&con->recovery_work); 5341 5342 } 5343 5344 5345 /** 5346 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5347 * 5348 * @adev: amdgpu_device pointer 5349 * @job: which job trigger hang 5350 * 5351 * Attempt to reset the GPU if it has hung (all asics). 5352 * Attempt to do soft-reset or full-reset and reinitialize Asic 5353 * Returns 0 for success or an error on failure. 5354 */ 5355 5356 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5357 struct amdgpu_job *job, 5358 struct amdgpu_reset_context *reset_context) 5359 { 5360 struct list_head device_list, *device_list_handle = NULL; 5361 bool job_signaled = false; 5362 struct amdgpu_hive_info *hive = NULL; 5363 struct amdgpu_device *tmp_adev = NULL; 5364 int i, r = 0; 5365 bool need_emergency_restart = false; 5366 bool audio_suspended = false; 5367 int tmp_vram_lost_counter; 5368 bool gpu_reset_for_dev_remove = false; 5369 5370 gpu_reset_for_dev_remove = 5371 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5372 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5373 5374 /* 5375 * Special case: RAS triggered and full reset isn't supported 5376 */ 5377 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5378 5379 /* 5380 * Flush RAM to disk so that after reboot 5381 * the user can read log and see why the system rebooted. 
5382 */ 5383 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5384 DRM_WARN("Emergency reboot."); 5385 5386 #ifdef notyet 5387 ksys_sync_helper(); 5388 emergency_restart(); 5389 #else 5390 panic("emergency_restart"); 5391 #endif 5392 } 5393 5394 dev_info(adev->dev, "GPU %s begin!\n", 5395 need_emergency_restart ? "jobs stop":"reset"); 5396 5397 if (!amdgpu_sriov_vf(adev)) 5398 hive = amdgpu_get_xgmi_hive(adev); 5399 if (hive) 5400 mutex_lock(&hive->hive_lock); 5401 5402 reset_context->job = job; 5403 reset_context->hive = hive; 5404 /* 5405 * Build list of devices to reset. 5406 * In case we are in XGMI hive mode, resort the device list 5407 * to put adev in the 1st position. 5408 */ 5409 INIT_LIST_HEAD(&device_list); 5410 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5411 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5412 list_add_tail(&tmp_adev->reset_list, &device_list); 5413 if (gpu_reset_for_dev_remove && adev->shutdown) 5414 tmp_adev->shutdown = true; 5415 } 5416 if (!list_is_first(&adev->reset_list, &device_list)) 5417 list_rotate_to_front(&adev->reset_list, &device_list); 5418 device_list_handle = &device_list; 5419 } else { 5420 list_add_tail(&adev->reset_list, &device_list); 5421 device_list_handle = &device_list; 5422 } 5423 5424 /* We need to lock reset domain only once both for XGMI and single device */ 5425 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5426 reset_list); 5427 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5428 5429 /* block all schedulers and reset given job's ring */ 5430 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5431 5432 amdgpu_device_set_mp1_state(tmp_adev); 5433 5434 /* 5435 * Try to put the audio codec into suspend state 5436 * before gpu reset started. 5437 * 5438 * Due to the power domain of the graphics device 5439 * is shared with AZ power domain. Without this, 5440 * we may change the audio hardware from behind 5441 * the audio driver's back. That will trigger 5442 * some audio codec errors. 5443 */ 5444 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5445 audio_suspended = true; 5446 5447 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5448 5449 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5450 5451 if (!amdgpu_sriov_vf(tmp_adev)) 5452 amdgpu_amdkfd_pre_reset(tmp_adev); 5453 5454 /* 5455 * Mark these ASICs to be reseted as untracked first 5456 * And add them back after reset completed 5457 */ 5458 amdgpu_unregister_gpu_instance(tmp_adev); 5459 5460 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5461 5462 /* disable ras on ALL IPs */ 5463 if (!need_emergency_restart && 5464 amdgpu_device_ip_need_full_reset(tmp_adev)) 5465 amdgpu_ras_suspend(tmp_adev); 5466 5467 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5468 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5469 5470 if (!ring || !ring->sched.thread) 5471 continue; 5472 5473 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5474 5475 if (need_emergency_restart) 5476 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5477 } 5478 atomic_inc(&tmp_adev->gpu_reset_counter); 5479 } 5480 5481 if (need_emergency_restart) 5482 goto skip_sched_resume; 5483 5484 /* 5485 * Must check guilty signal here since after this point all old 5486 * HW fences are force signaled. 
5487 * 5488 * job->base holds a reference to parent fence 5489 */ 5490 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5491 job_signaled = true; 5492 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5493 goto skip_hw_reset; 5494 } 5495 5496 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5497 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5498 if (gpu_reset_for_dev_remove) { 5499 /* Workaroud for ASICs need to disable SMC first */ 5500 amdgpu_device_smu_fini_early(tmp_adev); 5501 } 5502 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5503 /*TODO Should we stop ?*/ 5504 if (r) { 5505 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5506 r, adev_to_drm(tmp_adev)->unique); 5507 tmp_adev->asic_reset_res = r; 5508 } 5509 5510 /* 5511 * Drop all pending non scheduler resets. Scheduler resets 5512 * were already dropped during drm_sched_stop 5513 */ 5514 amdgpu_device_stop_pending_resets(tmp_adev); 5515 } 5516 5517 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5518 /* Actual ASIC resets if needed.*/ 5519 /* Host driver will handle XGMI hive reset for SRIOV */ 5520 if (amdgpu_sriov_vf(adev)) { 5521 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5522 if (r) 5523 adev->asic_reset_res = r; 5524 5525 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5526 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5527 amdgpu_ras_resume(adev); 5528 } else { 5529 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5530 if (r && r == -EAGAIN) 5531 goto retry; 5532 5533 if (!r && gpu_reset_for_dev_remove) 5534 goto recover_end; 5535 } 5536 5537 skip_hw_reset: 5538 5539 /* Post ASIC reset for all devs .*/ 5540 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5541 5542 /* 5543 * Sometimes a later bad compute job can block a good gfx job as gfx 5544 * and compute ring share internal GC HW mutually. We add an additional 5545 * guilty jobs recheck step to find the real guilty job, it synchronously 5546 * submits and pends for the first job being signaled. If it gets timeout, 5547 * we identify it as a real guilty job. 5548 */ 5549 if (amdgpu_gpu_recovery == 2 && 5550 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5551 amdgpu_device_recheck_guilty_jobs( 5552 tmp_adev, device_list_handle, reset_context); 5553 5554 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5555 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5556 5557 if (!ring || !ring->sched.thread) 5558 continue; 5559 5560 /* No point to resubmit jobs if we didn't HW reset*/ 5561 if (!tmp_adev->asic_reset_res && !job_signaled) 5562 drm_sched_resubmit_jobs(&ring->sched); 5563 5564 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5565 } 5566 5567 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5568 amdgpu_mes_self_test(tmp_adev); 5569 5570 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5571 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5572 } 5573 5574 if (tmp_adev->asic_reset_res) 5575 r = tmp_adev->asic_reset_res; 5576 5577 tmp_adev->asic_reset_res = 0; 5578 5579 if (r) { 5580 /* bad news, how to tell it to userspace ? 
*/ 5581 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5582 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5583 } else { 5584 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5585 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5586 DRM_WARN("smart shift update failed\n"); 5587 } 5588 } 5589
5590 skip_sched_resume: 5591 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5592 /* unlock kfd: SRIOV would do it separately */ 5593 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5594 amdgpu_amdkfd_post_reset(tmp_adev); 5595 5596 /* kfd_post_reset will do nothing if kfd device is not initialized, 5597 * so bring up kfd here if it was not initialized before 5598 */ 5599 if (!adev->kfd.init_complete) 5600 amdgpu_amdkfd_device_init(adev); 5601 5602 if (audio_suspended) 5603 amdgpu_device_resume_display_audio(tmp_adev); 5604 5605 amdgpu_device_unset_mp1_state(tmp_adev); 5606 } 5607
5608 recover_end: 5609 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5610 reset_list); 5611 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5612 5613 if (hive) { 5614 mutex_unlock(&hive->hive_lock); 5615 amdgpu_put_xgmi_hive(hive); 5616 } 5617 5618 if (r) 5619 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5620 5621 atomic_set(&adev->reset_domain->reset_res, r); 5622 return r; 5623 } 5624
5625 /** 5626 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 5627 * 5628 * @adev: amdgpu_device pointer 5629 * 5630 * Fetches and stores in the driver the PCIe capabilities (gen speed 5631 * and lanes) of the slot the device is in. Handles APUs and 5632 * virtualized environments where PCIe config space may not be available.
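*
* Consumers generally test the resulting CAIL_* bit masks instead of poking
* PCI config space themselves, e.g. (illustrative only; 'max_gen' and
* 'max_width' are hypothetical locals):
*
*	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
*		max_gen = 3;
*	if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X16)
*		max_width = 16;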
5633 */ 5634 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5635 { 5636 struct pci_dev *pdev; 5637 enum pci_bus_speed speed_cap, platform_speed_cap; 5638 enum pcie_link_width platform_link_width; 5639 5640 if (amdgpu_pcie_gen_cap) 5641 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5642 5643 if (amdgpu_pcie_lane_cap) 5644 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5645 5646 /* covers APUs as well */ 5647 if (pci_is_root_bus(adev->pdev->bus)) { 5648 if (adev->pm.pcie_gen_mask == 0) 5649 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5650 if (adev->pm.pcie_mlw_mask == 0) 5651 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5652 return; 5653 } 5654 5655 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5656 return; 5657 5658 pcie_bandwidth_available(adev->pdev, NULL, 5659 &platform_speed_cap, &platform_link_width); 5660 5661 if (adev->pm.pcie_gen_mask == 0) { 5662 /* asic caps */ 5663 pdev = adev->pdev; 5664 speed_cap = pcie_get_speed_cap(pdev); 5665 if (speed_cap == PCI_SPEED_UNKNOWN) { 5666 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5667 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5668 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5669 } else { 5670 if (speed_cap == PCIE_SPEED_32_0GT) 5671 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5672 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5673 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5674 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5675 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5676 else if (speed_cap == PCIE_SPEED_16_0GT) 5677 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5678 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5679 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5680 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5681 else if (speed_cap == PCIE_SPEED_8_0GT) 5682 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5683 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5684 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5685 else if (speed_cap == PCIE_SPEED_5_0GT) 5686 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5687 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5688 else 5689 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5690 } 5691 /* platform caps */ 5692 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5693 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5694 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5695 } else { 5696 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5697 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5698 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5699 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5700 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5701 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5702 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5703 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5704 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5705 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5706 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5707 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5708 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5709 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5710 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5711 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5712 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5713 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5714 else 5715 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5716 5717 } 5718 } 5719 if (adev->pm.pcie_mlw_mask == 0) { 5720 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5721 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5722 } else { 5723 switch (platform_link_width) { 5724 case PCIE_LNK_X32: 5725 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5726 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5727 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5728 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5729 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5730 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5731 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5732 break; 5733 case PCIE_LNK_X16: 5734 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5735 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5736 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5737 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5738 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5739 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5740 break; 5741 case PCIE_LNK_X12: 5742 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5743 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5744 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5745 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5746 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5747 break; 5748 case PCIE_LNK_X8: 5749 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5750 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5751 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5752 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5753 break; 5754 case PCIE_LNK_X4: 5755 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5756 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5758 break; 5759 case PCIE_LNK_X2: 5760 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5761 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5762 break; 5763 case PCIE_LNK_X1: 5764 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5765 break; 5766 default: 5767 break; 5768 } 5769 } 5770 } 5771 } 5772 5773 /** 5774 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5775 * 5776 * @adev: amdgpu_device pointer 5777 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5778 * 5779 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5780 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5781 * @peer_adev. 5782 */ 5783 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5784 struct amdgpu_device *peer_adev) 5785 { 5786 #ifdef CONFIG_HSA_AMD_P2P 5787 uint64_t address_mask = peer_adev->dev->dma_mask ? 

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
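
/*
 * Minimal usage sketch for the BACO (Bus Active, Chip Off) helpers above,
 * with a hypothetical caller; the real callers live elsewhere in the
 * driver (runtime PM and reset paths):
 *
 *	int r = amdgpu_device_baco_enter(ddev);
 *	if (r)
 *		return r;
 *	... the GPU now sits in BACO with the bus still active ...
 *	r = amdgpu_device_baco_exit(ddev);
 *
 * Enter and exit are expected to be strictly paired, and both return
 * -ENOTSUPP on parts where amdgpu_device_supports_baco() is false.
 */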

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
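
/*
 * The callbacks above follow the kernel's PCI error-recovery sequence:
 * error_detected -> (mmio_enabled) -> slot_reset -> resume.  A sketch of
 * the conventional wiring into struct pci_error_handlers (the actual
 * registration lives with the PCI driver setup, not in this file):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */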

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
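
/*
 * Usage note for the two HDP helpers above: the HDP (Host Data Path) block
 * caches CPU accesses to VRAM that go through the PCIe BAR.  A typical
 * pattern, sketched with hypothetical buffer names, is:
 *
 *	memcpy_toio(vram_cpu_addr, data, size);    // CPU writes via the BAR
 *	amdgpu_device_flush_hdp(adev, NULL);       // make writes visible to the GPU
 *	... GPU consumes the buffer ...
 *	amdgpu_device_invalidate_hdp(adev, NULL);  // drop stale HDP read data
 *	memcpy_fromio(data, vram_cpu_addr, size);  // CPU reads what the GPU wrote
 *
 * Both helpers return early on x86-64 APUs (outside passthrough) and on
 * parts whose memory is connected to the CPU over XGMI, where the HDP path
 * is not involved.
 */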

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
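
/*
 * The two helpers above implement the usual index/data pattern for
 * indirect register access: the PCIe-port register offset is written to
 * the NBIO index register, then the payload is read from or written to
 * the data register, with the extra RREG32() calls acting as posting
 * reads.  A hypothetical read-modify-write of port register 0x10 would
 * look like:
 *
 *	u32 v = amdgpu_device_pcie_port_rreg(adev, 0x10);
 *	amdgpu_device_pcie_port_wreg(adev, 0x10, v | BIT(0));
 *
 * adev->pcie_idx_lock keeps the index/data sequence atomic against
 * concurrent callers.
 */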

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}