/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86) && defined(__linux__)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
"TOPAZ", 112 "TONGA", 113 "FIJI", 114 "CARRIZO", 115 "STONEY", 116 "POLARIS10", 117 "POLARIS11", 118 "POLARIS12", 119 "VEGAM", 120 "VEGA10", 121 "VEGA12", 122 "VEGA20", 123 "RAVEN", 124 "ARCTURUS", 125 "RENOIR", 126 "ALDEBARAN", 127 "NAVI10", 128 "CYAN_SKILLFISH", 129 "NAVI14", 130 "NAVI12", 131 "SIENNA_CICHLID", 132 "NAVY_FLOUNDER", 133 "VANGOGH", 134 "DIMGREY_CAVEFISH", 135 "BEIGE_GOBY", 136 "YELLOW_CARP", 137 "IP DISCOVERY", 138 "LAST", 139 }; 140 141 /** 142 * DOC: pcie_replay_count 143 * 144 * The amdgpu driver provides a sysfs API for reporting the total number 145 * of PCIe replays (NAKs) 146 * The file pcie_replay_count is used for this and returns the total 147 * number of replays as a sum of the NAKs generated and NAKs received 148 */ 149 150 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 151 struct device_attribute *attr, char *buf) 152 { 153 struct drm_device *ddev = dev_get_drvdata(dev); 154 struct amdgpu_device *adev = drm_to_adev(ddev); 155 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 156 157 return sysfs_emit(buf, "%llu\n", cnt); 158 } 159 160 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 161 amdgpu_device_get_pcie_replay_count, NULL); 162 163 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 164 165 /** 166 * DOC: product_name 167 * 168 * The amdgpu driver provides a sysfs API for reporting the product name 169 * for the device 170 * The file serial_number is used for this and returns the product name 171 * as returned from the FRU. 172 * NOTE: This is only available for certain server cards 173 */ 174 175 static ssize_t amdgpu_device_get_product_name(struct device *dev, 176 struct device_attribute *attr, char *buf) 177 { 178 struct drm_device *ddev = dev_get_drvdata(dev); 179 struct amdgpu_device *adev = drm_to_adev(ddev); 180 181 return sysfs_emit(buf, "%s\n", adev->product_name); 182 } 183 184 static DEVICE_ATTR(product_name, S_IRUGO, 185 amdgpu_device_get_product_name, NULL); 186 187 /** 188 * DOC: product_number 189 * 190 * The amdgpu driver provides a sysfs API for reporting the part number 191 * for the device 192 * The file serial_number is used for this and returns the part number 193 * as returned from the FRU. 194 * NOTE: This is only available for certain server cards 195 */ 196 197 static ssize_t amdgpu_device_get_product_number(struct device *dev, 198 struct device_attribute *attr, char *buf) 199 { 200 struct drm_device *ddev = dev_get_drvdata(dev); 201 struct amdgpu_device *adev = drm_to_adev(ddev); 202 203 return sysfs_emit(buf, "%s\n", adev->product_number); 204 } 205 206 static DEVICE_ATTR(product_number, S_IRUGO, 207 amdgpu_device_get_product_number, NULL); 208 209 /** 210 * DOC: serial_number 211 * 212 * The amdgpu driver provides a sysfs API for reporting the serial number 213 * for the device 214 * The file serial_number is used for this and returns the serial number 215 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* fall back to MM_INDEX/MM_DATA for the rest of the VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
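
/*
 * Illustrative sketch (not part of the driver): how a caller might use the
 * helper above to peek at a dword in VRAM. The 0x1000 offset and the local
 * variable are hypothetical; real callers include e.g. the debugfs VRAM
 * interface.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 *	dev_info(adev->dev, "VRAM dword at 0x1000: 0x%08x\n", val);
 */
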
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}
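
/*
 * Illustrative sketch (not part of the driver): reading a register through the
 * helper above while bypassing the KIQ path. The register offset is
 * hypothetical; in practice callers normally go through the RREG32()/
 * RREG32_NO_KIQ() style macros that wrap this function.
 *
 *	uint32_t val = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 */
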
/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
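
/*
 * Illustrative sketch (not part of the driver): the indirect helpers above are
 * normally wired up by the SOC code as the adev->pcie_rreg/pcie_wreg callbacks,
 * which supply the ASIC's PCIE index/data mmio offsets. A hypothetical direct
 * use would look like:
 *
 *	u32 val;
 *
 *	val = amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data, reg_addr);
 *	amdgpu_device_indirect_wreg(adev, pcie_index, pcie_data, reg_addr, val | 0x1);
 */
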
/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
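
/*
 * Illustrative sketch (not part of the driver): golden register tables consumed
 * by the helper above are flat arrays of {offset, and_mask, or_mask} triplets.
 * The register names and values below are made up purely to show the layout.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmSOME_REG,  0xffffffff, 0x00000100,
 *		mmOTHER_REG, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */
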
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

#ifdef __linux__
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;
#endif

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
#ifdef __linux__
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
#endif

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment+1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should + 1 page (0x400 in dword)
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

#ifdef __linux__
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __linux__
	iounmap(adev->doorbell.ptr);
#else
	if (adev->doorbell.size > 0)
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
		    adev->doorbell.size);
#endif
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
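
/*
 * Illustrative sketch (not part of the driver): a ring or IP block typically
 * grabs a writeback slot at init, derives CPU and GPU addresses from it, and
 * returns it on teardown. The field usage mirrors what amdgpu rings do; the
 * local variable names here are hypothetical.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile uint32_t *cpu_ptr = &adev->wb.wb[wb];
 *		uint64_t gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */
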
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu
		 * hangs, while smc fw versions above 22.15 don't have this flaw, so we
		 * force vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpu_info *ci = curcpu();

	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
#ifdef __linux__
	struct sysinfo si;
#endif
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
#ifdef __linux__
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;
#else
	total_memory = ptoa(physmem);
#endif

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}
#endif /* __linux__ */

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
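
/*
 * Illustrative sketch (not part of the driver): power management code uses this
 * helper to gate or ungate clocks for one IP type across all of its instances,
 * for example when toggling clockgating for VCN:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *						   AMD_CG_STATE_GATE);
 */
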
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
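
/*
 * Illustrative sketch (not part of the driver): IP code can branch on a minimum
 * block version with the comparison helper above, for example requiring at
 * least GFX 8.1 before taking a code path (the version numbers here are only an
 * example):
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 1) == 0) {
 *		// GFX IP block is version 8.1 or newer
 *	}
 */
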
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

#ifdef notyet
	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
#endif
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1993 */ 1994 if (adev->asic_type != CHIP_NAVI12) 1995 return 0; 1996 } 1997 1998 switch (adev->asic_type) { 1999 default: 2000 return 0; 2001 case CHIP_VEGA10: 2002 chip_name = "vega10"; 2003 break; 2004 case CHIP_VEGA12: 2005 chip_name = "vega12"; 2006 break; 2007 case CHIP_RAVEN: 2008 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2009 chip_name = "raven2"; 2010 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2011 chip_name = "picasso"; 2012 else 2013 chip_name = "raven"; 2014 break; 2015 case CHIP_ARCTURUS: 2016 chip_name = "arcturus"; 2017 break; 2018 case CHIP_NAVI12: 2019 chip_name = "navi12"; 2020 break; 2021 } 2022 2023 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2024 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2025 if (err) { 2026 dev_err(adev->dev, 2027 "Failed to load gpu_info firmware \"%s\"\n", 2028 fw_name); 2029 goto out; 2030 } 2031 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2032 if (err) { 2033 dev_err(adev->dev, 2034 "Failed to validate gpu_info firmware \"%s\"\n", 2035 fw_name); 2036 goto out; 2037 } 2038 2039 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2040 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2041 2042 switch (hdr->version_major) { 2043 case 1: 2044 { 2045 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2046 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2047 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2048 2049 /* 2050 * Should be droped when DAL no longer needs it. 2051 */ 2052 if (adev->asic_type == CHIP_NAVI12) 2053 goto parse_soc_bounding_box; 2054 2055 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2056 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2057 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2058 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2059 adev->gfx.config.max_texture_channel_caches = 2060 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2061 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2062 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2063 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2064 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2065 adev->gfx.config.double_offchip_lds_buf = 2066 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2067 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2068 adev->gfx.cu_info.max_waves_per_simd = 2069 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2070 adev->gfx.cu_info.max_scratch_slots_per_cu = 2071 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2072 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2073 if (hdr->version_minor >= 1) { 2074 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2075 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2076 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2077 adev->gfx.config.num_sc_per_sh = 2078 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2079 adev->gfx.config.num_packer_per_sc = 2080 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2081 } 2082 2083 parse_soc_bounding_box: 2084 /* 2085 * soc bounding box info is not integrated in disocovery table, 2086 * we always need to parse it from gpu info firmware if needed. 
2087 */ 2088 if (hdr->version_minor == 2) { 2089 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2090 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2091 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2092 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2093 } 2094 break; 2095 } 2096 default: 2097 dev_err(adev->dev, 2098 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2099 err = -EINVAL; 2100 goto out; 2101 } 2102 out: 2103 return err; 2104 } 2105 2106 /** 2107 * amdgpu_device_ip_early_init - run early init for hardware IPs 2108 * 2109 * @adev: amdgpu_device pointer 2110 * 2111 * Early initialization pass for hardware IPs. The hardware IPs that make 2112 * up each asic are discovered each IP's early_init callback is run. This 2113 * is the first stage in initializing the asic. 2114 * Returns 0 on success, negative error code on failure. 2115 */ 2116 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2117 { 2118 struct drm_device *dev = adev_to_drm(adev); 2119 struct pci_dev *parent; 2120 int i, r; 2121 2122 amdgpu_device_enable_virtual_display(adev); 2123 2124 if (amdgpu_sriov_vf(adev)) { 2125 r = amdgpu_virt_request_full_gpu(adev, true); 2126 if (r) 2127 return r; 2128 } 2129 2130 switch (adev->asic_type) { 2131 #ifdef CONFIG_DRM_AMDGPU_SI 2132 case CHIP_VERDE: 2133 case CHIP_TAHITI: 2134 case CHIP_PITCAIRN: 2135 case CHIP_OLAND: 2136 case CHIP_HAINAN: 2137 adev->family = AMDGPU_FAMILY_SI; 2138 r = si_set_ip_blocks(adev); 2139 if (r) 2140 return r; 2141 break; 2142 #endif 2143 #ifdef CONFIG_DRM_AMDGPU_CIK 2144 case CHIP_BONAIRE: 2145 case CHIP_HAWAII: 2146 case CHIP_KAVERI: 2147 case CHIP_KABINI: 2148 case CHIP_MULLINS: 2149 if (adev->flags & AMD_IS_APU) 2150 adev->family = AMDGPU_FAMILY_KV; 2151 else 2152 adev->family = AMDGPU_FAMILY_CI; 2153 2154 r = cik_set_ip_blocks(adev); 2155 if (r) 2156 return r; 2157 break; 2158 #endif 2159 case CHIP_TOPAZ: 2160 case CHIP_TONGA: 2161 case CHIP_FIJI: 2162 case CHIP_POLARIS10: 2163 case CHIP_POLARIS11: 2164 case CHIP_POLARIS12: 2165 case CHIP_VEGAM: 2166 case CHIP_CARRIZO: 2167 case CHIP_STONEY: 2168 if (adev->flags & AMD_IS_APU) 2169 adev->family = AMDGPU_FAMILY_CZ; 2170 else 2171 adev->family = AMDGPU_FAMILY_VI; 2172 2173 r = vi_set_ip_blocks(adev); 2174 if (r) 2175 return r; 2176 break; 2177 default: 2178 r = amdgpu_discovery_set_ip_blocks(adev); 2179 if (r) 2180 return r; 2181 break; 2182 } 2183 2184 if (amdgpu_has_atpx() && 2185 (amdgpu_is_atpx_hybrid() || 2186 amdgpu_has_atpx_dgpu_power_cntl()) && 2187 ((adev->flags & AMD_IS_APU) == 0) && 2188 !pci_is_thunderbolt_attached(dev->pdev)) 2189 adev->flags |= AMD_IS_PX; 2190 2191 if (!(adev->flags & AMD_IS_APU)) { 2192 parent = pci_upstream_bridge(adev->pdev); 2193 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2194 } 2195 2196 amdgpu_amdkfd_device_probe(adev); 2197 2198 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2199 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2200 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2201 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2202 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2203 2204 for (i = 0; i < adev->num_ip_blocks; i++) { 2205 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2206 DRM_ERROR("disabled ip block: %d <%s>\n", 2207 i, adev->ip_blocks[i].version->funcs->name); 2208 adev->ip_blocks[i].status.valid = false; 2209 } else { 2210 if (adev->ip_blocks[i].version->funcs->early_init) { 2211 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2212 if (r == -ENOENT) { 2213 adev->ip_blocks[i].status.valid = false; 2214 } else if (r) { 2215 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2216 adev->ip_blocks[i].version->funcs->name, r); 2217 return r; 2218 } else { 2219 adev->ip_blocks[i].status.valid = true; 2220 } 2221 } else { 2222 adev->ip_blocks[i].status.valid = true; 2223 } 2224 } 2225 /* get the vbios after the asic_funcs are set up */ 2226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2227 r = amdgpu_device_parse_gpu_info_fw(adev); 2228 if (r) 2229 return r; 2230 2231 /* Read BIOS */ 2232 if (!amdgpu_get_bios(adev)) 2233 return -EINVAL; 2234 2235 r = amdgpu_atombios_init(adev); 2236 if (r) { 2237 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2239 return r; 2240 } 2241 2242 /*get pf2vf msg info at it's earliest time*/ 2243 if (amdgpu_sriov_vf(adev)) 2244 amdgpu_virt_init_data_exchange(adev); 2245 2246 } 2247 } 2248 2249 adev->cg_flags &= amdgpu_cg_mask; 2250 adev->pg_flags &= amdgpu_pg_mask; 2251 2252 return 0; 2253 } 2254 2255 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2256 { 2257 int i, r; 2258 2259 for (i = 0; i < adev->num_ip_blocks; i++) { 2260 if (!adev->ip_blocks[i].status.sw) 2261 continue; 2262 if (adev->ip_blocks[i].status.hw) 2263 continue; 2264 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2265 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2266 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2267 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2268 if (r) { 2269 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2270 adev->ip_blocks[i].version->funcs->name, r); 2271 return r; 2272 } 2273 adev->ip_blocks[i].status.hw = true; 2274 } 2275 } 2276 2277 return 0; 2278 } 2279 2280 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2281 { 2282 int i, r; 2283 2284 for (i = 0; i < adev->num_ip_blocks; i++) { 2285 if (!adev->ip_blocks[i].status.sw) 2286 continue; 2287 if (adev->ip_blocks[i].status.hw) 2288 continue; 2289 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2290 if (r) { 2291 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2292 adev->ip_blocks[i].version->funcs->name, r); 2293 return r; 2294 } 2295 adev->ip_blocks[i].status.hw = true; 2296 } 2297 2298 return 0; 2299 } 2300 2301 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2302 { 2303 int r = 0; 2304 int i; 2305 uint32_t smu_version; 2306 2307 if (adev->asic_type >= CHIP_VEGA10) { 2308 for (i = 0; i < adev->num_ip_blocks; i++) { 2309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2310 continue; 2311 2312 if 
(!adev->ip_blocks[i].status.sw) 2313 continue; 2314 2315 /* no need to do the fw loading again if already done*/ 2316 if (adev->ip_blocks[i].status.hw == true) 2317 break; 2318 2319 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2320 r = adev->ip_blocks[i].version->funcs->resume(adev); 2321 if (r) { 2322 DRM_ERROR("resume of IP block <%s> failed %d\n", 2323 adev->ip_blocks[i].version->funcs->name, r); 2324 return r; 2325 } 2326 } else { 2327 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2328 if (r) { 2329 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2330 adev->ip_blocks[i].version->funcs->name, r); 2331 return r; 2332 } 2333 } 2334 2335 adev->ip_blocks[i].status.hw = true; 2336 break; 2337 } 2338 } 2339 2340 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2341 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2342 2343 return r; 2344 } 2345 2346 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2347 { 2348 long timeout; 2349 int r, i; 2350 2351 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2352 struct amdgpu_ring *ring = adev->rings[i]; 2353 2354 /* No need to setup the GPU scheduler for rings that don't need it */ 2355 if (!ring || ring->no_scheduler) 2356 continue; 2357 2358 switch (ring->funcs->type) { 2359 case AMDGPU_RING_TYPE_GFX: 2360 timeout = adev->gfx_timeout; 2361 break; 2362 case AMDGPU_RING_TYPE_COMPUTE: 2363 timeout = adev->compute_timeout; 2364 break; 2365 case AMDGPU_RING_TYPE_SDMA: 2366 timeout = adev->sdma_timeout; 2367 break; 2368 default: 2369 timeout = adev->video_timeout; 2370 break; 2371 } 2372 2373 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2374 ring->num_hw_submission, amdgpu_job_hang_limit, 2375 timeout, adev->reset_domain->wq, 2376 ring->sched_score, ring->name, 2377 adev->dev); 2378 if (r) { 2379 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2380 ring->name); 2381 return r; 2382 } 2383 } 2384 2385 return 0; 2386 } 2387 2388 2389 /** 2390 * amdgpu_device_ip_init - run init for hardware IPs 2391 * 2392 * @adev: amdgpu_device pointer 2393 * 2394 * Main initialization pass for hardware IPs. The list of all the hardware 2395 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2396 * are run. sw_init initializes the software state associated with each IP 2397 * and hw_init initializes the hardware associated with each IP. 2398 * Returns 0 on success, negative error code on failure. 
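 *
 * As implemented below, the rough ordering is: amdgpu_ras_init, sw_init for
 * every valid block with an early hw_init of the COMMON and GMC blocks (so GPU
 * memory can be allocated), IB pool and ucode BO setup, hw_init phase 1
 * (COMMON/IH, plus PSP under SR-IOV), firmware loading, and finally hw_init
 * phase 2 for the remaining blocks.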
2399 */ 2400 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2401 { 2402 int i, r; 2403 2404 r = amdgpu_ras_init(adev); 2405 if (r) 2406 return r; 2407 2408 for (i = 0; i < adev->num_ip_blocks; i++) { 2409 if (!adev->ip_blocks[i].status.valid) 2410 continue; 2411 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2412 if (r) { 2413 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2414 adev->ip_blocks[i].version->funcs->name, r); 2415 goto init_failed; 2416 } 2417 adev->ip_blocks[i].status.sw = true; 2418 2419 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2420 /* need to do common hw init early so everything is set up for gmc */ 2421 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2422 if (r) { 2423 DRM_ERROR("hw_init %d failed %d\n", i, r); 2424 goto init_failed; 2425 } 2426 adev->ip_blocks[i].status.hw = true; 2427 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2428 /* need to do gmc hw init early so we can allocate gpu mem */ 2429 /* Try to reserve bad pages early */ 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_exchange_data(adev); 2432 2433 r = amdgpu_device_vram_scratch_init(adev); 2434 if (r) { 2435 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2436 goto init_failed; 2437 } 2438 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2439 if (r) { 2440 DRM_ERROR("hw_init %d failed %d\n", i, r); 2441 goto init_failed; 2442 } 2443 r = amdgpu_device_wb_init(adev); 2444 if (r) { 2445 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2446 goto init_failed; 2447 } 2448 adev->ip_blocks[i].status.hw = true; 2449 2450 /* right after GMC hw init, we create CSA */ 2451 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2452 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2453 AMDGPU_GEM_DOMAIN_VRAM, 2454 AMDGPU_CSA_SIZE); 2455 if (r) { 2456 DRM_ERROR("allocate CSA failed %d\n", r); 2457 goto init_failed; 2458 } 2459 } 2460 } 2461 } 2462 2463 if (amdgpu_sriov_vf(adev)) 2464 amdgpu_virt_init_data_exchange(adev); 2465 2466 r = amdgpu_ib_pool_init(adev); 2467 if (r) { 2468 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2469 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2470 goto init_failed; 2471 } 2472 2473 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2474 if (r) 2475 goto init_failed; 2476 2477 r = amdgpu_device_ip_hw_init_phase1(adev); 2478 if (r) 2479 goto init_failed; 2480 2481 r = amdgpu_device_fw_loading(adev); 2482 if (r) 2483 goto init_failed; 2484 2485 r = amdgpu_device_ip_hw_init_phase2(adev); 2486 if (r) 2487 goto init_failed; 2488 2489 /* 2490 * retired pages will be loaded from eeprom and reserved here, 2491 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2492 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2493 * for I2C communication which only true at this point. 2494 * 2495 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2496 * failure from bad gpu situation and stop amdgpu init process 2497 * accordingly. For other failed cases, it will still release all 2498 * the resource and print error message, rather than returning one 2499 * negative value to upper level. 
2500 * 2501 * Note: theoretically, this should be called before all vram allocations 2502 * to protect retired pages from being reused 2503 */ 2504 r = amdgpu_ras_recovery_init(adev); 2505 if (r) 2506 goto init_failed; 2507 2508 /* 2509 * In case of XGMI, grab an extra reference on the reset domain for this device 2510 */ 2511 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2512 if (amdgpu_xgmi_add_device(adev) == 0) { 2513 if (!amdgpu_sriov_vf(adev)) { 2514 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2515 2516 if (WARN_ON(!hive)) { 2517 r = -ENOENT; 2518 goto init_failed; 2519 } 2520 2521 if (!hive->reset_domain || 2522 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2523 r = -ENOENT; 2524 amdgpu_put_xgmi_hive(hive); 2525 goto init_failed; 2526 } 2527 2528 /* Drop the early temporary reset domain we created for this device */ 2529 amdgpu_reset_put_reset_domain(adev->reset_domain); 2530 adev->reset_domain = hive->reset_domain; 2531 amdgpu_put_xgmi_hive(hive); 2532 } 2533 } 2534 } 2535 2536 r = amdgpu_device_init_schedulers(adev); 2537 if (r) 2538 goto init_failed; 2539 2540 /* Don't init kfd if the whole hive needs to be reset during init */ 2541 if (!adev->gmc.xgmi.pending_reset) 2542 amdgpu_amdkfd_device_init(adev); 2543 2544 amdgpu_fru_get_product_info(adev); 2545 2546 init_failed: 2547 2548 return r; 2549 } 2550 2551 /** 2552 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2553 * 2554 * @adev: amdgpu_device pointer 2555 * 2556 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2557 * this function before a GPU reset. If the value is retained after a 2558 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2559 */ 2560 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2561 { 2562 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2563 } 2564 2565 /** 2566 * amdgpu_device_check_vram_lost - check if vram is valid 2567 * 2568 * @adev: amdgpu_device pointer 2569 * 2570 * Checks the reset magic value written to the gart pointer in VRAM. 2571 * The driver calls this after a GPU reset to see if the contents of 2572 * VRAM have been lost or not. 2573 * Returns true if vram is lost, false if not. 2574 */ 2575 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2576 { 2577 if (memcmp(adev->gart.ptr, adev->reset_magic, 2578 AMDGPU_RESET_MAGIC_NUM)) 2579 return true; 2580 2581 if (!amdgpu_in_reset(adev)) 2582 return false; 2583 2584 /* 2585 * For all ASICs with baco/mode1 reset, the VRAM is 2586 * always assumed to be lost. 2587 */ 2588 switch (amdgpu_asic_reset_method(adev)) { 2589 case AMD_RESET_METHOD_BACO: 2590 case AMD_RESET_METHOD_MODE1: 2591 return true; 2592 default: 2593 return false; 2594 } 2595 } 2596 2597 /** 2598 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2599 * 2600 * @adev: amdgpu_device pointer 2601 * @state: clockgating state (gate or ungate) 2602 * 2603 * The list of all the hardware IPs that make up the asic is walked and the 2604 * set_clockgating_state callbacks are run. 2605 * During late init this pass enables clockgating for hardware IPs; during 2606 * fini or suspend it is used to disable clockgating. 2607 * Returns 0 on success, negative error code on failure.
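 *
 * Note that when gating, the IP list is walked front to back, while ungating
 * walks it in reverse, mirroring the init/fini ordering of the blocks.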
2608 */ 2609 2610 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2611 enum amd_clockgating_state state) 2612 { 2613 int i, j, r; 2614 2615 if (amdgpu_emu_mode == 1) 2616 return 0; 2617 2618 for (j = 0; j < adev->num_ip_blocks; j++) { 2619 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2620 if (!adev->ip_blocks[i].status.late_initialized) 2621 continue; 2622 /* skip CG for GFX on S0ix */ 2623 if (adev->in_s0ix && 2624 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2625 continue; 2626 /* skip CG for VCE/UVD, it's handled specially */ 2627 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2628 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2629 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2631 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2632 /* enable clockgating to save power */ 2633 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2634 state); 2635 if (r) { 2636 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2637 adev->ip_blocks[i].version->funcs->name, r); 2638 return r; 2639 } 2640 } 2641 } 2642 2643 return 0; 2644 } 2645 2646 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2647 enum amd_powergating_state state) 2648 { 2649 int i, j, r; 2650 2651 if (amdgpu_emu_mode == 1) 2652 return 0; 2653 2654 for (j = 0; j < adev->num_ip_blocks; j++) { 2655 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2656 if (!adev->ip_blocks[i].status.late_initialized) 2657 continue; 2658 /* skip PG for GFX on S0ix */ 2659 if (adev->in_s0ix && 2660 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2661 continue; 2662 /* skip CG for VCE/UVD, it's handled specially */ 2663 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2664 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2665 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2666 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2667 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2668 /* enable powergating to save power */ 2669 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2670 state); 2671 if (r) { 2672 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2673 adev->ip_blocks[i].version->funcs->name, r); 2674 return r; 2675 } 2676 } 2677 } 2678 return 0; 2679 } 2680 2681 static int amdgpu_device_enable_mgpu_fan_boost(void) 2682 { 2683 struct amdgpu_gpu_instance *gpu_ins; 2684 struct amdgpu_device *adev; 2685 int i, ret = 0; 2686 2687 mutex_lock(&mgpu_info.mutex); 2688 2689 /* 2690 * MGPU fan boost feature should be enabled 2691 * only when there are two or more dGPUs in 2692 * the system 2693 */ 2694 if (mgpu_info.num_dgpu < 2) 2695 goto out; 2696 2697 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2698 gpu_ins = &(mgpu_info.gpu_ins[i]); 2699 adev = gpu_ins->adev; 2700 if (!(adev->flags & AMD_IS_APU) && 2701 !gpu_ins->mgpu_fan_enabled) { 2702 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2703 if (ret) 2704 break; 2705 2706 gpu_ins->mgpu_fan_enabled = 1; 2707 } 2708 } 2709 2710 out: 2711 mutex_unlock(&mgpu_info.mutex); 2712 2713 return ret; 2714 } 2715 2716 /** 2717 * amdgpu_device_ip_late_init - run late init for hardware IPs 2718 * 2719 * @adev: amdgpu_device pointer 2720 * 2721 * Late initialization pass for hardware IPs. 
The list of all the hardware 2722 * IPs that make up the asic is walked and the late_init callbacks are run. 2723 * late_init covers any special initialization that an IP requires 2724 * after all of them have been initialized or something that needs to happen 2725 * late in the init process. 2726 * Returns 0 on success, negative error code on failure. 2727 */ 2728 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2729 { 2730 struct amdgpu_gpu_instance *gpu_instance; 2731 int i = 0, r; 2732 2733 for (i = 0; i < adev->num_ip_blocks; i++) { 2734 if (!adev->ip_blocks[i].status.hw) 2735 continue; 2736 if (adev->ip_blocks[i].version->funcs->late_init) { 2737 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2738 if (r) { 2739 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2740 adev->ip_blocks[i].version->funcs->name, r); 2741 return r; 2742 } 2743 } 2744 adev->ip_blocks[i].status.late_initialized = true; 2745 } 2746 2747 r = amdgpu_ras_late_init(adev); 2748 if (r) { 2749 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2750 return r; 2751 } 2752 2753 amdgpu_ras_set_error_query_ready(adev, true); 2754 2755 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2756 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2757 2758 amdgpu_device_fill_reset_magic(adev); 2759 2760 r = amdgpu_device_enable_mgpu_fan_boost(); 2761 if (r) 2762 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2763 2764 /* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */ 2765 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2766 adev->asic_type == CHIP_ALDEBARAN)) 2767 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2768 2769 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2770 mutex_lock(&mgpu_info.mutex); 2771 2772 /* 2773 * Reset the device p-state to low, as it was booted with high. 2774 * 2775 * This should be performed only after all devices from the same 2776 * hive get initialized. 2777 * 2778 * However, the number of devices in the hive is not known in advance; 2779 * it is counted one by one as the devices initialize. 2780 * 2781 * So, we wait for all XGMI interlinked devices to be initialized. 2782 * This may bring some delays as those devices may come from 2783 * different hives. But that should be OK.
2784 */ 2785 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2786 for (i = 0; i < mgpu_info.num_gpu; i++) { 2787 gpu_instance = &(mgpu_info.gpu_ins[i]); 2788 if (gpu_instance->adev->flags & AMD_IS_APU) 2789 continue; 2790 2791 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2792 AMDGPU_XGMI_PSTATE_MIN); 2793 if (r) { 2794 DRM_ERROR("pstate setting failed (%d).\n", r); 2795 break; 2796 } 2797 } 2798 } 2799 2800 mutex_unlock(&mgpu_info.mutex); 2801 } 2802 2803 return 0; 2804 } 2805 2806 /** 2807 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2808 * 2809 * @adev: amdgpu_device pointer 2810 * 2811 * For ASICs that need to disable the SMC first 2812 */ 2813 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2814 { 2815 int i, r; 2816 2817 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2818 return; 2819 2820 for (i = 0; i < adev->num_ip_blocks; i++) { 2821 if (!adev->ip_blocks[i].status.hw) 2822 continue; 2823 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2824 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2825 /* XXX handle errors */ 2826 if (r) { 2827 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2828 adev->ip_blocks[i].version->funcs->name, r); 2829 } 2830 adev->ip_blocks[i].status.hw = false; 2831 break; 2832 } 2833 } 2834 } 2835 2836 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2837 { 2838 int i, r; 2839 2840 for (i = 0; i < adev->num_ip_blocks; i++) { 2841 if (!adev->ip_blocks[i].version->funcs->early_fini) 2842 continue; 2843 2844 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2845 if (r) { 2846 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2847 adev->ip_blocks[i].version->funcs->name, r); 2848 } 2849 } 2850 2851 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2852 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2853 2854 amdgpu_amdkfd_suspend(adev, false); 2855 2856 /* Workaround for ASICs that need to disable the SMC first */ 2857 amdgpu_device_smu_fini_early(adev); 2858 2859 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2860 if (!adev->ip_blocks[i].status.hw) 2861 continue; 2862 2863 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2864 /* XXX handle errors */ 2865 if (r) { 2866 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2867 adev->ip_blocks[i].version->funcs->name, r); 2868 } 2869 2870 adev->ip_blocks[i].status.hw = false; 2871 } 2872 2873 if (amdgpu_sriov_vf(adev)) { 2874 if (amdgpu_virt_release_full_gpu(adev, false)) 2875 DRM_ERROR("failed to release exclusive mode on fini\n"); 2876 } 2877 2878 return 0; 2879 } 2880 2881 /** 2882 * amdgpu_device_ip_fini - run fini for hardware IPs 2883 * 2884 * @adev: amdgpu_device pointer 2885 * 2886 * Main teardown pass for hardware IPs. The list of all the hardware 2887 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2888 * are run. hw_fini tears down the hardware associated with each IP 2889 * and sw_fini tears down any software state associated with each IP. 2890 * Returns 0 on success, negative error code on failure.
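 *
 * Note that teardown walks the IP list in reverse order of initialization.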
2891 */ 2892 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2893 { 2894 int i, r; 2895 2896 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2897 amdgpu_virt_release_ras_err_handler_data(adev); 2898 2899 if (adev->gmc.xgmi.num_physical_nodes > 1) 2900 amdgpu_xgmi_remove_device(adev); 2901 2902 amdgpu_amdkfd_device_fini_sw(adev); 2903 2904 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2905 if (!adev->ip_blocks[i].status.sw) 2906 continue; 2907 2908 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2909 amdgpu_ucode_free_bo(adev); 2910 amdgpu_free_static_csa(&adev->virt.csa_obj); 2911 amdgpu_device_wb_fini(adev); 2912 amdgpu_device_vram_scratch_fini(adev); 2913 amdgpu_ib_pool_fini(adev); 2914 } 2915 2916 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2917 /* XXX handle errors */ 2918 if (r) { 2919 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2920 adev->ip_blocks[i].version->funcs->name, r); 2921 } 2922 adev->ip_blocks[i].status.sw = false; 2923 adev->ip_blocks[i].status.valid = false; 2924 } 2925 2926 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2927 if (!adev->ip_blocks[i].status.late_initialized) 2928 continue; 2929 if (adev->ip_blocks[i].version->funcs->late_fini) 2930 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2931 adev->ip_blocks[i].status.late_initialized = false; 2932 } 2933 2934 amdgpu_ras_fini(adev); 2935 2936 return 0; 2937 } 2938 2939 /** 2940 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2941 * 2942 * @work: work_struct. 2943 */ 2944 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2945 { 2946 struct amdgpu_device *adev = 2947 container_of(work, struct amdgpu_device, delayed_init_work.work); 2948 int r; 2949 2950 r = amdgpu_ib_ring_tests(adev); 2951 if (r) 2952 DRM_ERROR("ib ring test failed (%d).\n", r); 2953 } 2954 2955 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2956 { 2957 struct amdgpu_device *adev = 2958 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2959 2960 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2961 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2962 2963 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2964 adev->gfx.gfx_off_state = true; 2965 } 2966 2967 /** 2968 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2969 * 2970 * @adev: amdgpu_device pointer 2971 * 2972 * Main suspend function for hardware IPs. The list of all the hardware 2973 * IPs that make up the asic is walked, clockgating is disabled and the 2974 * suspend callbacks are run. suspend puts the hardware and software state 2975 * in each IP into a state suitable for suspend. 2976 * Returns 0 on success, negative error code on failure. 2977 */ 2978 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2979 { 2980 int i, r; 2981 2982 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2983 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2984 2985 /* 2986 * Per PMFW team's suggestion, driver needs to handle gfxoff 2987 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2988 * scenario. Add the missing df cstate disablement here. 
2989 */ 2990 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2991 dev_warn(adev->dev, "Failed to disallow df cstate"); 2992 2993 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2994 if (!adev->ip_blocks[i].status.valid) 2995 continue; 2996 2997 /* displays are handled separately */ 2998 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2999 continue; 3000 3001 /* XXX handle errors */ 3002 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3003 /* XXX handle errors */ 3004 if (r) { 3005 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3006 adev->ip_blocks[i].version->funcs->name, r); 3007 return r; 3008 } 3009 3010 adev->ip_blocks[i].status.hw = false; 3011 } 3012 3013 return 0; 3014 } 3015 3016 /** 3017 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3018 * 3019 * @adev: amdgpu_device pointer 3020 * 3021 * Main suspend function for hardware IPs. The list of all the hardware 3022 * IPs that make up the asic is walked, clockgating is disabled and the 3023 * suspend callbacks are run. suspend puts the hardware and software state 3024 * in each IP into a state suitable for suspend. 3025 * Returns 0 on success, negative error code on failure. 3026 */ 3027 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3028 { 3029 int i, r; 3030 3031 if (adev->in_s0ix) 3032 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3033 3034 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3035 if (!adev->ip_blocks[i].status.valid) 3036 continue; 3037 /* displays are handled in phase1 */ 3038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3039 continue; 3040 /* PSP lost connection when err_event_athub occurs */ 3041 if (amdgpu_ras_intr_triggered() && 3042 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3043 adev->ip_blocks[i].status.hw = false; 3044 continue; 3045 } 3046 3047 /* skip unnecessary suspend if we do not initialize them yet */ 3048 if (adev->gmc.xgmi.pending_reset && 3049 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3050 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3051 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3053 adev->ip_blocks[i].status.hw = false; 3054 continue; 3055 } 3056 3057 /* skip suspend of gfx/mes and psp for S0ix 3058 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3059 * like at runtime. PSP is also part of the always on hardware 3060 * so no need to suspend it. 3061 */ 3062 if (adev->in_s0ix && 3063 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3064 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3065 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3066 continue; 3067 3068 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3069 if (adev->in_s0ix && 3070 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3071 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3072 continue; 3073 3074 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3075 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3076 * from this location and RLC Autoload automatically also gets loaded 3077 * from here based on PMFW -> PSP message during re-init sequence. 3078 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3079 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3080 */ 3081 if (amdgpu_in_reset(adev) && 3082 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3083 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3084 continue; 3085 3086 /* XXX handle errors */ 3087 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3088 /* XXX handle errors */ 3089 if (r) { 3090 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3091 adev->ip_blocks[i].version->funcs->name, r); 3092 } 3093 adev->ip_blocks[i].status.hw = false; 3094 /* handle putting the SMC in the appropriate state */ 3095 if(!amdgpu_sriov_vf(adev)){ 3096 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3097 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3098 if (r) { 3099 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3100 adev->mp1_state, r); 3101 return r; 3102 } 3103 } 3104 } 3105 } 3106 3107 return 0; 3108 } 3109 3110 /** 3111 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3112 * 3113 * @adev: amdgpu_device pointer 3114 * 3115 * Main suspend function for hardware IPs. The list of all the hardware 3116 * IPs that make up the asic is walked, clockgating is disabled and the 3117 * suspend callbacks are run. suspend puts the hardware and software state 3118 * in each IP into a state suitable for suspend. 3119 * Returns 0 on success, negative error code on failure. 3120 */ 3121 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3122 { 3123 int r; 3124 3125 if (amdgpu_sriov_vf(adev)) { 3126 amdgpu_virt_fini_data_exchange(adev); 3127 amdgpu_virt_request_full_gpu(adev, false); 3128 } 3129 3130 r = amdgpu_device_ip_suspend_phase1(adev); 3131 if (r) 3132 return r; 3133 r = amdgpu_device_ip_suspend_phase2(adev); 3134 3135 if (amdgpu_sriov_vf(adev)) 3136 amdgpu_virt_release_full_gpu(adev, false); 3137 3138 return r; 3139 } 3140 3141 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3142 { 3143 int i, r; 3144 3145 static enum amd_ip_block_type ip_order[] = { 3146 AMD_IP_BLOCK_TYPE_COMMON, 3147 AMD_IP_BLOCK_TYPE_GMC, 3148 AMD_IP_BLOCK_TYPE_PSP, 3149 AMD_IP_BLOCK_TYPE_IH, 3150 }; 3151 3152 for (i = 0; i < adev->num_ip_blocks; i++) { 3153 int j; 3154 struct amdgpu_ip_block *block; 3155 3156 block = &adev->ip_blocks[i]; 3157 block->status.hw = false; 3158 3159 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3160 3161 if (block->version->type != ip_order[j] || 3162 !block->status.valid) 3163 continue; 3164 3165 r = block->version->funcs->hw_init(adev); 3166 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3167 if (r) 3168 return r; 3169 block->status.hw = true; 3170 } 3171 } 3172 3173 return 0; 3174 } 3175 3176 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3177 { 3178 int i, r; 3179 3180 static enum amd_ip_block_type ip_order[] = { 3181 AMD_IP_BLOCK_TYPE_SMC, 3182 AMD_IP_BLOCK_TYPE_DCE, 3183 AMD_IP_BLOCK_TYPE_GFX, 3184 AMD_IP_BLOCK_TYPE_SDMA, 3185 AMD_IP_BLOCK_TYPE_UVD, 3186 AMD_IP_BLOCK_TYPE_VCE, 3187 AMD_IP_BLOCK_TYPE_VCN 3188 }; 3189 3190 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3191 int j; 3192 struct amdgpu_ip_block *block; 3193 3194 for (j = 0; j < adev->num_ip_blocks; j++) { 3195 block = &adev->ip_blocks[j]; 3196 3197 if (block->version->type != ip_order[i] || 3198 !block->status.valid || 3199 block->status.hw) 3200 continue; 3201 3202 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3203 r = block->version->funcs->resume(adev); 3204 else 3205 r = block->version->funcs->hw_init(adev); 3206 3207 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3208 if (r) 3209 return r; 3210 block->status.hw = true; 3211 } 3212 } 3213 3214 return 0; 3215 } 3216 3217 /** 3218 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3219 * 3220 * @adev: amdgpu_device pointer 3221 * 3222 * First resume function for hardware IPs. The list of all the hardware 3223 * IPs that make up the asic is walked and the resume callbacks are run for 3224 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3225 * after a suspend and updates the software state as necessary. This 3226 * function is also used for restoring the GPU after a GPU reset. 3227 * Returns 0 on success, negative error code on failure. 3228 */ 3229 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3230 { 3231 int i, r; 3232 3233 for (i = 0; i < adev->num_ip_blocks; i++) { 3234 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3235 continue; 3236 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3237 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3238 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3239 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3240 3241 r = adev->ip_blocks[i].version->funcs->resume(adev); 3242 if (r) { 3243 DRM_ERROR("resume of IP block <%s> failed %d\n", 3244 adev->ip_blocks[i].version->funcs->name, r); 3245 return r; 3246 } 3247 adev->ip_blocks[i].status.hw = true; 3248 } 3249 } 3250 3251 return 0; 3252 } 3253 3254 /** 3255 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3256 * 3257 * @adev: amdgpu_device pointer 3258 * 3259 * Second resume function for hardware IPs. The list of all the hardware 3260 * IPs that make up the asic is walked and the resume callbacks are run for 3261 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 3262 * functional state after a suspend and updates the software state as 3263 * necessary. This function is also used for restoring the GPU after a GPU 3264 * reset. 3265 * Returns 0 on success, negative error code on failure. 3266 */ 3267 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3268 { 3269 int i, r; 3270 3271 for (i = 0; i < adev->num_ip_blocks; i++) { 3272 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3273 continue; 3274 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3275 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3276 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3278 continue; 3279 r = adev->ip_blocks[i].version->funcs->resume(adev); 3280 if (r) { 3281 DRM_ERROR("resume of IP block <%s> failed %d\n", 3282 adev->ip_blocks[i].version->funcs->name, r); 3283 return r; 3284 } 3285 adev->ip_blocks[i].status.hw = true; 3286 3287 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3288 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3289 * amdgpu_device_resume() after IP resume. 3290 */ 3291 amdgpu_gfx_off_ctrl(adev, false); 3292 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3293 } 3294 3295 } 3296 3297 return 0; 3298 } 3299 3300 /** 3301 * amdgpu_device_ip_resume - run resume for hardware IPs 3302 * 3303 * @adev: amdgpu_device pointer 3304 * 3305 * Main resume function for hardware IPs.
The hardware IPs 3306 * are split into two resume functions because they are 3307 * also used in recovering from a GPU reset and some additional 3308 * steps need to be taken between them. In this case (S3/S4) they are 3309 * run sequentially. 3310 * Returns 0 on success, negative error code on failure. 3311 */ 3312 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3313 { 3314 int r; 3315 3316 r = amdgpu_amdkfd_resume_iommu(adev); 3317 if (r) 3318 return r; 3319 3320 r = amdgpu_device_ip_resume_phase1(adev); 3321 if (r) 3322 return r; 3323 3324 r = amdgpu_device_fw_loading(adev); 3325 if (r) 3326 return r; 3327 3328 r = amdgpu_device_ip_resume_phase2(adev); 3329 3330 return r; 3331 } 3332 3333 /** 3334 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3335 * 3336 * @adev: amdgpu_device pointer 3337 * 3338 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3339 */ 3340 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3341 { 3342 if (amdgpu_sriov_vf(adev)) { 3343 if (adev->is_atom_fw) { 3344 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3345 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3346 } else { 3347 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3348 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3349 } 3350 3351 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3352 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3353 } 3354 } 3355 3356 /** 3357 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3358 * 3359 * @asic_type: AMD asic type 3360 * 3361 * Check if there is DC (new modesetting infrastructure) support for an asic. 3362 * Returns true if DC has support, false if not. 3363 */ 3364 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3365 { 3366 switch (asic_type) { 3367 #ifdef CONFIG_DRM_AMDGPU_SI 3368 case CHIP_HAINAN: 3369 #endif 3370 case CHIP_TOPAZ: 3371 /* chips with no display hardware */ 3372 return false; 3373 #if defined(CONFIG_DRM_AMD_DC) 3374 case CHIP_TAHITI: 3375 case CHIP_PITCAIRN: 3376 case CHIP_VERDE: 3377 case CHIP_OLAND: 3378 /* 3379 * We have systems in the wild with these ASICs that require 3380 * LVDS and VGA support which is not supported with DC. 3381 * 3382 * Fallback to the non-DC driver here by default so as not to 3383 * cause regressions. 3384 */ 3385 #if defined(CONFIG_DRM_AMD_DC_SI) 3386 return amdgpu_dc > 0; 3387 #else 3388 return false; 3389 #endif 3390 case CHIP_BONAIRE: 3391 case CHIP_KAVERI: 3392 case CHIP_KABINI: 3393 case CHIP_MULLINS: 3394 /* 3395 * We have systems in the wild with these ASICs that require 3396 * VGA support which is not supported with DC. 3397 * 3398 * Fallback to the non-DC driver here by default so as not to 3399 * cause regressions.
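 *
 * (A positive dc setting, i.e. amdgpu_dc > 0, explicitly opts these ASICs
 * back into DC, as the return statement below shows.)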
3400 */ 3401 return amdgpu_dc > 0; 3402 default: 3403 return amdgpu_dc != 0; 3404 #else 3405 default: 3406 if (amdgpu_dc > 0) 3407 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3408 "but isn't supported by ASIC, ignoring\n"); 3409 return false; 3410 #endif 3411 } 3412 } 3413 3414 /** 3415 * amdgpu_device_has_dc_support - check if dc is supported 3416 * 3417 * @adev: amdgpu_device pointer 3418 * 3419 * Returns true for supported, false for not supported 3420 */ 3421 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3422 { 3423 if (amdgpu_sriov_vf(adev) || 3424 adev->enable_virtual_display || 3425 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3426 return false; 3427 3428 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3429 } 3430 3431 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3432 { 3433 struct amdgpu_device *adev = 3434 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3435 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3436 3437 /* It's a bug to not have a hive within this function */ 3438 if (WARN_ON(!hive)) 3439 return; 3440 3441 /* 3442 * Use task barrier to synchronize all xgmi reset works across the 3443 * hive. task_barrier_enter and task_barrier_exit will block 3444 * until all the threads running the xgmi reset works reach 3445 * those points. task_barrier_full will do both blocks. 3446 */ 3447 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3448 3449 task_barrier_enter(&hive->tb); 3450 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3451 3452 if (adev->asic_reset_res) 3453 goto fail; 3454 3455 task_barrier_exit(&hive->tb); 3456 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3457 3458 if (adev->asic_reset_res) 3459 goto fail; 3460 3461 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3462 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3463 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3464 } else { 3465 3466 task_barrier_full(&hive->tb); 3467 adev->asic_reset_res = amdgpu_asic_reset(adev); 3468 } 3469 3470 fail: 3471 if (adev->asic_reset_res) 3472 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3473 adev->asic_reset_res, adev_to_drm(adev)->unique); 3474 amdgpu_put_xgmi_hive(hive); 3475 } 3476 3477 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3478 { 3479 char *input = amdgpu_lockup_timeout; 3480 char *timeout_setting = NULL; 3481 int index = 0; 3482 long timeout; 3483 int ret = 0; 3484 3485 /* 3486 * By default timeout for non compute jobs is 10000 3487 * and 60000 for compute jobs. 3488 * In SR-IOV or passthrough mode, timeout for compute 3489 * jobs are 60000 by default. 3490 */ 3491 adev->gfx_timeout = msecs_to_jiffies(10000); 3492 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3493 if (amdgpu_sriov_vf(adev)) 3494 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3495 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3496 else 3497 adev->compute_timeout = msecs_to_jiffies(60000); 3498 3499 #ifdef notyet 3500 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3501 while ((timeout_setting = strsep(&input, ",")) && 3502 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3503 ret = kstrtol(timeout_setting, 0, &timeout); 3504 if (ret) 3505 return ret; 3506 3507 if (timeout == 0) { 3508 index++; 3509 continue; 3510 } else if (timeout < 0) { 3511 timeout = MAX_SCHEDULE_TIMEOUT; 3512 dev_warn(adev->dev, "lockup timeout disabled"); 3513 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3514 } else { 3515 timeout = msecs_to_jiffies(timeout); 3516 } 3517 3518 switch (index++) { 3519 case 0: 3520 adev->gfx_timeout = timeout; 3521 break; 3522 case 1: 3523 adev->compute_timeout = timeout; 3524 break; 3525 case 2: 3526 adev->sdma_timeout = timeout; 3527 break; 3528 case 3: 3529 adev->video_timeout = timeout; 3530 break; 3531 default: 3532 break; 3533 } 3534 } 3535 /* 3536 * There is only one value specified and 3537 * it should apply to all non-compute jobs. 3538 */ 3539 if (index == 1) { 3540 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3541 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3542 adev->compute_timeout = adev->gfx_timeout; 3543 } 3544 } 3545 #endif 3546 3547 return ret; 3548 } 3549 3550 /** 3551 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3552 * 3553 * @adev: amdgpu_device pointer 3554 * 3555 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3556 */ 3557 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3558 { 3559 #ifdef notyet 3560 struct iommu_domain *domain; 3561 3562 domain = iommu_get_domain_for_dev(adev->dev); 3563 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3564 #endif 3565 adev->ram_is_direct_mapped = true; 3566 } 3567 3568 static const struct attribute *amdgpu_dev_attributes[] = { 3569 &dev_attr_product_name.attr, 3570 &dev_attr_product_number.attr, 3571 &dev_attr_serial_number.attr, 3572 &dev_attr_pcie_replay_count.attr, 3573 NULL 3574 }; 3575 3576 /** 3577 * amdgpu_device_init - initialize the driver 3578 * 3579 * @adev: amdgpu_device pointer 3580 * @flags: driver flags 3581 * 3582 * Initializes the driver info and hw (all asics). 3583 * Returns 0 for success or an error on failure. 3584 * Called at driver startup. 
3585 */ 3586 int amdgpu_device_init(struct amdgpu_device *adev, 3587 uint32_t flags) 3588 { 3589 struct drm_device *ddev = adev_to_drm(adev); 3590 struct pci_dev *pdev = adev->pdev; 3591 int r, i; 3592 bool px = false; 3593 u32 max_MBps; 3594 int tmp; 3595 3596 adev->shutdown = false; 3597 adev->flags = flags; 3598 3599 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3600 adev->asic_type = amdgpu_force_asic_type; 3601 else 3602 adev->asic_type = flags & AMD_ASIC_MASK; 3603 3604 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3605 if (amdgpu_emu_mode == 1) 3606 adev->usec_timeout *= 10; 3607 adev->gmc.gart_size = 512 * 1024 * 1024; 3608 adev->accel_working = false; 3609 adev->num_rings = 0; 3610 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3611 adev->mman.buffer_funcs = NULL; 3612 adev->mman.buffer_funcs_ring = NULL; 3613 adev->vm_manager.vm_pte_funcs = NULL; 3614 adev->vm_manager.vm_pte_num_scheds = 0; 3615 adev->gmc.gmc_funcs = NULL; 3616 adev->harvest_ip_mask = 0x0; 3617 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3618 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3619 3620 adev->smc_rreg = &amdgpu_invalid_rreg; 3621 adev->smc_wreg = &amdgpu_invalid_wreg; 3622 adev->pcie_rreg = &amdgpu_invalid_rreg; 3623 adev->pcie_wreg = &amdgpu_invalid_wreg; 3624 adev->pciep_rreg = &amdgpu_invalid_rreg; 3625 adev->pciep_wreg = &amdgpu_invalid_wreg; 3626 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3627 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3628 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3629 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3630 adev->didt_rreg = &amdgpu_invalid_rreg; 3631 adev->didt_wreg = &amdgpu_invalid_wreg; 3632 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3633 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3634 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3635 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3636 3637 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3638 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3639 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3640 3641 /* mutex initialization are all done here so we 3642 * can recall function without having locking issues */ 3643 rw_init(&adev->firmware.mutex, "agfw"); 3644 rw_init(&adev->pm.mutex, "agpm"); 3645 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3646 rw_init(&adev->srbm_mutex, "srbm"); 3647 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3648 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3649 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3650 rw_init(&adev->mn_lock, "agpumn"); 3651 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3652 hash_init(adev->mn_hash); 3653 rw_init(&adev->psp.mutex, "agpsp"); 3654 rw_init(&adev->notifier_lock, "agnf"); 3655 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3656 rw_init(&adev->benchmark_mutex, "agbm"); 3657 3658 amdgpu_device_init_apu_flags(adev); 3659 3660 r = amdgpu_device_check_arguments(adev); 3661 if (r) 3662 return r; 3663 3664 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3665 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3666 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3667 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3668 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3669 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3670 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3671 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3672 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3673 3674 INIT_LIST_HEAD(&adev->shadow_list); 3675 
rw_init(&adev->shadow_list_lock, "sdwlst"); 3676 3677 INIT_LIST_HEAD(&adev->reset_list); 3678 3679 INIT_LIST_HEAD(&adev->ras_list); 3680 3681 INIT_DELAYED_WORK(&adev->delayed_init_work, 3682 amdgpu_device_delayed_init_work_handler); 3683 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3684 amdgpu_device_delay_enable_gfx_off); 3685 3686 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3687 3688 adev->gfx.gfx_off_req_count = 1; 3689 adev->gfx.gfx_off_residency = 0; 3690 adev->gfx.gfx_off_entrycount = 0; 3691 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3692 3693 atomic_set(&adev->throttling_logging_enabled, 1); 3694 /* 3695 * If throttling continues, logging will be performed every minute 3696 * to avoid log flooding. "-1" is subtracted since the thermal 3697 * throttling interrupt comes every second. Thus, the total logging 3698 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3699 * for the throttling interrupt) = 60 seconds. 3700 */ 3701 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3702 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3703 3704 #ifdef __linux__ 3705 /* Registers mapping */ 3706 /* TODO: block userspace mapping of io register */ 3707 if (adev->asic_type >= CHIP_BONAIRE) { 3708 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3709 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3710 } else { 3711 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3712 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3713 } 3714 3715 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3716 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3717 3718 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3719 if (adev->rmmio == NULL) { 3720 return -ENOMEM; 3721 } 3722 #endif 3723 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3724 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3725 3726 amdgpu_device_get_pcie_info(adev); 3727 3728 if (amdgpu_mcbp) 3729 DRM_INFO("MCBP is enabled\n"); 3730 3731 /* 3732 * The reset domain needs to be present early, before the XGMI hive is 3733 * discovered (if any) and initialized, so that the reset semaphore and 3734 * in_gpu reset flag can be used early on during init and before calling RREG32.
3735 */ 3736 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3737 if (!adev->reset_domain) 3738 return -ENOMEM; 3739 3740 /* detect hw virtualization here */ 3741 amdgpu_detect_virtualization(adev); 3742 3743 r = amdgpu_device_get_job_timeout_settings(adev); 3744 if (r) { 3745 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3746 return r; 3747 } 3748 3749 /* early init functions */ 3750 r = amdgpu_device_ip_early_init(adev); 3751 if (r) 3752 return r; 3753 3754 /* Get rid of things like offb */ 3755 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3756 if (r) 3757 return r; 3758 3759 /* Enable TMZ based on IP_VERSION */ 3760 amdgpu_gmc_tmz_set(adev); 3761 3762 amdgpu_gmc_noretry_set(adev); 3763 /* Need to get xgmi info early to decide the reset behavior */ 3764 if (adev->gmc.xgmi.supported) { 3765 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3766 if (r) 3767 return r; 3768 } 3769 3770 /* enable PCIE atomic ops */ 3771 #ifdef notyet 3772 if (amdgpu_sriov_vf(adev)) 3773 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3774 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3775 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3776 /* APUs with gfx9 onwards don't rely on PCIe atomics; the internal path 3777 * natively supports atomics, so set have_atomics_support to true. 3778 */ 3779 else if ((adev->flags & AMD_IS_APU) && 3780 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3781 adev->have_atomics_support = true; 3782 else 3783 adev->have_atomics_support = 3784 !pci_enable_atomic_ops_to_root(adev->pdev, 3785 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3786 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3787 if (!adev->have_atomics_support) 3788 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3789 #else 3790 /* APUs with gfx9 onwards don't rely on PCIe atomics; the internal path 3791 * natively supports atomics, so set have_atomics_support to true. 3792 */ 3793 if ((adev->flags & AMD_IS_APU) && 3794 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3795 adev->have_atomics_support = true; 3796 else 3797 adev->have_atomics_support = false; 3798 #endif 3799 3800 /* doorbell bar mapping and doorbell index init */ 3801 amdgpu_device_doorbell_init(adev); 3802 3803 if (amdgpu_emu_mode == 1) { 3804 /* post the asic on emulation mode */ 3805 emu_soc_asic_init(adev); 3806 goto fence_driver_init; 3807 } 3808 3809 amdgpu_reset_init(adev); 3810 3811 /* detect if we are running with an SRIOV vbios */ 3812 amdgpu_device_detect_sriov_bios(adev); 3813 3814 /* check if we need to reset the asic 3815 * E.g., driver was not cleanly unloaded previously, etc.
3816 */ 3817 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3818 if (adev->gmc.xgmi.num_physical_nodes) { 3819 dev_info(adev->dev, "Pending hive reset.\n"); 3820 adev->gmc.xgmi.pending_reset = true; 3821 /* Only need to init necessary block for SMU to handle the reset */ 3822 for (i = 0; i < adev->num_ip_blocks; i++) { 3823 if (!adev->ip_blocks[i].status.valid) 3824 continue; 3825 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3826 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3827 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3828 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3829 DRM_DEBUG("IP %s disabled for hw_init.\n", 3830 adev->ip_blocks[i].version->funcs->name); 3831 adev->ip_blocks[i].status.hw = true; 3832 } 3833 } 3834 } else { 3835 tmp = amdgpu_reset_method; 3836 /* It should do a default reset when loading or reloading the driver, 3837 * regardless of the module parameter reset_method. 3838 */ 3839 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3840 r = amdgpu_asic_reset(adev); 3841 amdgpu_reset_method = tmp; 3842 if (r) { 3843 dev_err(adev->dev, "asic reset on init failed\n"); 3844 goto failed; 3845 } 3846 } 3847 } 3848 3849 pci_enable_pcie_error_reporting(adev->pdev); 3850 3851 /* Post card if necessary */ 3852 if (amdgpu_device_need_post(adev)) { 3853 if (!adev->bios) { 3854 dev_err(adev->dev, "no vBIOS found\n"); 3855 r = -EINVAL; 3856 goto failed; 3857 } 3858 DRM_INFO("GPU posting now...\n"); 3859 r = amdgpu_device_asic_init(adev); 3860 if (r) { 3861 dev_err(adev->dev, "gpu post error!\n"); 3862 goto failed; 3863 } 3864 } 3865 3866 if (adev->is_atom_fw) { 3867 /* Initialize clocks */ 3868 r = amdgpu_atomfirmware_get_clock_info(adev); 3869 if (r) { 3870 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3871 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3872 goto failed; 3873 } 3874 } else { 3875 /* Initialize clocks */ 3876 r = amdgpu_atombios_get_clock_info(adev); 3877 if (r) { 3878 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3879 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3880 goto failed; 3881 } 3882 /* init i2c buses */ 3883 if (!amdgpu_device_has_dc_support(adev)) 3884 amdgpu_atombios_i2c_init(adev); 3885 } 3886 3887 fence_driver_init: 3888 /* Fence driver */ 3889 r = amdgpu_fence_driver_sw_init(adev); 3890 if (r) { 3891 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3892 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3893 goto failed; 3894 } 3895 3896 /* init the mode config */ 3897 drm_mode_config_init(adev_to_drm(adev)); 3898 3899 r = amdgpu_device_ip_init(adev); 3900 if (r) { 3901 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3902 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3903 goto release_ras_con; 3904 } 3905 3906 amdgpu_fence_driver_hw_init(adev); 3907 3908 dev_info(adev->dev, 3909 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3910 adev->gfx.config.max_shader_engines, 3911 adev->gfx.config.max_sh_per_se, 3912 adev->gfx.config.max_cu_per_sh, 3913 adev->gfx.cu_info.number); 3914 3915 #ifdef __OpenBSD__ 3916 { 3917 const char *chip_name; 3918 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3919 int maj, min, rev; 3920 3921 switch (adev->asic_type) { 3922 case CHIP_RAVEN: 3923 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3924 chip_name = "RAVEN2"; 3925 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3926 
chip_name = "PICASSO"; 3927 else 3928 chip_name = "RAVEN"; 3929 break; 3930 case CHIP_RENOIR: 3931 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3932 chip_name = "RENOIR"; 3933 else 3934 chip_name = "GREEN_SARDINE"; 3935 break; 3936 default: 3937 chip_name = amdgpu_asic_name[adev->asic_type]; 3938 } 3939 3940 printf("%s: %s", adev->self.dv_xname, chip_name); 3941 /* show graphics/compute ip block version, not set on < GFX9 */ 3942 if (version) { 3943 maj = IP_VERSION_MAJ(version); 3944 min = IP_VERSION_MIN(version); 3945 rev = IP_VERSION_REV(version); 3946 printf(" GC %d.%d.%d", maj, min, rev); 3947 } 3948 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3949 } 3950 #endif 3951 3952 adev->accel_working = true; 3953 3954 amdgpu_vm_check_compute_bug(adev); 3955 3956 /* Initialize the buffer migration limit. */ 3957 if (amdgpu_moverate >= 0) 3958 max_MBps = amdgpu_moverate; 3959 else 3960 max_MBps = 8; /* Allow 8 MB/s. */ 3961 /* Get a log2 for easy divisions. */ 3962 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3963 3964 r = amdgpu_pm_sysfs_init(adev); 3965 if (r) { 3966 adev->pm_sysfs_en = false; 3967 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3968 } else 3969 adev->pm_sysfs_en = true; 3970 3971 r = amdgpu_ucode_sysfs_init(adev); 3972 if (r) { 3973 adev->ucode_sysfs_en = false; 3974 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3975 } else 3976 adev->ucode_sysfs_en = true; 3977 3978 r = amdgpu_psp_sysfs_init(adev); 3979 if (r) { 3980 adev->psp_sysfs_en = false; 3981 if (!amdgpu_sriov_vf(adev)) 3982 DRM_ERROR("Creating psp sysfs failed\n"); 3983 } else 3984 adev->psp_sysfs_en = true; 3985 3986 /* 3987 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3988 * Otherwise the mgpu fan boost feature will be skipped due to the 3989 * gpu instance is counted less. 3990 */ 3991 amdgpu_register_gpu_instance(adev); 3992 3993 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3994 * explicit gating rather than handling it automatically. 3995 */ 3996 if (!adev->gmc.xgmi.pending_reset) { 3997 r = amdgpu_device_ip_late_init(adev); 3998 if (r) { 3999 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4000 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4001 goto release_ras_con; 4002 } 4003 /* must succeed. 
*/ 4004 amdgpu_ras_resume(adev); 4005 queue_delayed_work(system_wq, &adev->delayed_init_work, 4006 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4007 } 4008 4009 if (amdgpu_sriov_vf(adev)) { 4010 amdgpu_virt_release_full_gpu(adev, true); 4011 flush_delayed_work(&adev->delayed_init_work); 4012 } 4013 4014 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4015 if (r) 4016 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4017 4018 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4019 r = amdgpu_pmu_init(adev); 4020 if (r) 4021 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4022 4023 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4024 if (amdgpu_device_cache_pci_state(adev->pdev)) 4025 pci_restore_state(pdev); 4026 4027 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4028 /* this will fail for cards that aren't VGA class devices, just 4029 * ignore it */ 4030 #ifdef notyet 4031 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4032 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4033 #endif 4034 4035 px = amdgpu_device_supports_px(ddev); 4036 4037 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4038 apple_gmux_detect(NULL, NULL))) 4039 vga_switcheroo_register_client(adev->pdev, 4040 &amdgpu_switcheroo_ops, px); 4041 4042 if (px) 4043 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4044 4045 if (adev->gmc.xgmi.pending_reset) 4046 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4047 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4048 4049 amdgpu_device_check_iommu_direct_map(adev); 4050 4051 return 0; 4052 4053 release_ras_con: 4054 if (amdgpu_sriov_vf(adev)) 4055 amdgpu_virt_release_full_gpu(adev, true); 4056 4057 /* failed in exclusive mode due to timeout */ 4058 if (amdgpu_sriov_vf(adev) && 4059 !amdgpu_sriov_runtime(adev) && 4060 amdgpu_virt_mmio_blocked(adev) && 4061 !amdgpu_virt_wait_reset(adev)) { 4062 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4063 /* Don't send request since VF is inactive. 
*/ 4064 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4065 adev->virt.ops = NULL; 4066 r = -EAGAIN; 4067 } 4068 amdgpu_release_ras_context(adev); 4069 4070 failed: 4071 amdgpu_vf_error_trans_all(adev); 4072 4073 return r; 4074 } 4075 4076 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4077 { 4078 STUB(); 4079 #ifdef notyet 4080 /* Clear all CPU mappings pointing to this device */ 4081 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4082 #endif 4083 4084 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4085 amdgpu_device_doorbell_fini(adev); 4086 4087 #ifdef __linux__ 4088 iounmap(adev->rmmio); 4089 adev->rmmio = NULL; 4090 if (adev->mman.aper_base_kaddr) 4091 iounmap(adev->mman.aper_base_kaddr); 4092 adev->mman.aper_base_kaddr = NULL; 4093 #else 4094 if (adev->rmmio_size > 0) 4095 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4096 adev->rmmio_size); 4097 adev->rmmio_size = 0; 4098 adev->rmmio = NULL; 4099 if (adev->mman.aper_base_kaddr) 4100 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4101 adev->gmc.visible_vram_size); 4102 adev->mman.aper_base_kaddr = NULL; 4103 #endif 4104 4105 /* Memory manager related */ 4106 if (!adev->gmc.xgmi.connected_to_cpu) { 4107 #ifdef __linux__ 4108 arch_phys_wc_del(adev->gmc.vram_mtrr); 4109 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4110 #else 4111 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4112 #endif 4113 } 4114 } 4115 4116 /** 4117 * amdgpu_device_fini_hw - tear down the driver 4118 * 4119 * @adev: amdgpu_device pointer 4120 * 4121 * Tear down the driver info (all asics). 4122 * Called at driver shutdown. 4123 */ 4124 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4125 { 4126 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4127 flush_delayed_work(&adev->delayed_init_work); 4128 adev->shutdown = true; 4129 4130 /* make sure IB test finished before entering exclusive mode 4131 * to avoid preemption on IB test 4132 * */ 4133 if (amdgpu_sriov_vf(adev)) { 4134 amdgpu_virt_request_full_gpu(adev, false); 4135 amdgpu_virt_fini_data_exchange(adev); 4136 } 4137 4138 /* disable all interrupts */ 4139 amdgpu_irq_disable_all(adev); 4140 if (adev->mode_info.mode_config_initialized){ 4141 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4142 drm_helper_force_disable_all(adev_to_drm(adev)); 4143 else 4144 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4145 } 4146 amdgpu_fence_driver_hw_fini(adev); 4147 4148 if (adev->mman.initialized) { 4149 flush_delayed_work(&adev->mman.bdev.wq); 4150 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4151 } 4152 4153 if (adev->pm_sysfs_en) 4154 amdgpu_pm_sysfs_fini(adev); 4155 if (adev->ucode_sysfs_en) 4156 amdgpu_ucode_sysfs_fini(adev); 4157 if (adev->psp_sysfs_en) 4158 amdgpu_psp_sysfs_fini(adev); 4159 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4160 4161 /* disable ras feature must before hw fini */ 4162 amdgpu_ras_pre_fini(adev); 4163 4164 amdgpu_device_ip_fini_early(adev); 4165 4166 amdgpu_irq_fini_hw(adev); 4167 4168 if (adev->mman.initialized) 4169 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4170 4171 amdgpu_gart_dummy_page_fini(adev); 4172 4173 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4174 amdgpu_device_unmap_mmio(adev); 4175 4176 } 4177 4178 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4179 { 4180 int idx; 4181 bool px; 4182 4183 amdgpu_fence_driver_sw_fini(adev); 4184 amdgpu_device_ip_fini(adev); 4185 release_firmware(adev->firmware.gpu_info_fw); 4186 
adev->firmware.gpu_info_fw = NULL; 4187 adev->accel_working = false; 4188 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4189 4190 amdgpu_reset_fini(adev); 4191 4192 /* free i2c buses */ 4193 if (!amdgpu_device_has_dc_support(adev)) 4194 amdgpu_i2c_fini(adev); 4195 4196 if (amdgpu_emu_mode != 1) 4197 amdgpu_atombios_fini(adev); 4198 4199 kfree(adev->bios); 4200 adev->bios = NULL; 4201 4202 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4203 4204 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4205 apple_gmux_detect(NULL, NULL))) 4206 vga_switcheroo_unregister_client(adev->pdev); 4207 4208 if (px) 4209 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4210 4211 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4212 vga_client_unregister(adev->pdev); 4213 4214 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4215 #ifdef __linux__ 4216 iounmap(adev->rmmio); 4217 adev->rmmio = NULL; 4218 #else 4219 if (adev->rmmio_size > 0) 4220 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4221 adev->rmmio_size); 4222 adev->rmmio_size = 0; 4223 adev->rmmio = NULL; 4224 #endif 4225 amdgpu_device_doorbell_fini(adev); 4226 drm_dev_exit(idx); 4227 } 4228 4229 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4230 amdgpu_pmu_fini(adev); 4231 if (adev->mman.discovery_bin) 4232 amdgpu_discovery_fini(adev); 4233 4234 amdgpu_reset_put_reset_domain(adev->reset_domain); 4235 adev->reset_domain = NULL; 4236 4237 kfree(adev->pci_state); 4238 4239 } 4240 4241 /** 4242 * amdgpu_device_evict_resources - evict device resources 4243 * @adev: amdgpu device object 4244 * 4245 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4246 * of the vram memory type. Mainly used for evicting device resources 4247 * at suspend time. 4248 * 4249 */ 4250 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4251 { 4252 int ret; 4253 4254 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4255 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4256 return 0; 4257 4258 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4259 if (ret) 4260 DRM_WARN("evicting device resources failed\n"); 4261 return ret; 4262 } 4263 4264 /* 4265 * Suspend & resume. 4266 */ 4267 /** 4268 * amdgpu_device_suspend - initiate device suspend 4269 * 4270 * @dev: drm dev pointer 4271 * @fbcon : notify the fbdev of suspend 4272 * 4273 * Puts the hw in the suspend state (all asics). 4274 * Returns 0 for success or an error on failure. 4275 * Called at driver suspend. 
4276 */ 4277 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4278 { 4279 struct amdgpu_device *adev = drm_to_adev(dev); 4280 int r = 0; 4281 4282 if (adev->shutdown) 4283 return 0; 4284 4285 #ifdef notyet 4286 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4287 return 0; 4288 #endif 4289 4290 adev->in_suspend = true; 4291 4292 if (amdgpu_sriov_vf(adev)) { 4293 amdgpu_virt_fini_data_exchange(adev); 4294 r = amdgpu_virt_request_full_gpu(adev, false); 4295 if (r) 4296 return r; 4297 } 4298 4299 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4300 DRM_WARN("smart shift update failed\n"); 4301 4302 drm_kms_helper_poll_disable(dev); 4303 4304 if (fbcon) 4305 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4306 4307 cancel_delayed_work_sync(&adev->delayed_init_work); 4308 4309 amdgpu_ras_suspend(adev); 4310 4311 amdgpu_device_ip_suspend_phase1(adev); 4312 4313 if (!adev->in_s0ix) 4314 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4315 4316 r = amdgpu_device_evict_resources(adev); 4317 if (r) 4318 return r; 4319 4320 amdgpu_fence_driver_hw_fini(adev); 4321 4322 amdgpu_device_ip_suspend_phase2(adev); 4323 4324 if (amdgpu_sriov_vf(adev)) 4325 amdgpu_virt_release_full_gpu(adev, false); 4326 4327 return 0; 4328 } 4329 4330 /** 4331 * amdgpu_device_resume - initiate device resume 4332 * 4333 * @dev: drm dev pointer 4334 * @fbcon : notify the fbdev of resume 4335 * 4336 * Bring the hw back to operating state (all asics). 4337 * Returns 0 for success or an error on failure. 4338 * Called at driver resume. 4339 */ 4340 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4341 { 4342 struct amdgpu_device *adev = drm_to_adev(dev); 4343 int r = 0; 4344 4345 if (amdgpu_sriov_vf(adev)) { 4346 r = amdgpu_virt_request_full_gpu(adev, true); 4347 if (r) 4348 return r; 4349 } 4350 4351 #ifdef notyet 4352 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4353 return 0; 4354 #endif 4355 4356 if (adev->in_s0ix) 4357 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4358 4359 /* post card */ 4360 if (amdgpu_device_need_post(adev)) { 4361 r = amdgpu_device_asic_init(adev); 4362 if (r) 4363 dev_err(adev->dev, "amdgpu asic init failed\n"); 4364 } 4365 4366 r = amdgpu_device_ip_resume(adev); 4367 4368 /* no matter what r is, always need to properly release full GPU */ 4369 if (amdgpu_sriov_vf(adev)) { 4370 amdgpu_virt_init_data_exchange(adev); 4371 amdgpu_virt_release_full_gpu(adev, true); 4372 } 4373 4374 if (r) { 4375 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4376 return r; 4377 } 4378 amdgpu_fence_driver_hw_init(adev); 4379 4380 r = amdgpu_device_ip_late_init(adev); 4381 if (r) 4382 return r; 4383 4384 queue_delayed_work(system_wq, &adev->delayed_init_work, 4385 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4386 4387 if (!adev->in_s0ix) { 4388 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4389 if (r) 4390 return r; 4391 } 4392 4393 /* Make sure IB tests flushed */ 4394 flush_delayed_work(&adev->delayed_init_work); 4395 4396 if (adev->in_s0ix) { 4397 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4398 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 
4399 */ 4400 amdgpu_gfx_off_ctrl(adev, true); 4401 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4402 } 4403 if (fbcon) 4404 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4405 4406 drm_kms_helper_poll_enable(dev); 4407 4408 amdgpu_ras_resume(adev); 4409 4410 /* 4411 * Most of the connector probing functions try to acquire runtime pm 4412 * refs to ensure that the GPU is powered on when connector polling is 4413 * performed. Since we're calling this from a runtime PM callback, 4414 * trying to acquire rpm refs will cause us to deadlock. 4415 * 4416 * Since we're guaranteed to be holding the rpm lock, it's safe to 4417 * temporarily disable the rpm helpers so this doesn't deadlock us. 4418 */ 4419 #if defined(CONFIG_PM) && defined(__linux__) 4420 dev->dev->power.disable_depth++; 4421 #endif 4422 if (!amdgpu_device_has_dc_support(adev)) 4423 drm_helper_hpd_irq_event(dev); 4424 else 4425 drm_kms_helper_hotplug_event(dev); 4426 #if defined(CONFIG_PM) && defined(__linux__) 4427 dev->dev->power.disable_depth--; 4428 #endif 4429 adev->in_suspend = false; 4430 4431 if (adev->enable_mes) 4432 amdgpu_mes_self_test(adev); 4433 4434 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4435 DRM_WARN("smart shift update failed\n"); 4436 4437 return 0; 4438 } 4439 4440 /** 4441 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4442 * 4443 * @adev: amdgpu_device pointer 4444 * 4445 * The list of all the hardware IPs that make up the asic is walked and 4446 * the check_soft_reset callbacks are run. check_soft_reset determines 4447 * if the asic is still hung or not. 4448 * Returns true if any of the IPs are still in a hung state, false if not. 4449 */ 4450 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4451 { 4452 int i; 4453 bool asic_hang = false; 4454 4455 if (amdgpu_sriov_vf(adev)) 4456 return true; 4457 4458 if (amdgpu_asic_need_full_reset(adev)) 4459 return true; 4460 4461 for (i = 0; i < adev->num_ip_blocks; i++) { 4462 if (!adev->ip_blocks[i].status.valid) 4463 continue; 4464 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4465 adev->ip_blocks[i].status.hang = 4466 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4467 if (adev->ip_blocks[i].status.hang) { 4468 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4469 asic_hang = true; 4470 } 4471 } 4472 return asic_hang; 4473 } 4474 4475 /** 4476 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4477 * 4478 * @adev: amdgpu_device pointer 4479 * 4480 * The list of all the hardware IPs that make up the asic is walked and the 4481 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4482 * handles any IP specific hardware or software state changes that are 4483 * necessary for a soft reset to succeed. 4484 * Returns 0 on success, negative error code on failure. 
4485 */ 4486 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4487 { 4488 int i, r = 0; 4489 4490 for (i = 0; i < adev->num_ip_blocks; i++) { 4491 if (!adev->ip_blocks[i].status.valid) 4492 continue; 4493 if (adev->ip_blocks[i].status.hang && 4494 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4495 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4496 if (r) 4497 return r; 4498 } 4499 } 4500 4501 return 0; 4502 } 4503 4504 /** 4505 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4506 * 4507 * @adev: amdgpu_device pointer 4508 * 4509 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4510 * reset is necessary to recover. 4511 * Returns true if a full asic reset is required, false if not. 4512 */ 4513 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4514 { 4515 int i; 4516 4517 if (amdgpu_asic_need_full_reset(adev)) 4518 return true; 4519 4520 for (i = 0; i < adev->num_ip_blocks; i++) { 4521 if (!adev->ip_blocks[i].status.valid) 4522 continue; 4523 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4524 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4525 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4526 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4527 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4528 if (adev->ip_blocks[i].status.hang) { 4529 dev_info(adev->dev, "Some block need full reset!\n"); 4530 return true; 4531 } 4532 } 4533 } 4534 return false; 4535 } 4536 4537 /** 4538 * amdgpu_device_ip_soft_reset - do a soft reset 4539 * 4540 * @adev: amdgpu_device pointer 4541 * 4542 * The list of all the hardware IPs that make up the asic is walked and the 4543 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4544 * IP specific hardware or software state changes that are necessary to soft 4545 * reset the IP. 4546 * Returns 0 on success, negative error code on failure. 4547 */ 4548 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4549 { 4550 int i, r = 0; 4551 4552 for (i = 0; i < adev->num_ip_blocks; i++) { 4553 if (!adev->ip_blocks[i].status.valid) 4554 continue; 4555 if (adev->ip_blocks[i].status.hang && 4556 adev->ip_blocks[i].version->funcs->soft_reset) { 4557 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4558 if (r) 4559 return r; 4560 } 4561 } 4562 4563 return 0; 4564 } 4565 4566 /** 4567 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4568 * 4569 * @adev: amdgpu_device pointer 4570 * 4571 * The list of all the hardware IPs that make up the asic is walked and the 4572 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4573 * handles any IP specific hardware or software state changes that are 4574 * necessary after the IP has been soft reset. 4575 * Returns 0 on success, negative error code on failure. 
4576 */ 4577 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4578 { 4579 int i, r = 0; 4580 4581 for (i = 0; i < adev->num_ip_blocks; i++) { 4582 if (!adev->ip_blocks[i].status.valid) 4583 continue; 4584 if (adev->ip_blocks[i].status.hang && 4585 adev->ip_blocks[i].version->funcs->post_soft_reset) 4586 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4587 if (r) 4588 return r; 4589 } 4590 4591 return 0; 4592 } 4593 4594 /** 4595 * amdgpu_device_recover_vram - Recover some VRAM contents 4596 * 4597 * @adev: amdgpu_device pointer 4598 * 4599 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4600 * restore things like GPUVM page tables after a GPU reset where 4601 * the contents of VRAM might be lost. 4602 * 4603 * Returns: 4604 * 0 on success, negative error code on failure. 4605 */ 4606 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4607 { 4608 struct dma_fence *fence = NULL, *next = NULL; 4609 struct amdgpu_bo *shadow; 4610 struct amdgpu_bo_vm *vmbo; 4611 long r = 1, tmo; 4612 4613 if (amdgpu_sriov_runtime(adev)) 4614 tmo = msecs_to_jiffies(8000); 4615 else 4616 tmo = msecs_to_jiffies(100); 4617 4618 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4619 mutex_lock(&adev->shadow_list_lock); 4620 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4621 /* If vm is compute context or adev is APU, shadow will be NULL */ 4622 if (!vmbo->shadow) 4623 continue; 4624 shadow = vmbo->shadow; 4625 4626 /* No need to recover an evicted BO */ 4627 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4628 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4629 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4630 continue; 4631 4632 r = amdgpu_bo_restore_shadow(shadow, &next); 4633 if (r) 4634 break; 4635 4636 if (fence) { 4637 tmo = dma_fence_wait_timeout(fence, false, tmo); 4638 dma_fence_put(fence); 4639 fence = next; 4640 if (tmo == 0) { 4641 r = -ETIMEDOUT; 4642 break; 4643 } else if (tmo < 0) { 4644 r = tmo; 4645 break; 4646 } 4647 } else { 4648 fence = next; 4649 } 4650 } 4651 mutex_unlock(&adev->shadow_list_lock); 4652 4653 if (fence) 4654 tmo = dma_fence_wait_timeout(fence, false, tmo); 4655 dma_fence_put(fence); 4656 4657 if (r < 0 || tmo <= 0) { 4658 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4659 return -EIO; 4660 } 4661 4662 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4663 return 0; 4664 } 4665 4666 4667 /** 4668 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4669 * 4670 * @adev: amdgpu_device pointer 4671 * @from_hypervisor: request from hypervisor 4672 * 4673 * do VF FLR and reinitialize Asic 4674 * return 0 means succeeded otherwise failed 4675 */ 4676 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4677 bool from_hypervisor) 4678 { 4679 int r; 4680 struct amdgpu_hive_info *hive = NULL; 4681 int retry_limit = 0; 4682 4683 retry: 4684 amdgpu_amdkfd_pre_reset(adev); 4685 4686 if (from_hypervisor) 4687 r = amdgpu_virt_request_full_gpu(adev, true); 4688 else 4689 r = amdgpu_virt_reset_gpu(adev); 4690 if (r) 4691 return r; 4692 4693 /* Resume IP prior to SMC */ 4694 r = amdgpu_device_ip_reinit_early_sriov(adev); 4695 if (r) 4696 goto error; 4697 4698 amdgpu_virt_init_data_exchange(adev); 4699 4700 r = amdgpu_device_fw_loading(adev); 4701 if (r) 4702 return r; 4703 4704 /* now we are okay to resume SMC/CP/SDMA */ 4705 r = amdgpu_device_ip_reinit_late_sriov(adev); 4706 if (r) 4707 goto error; 4708 
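/* Note: amdgpu_get_xgmi_hive() takes a reference on the hive (if any); it is dropped below via amdgpu_put_xgmi_hive() once the PSP topology update is done. */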
4709 hive = amdgpu_get_xgmi_hive(adev); 4710 /* Update PSP FW topology after reset */ 4711 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4712 r = amdgpu_xgmi_update_topology(hive, adev); 4713 4714 if (hive) 4715 amdgpu_put_xgmi_hive(hive); 4716 4717 if (!r) { 4718 amdgpu_irq_gpu_reset_resume_helper(adev); 4719 r = amdgpu_ib_ring_tests(adev); 4720 4721 amdgpu_amdkfd_post_reset(adev); 4722 } 4723 4724 error: 4725 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4726 amdgpu_inc_vram_lost(adev); 4727 r = amdgpu_device_recover_vram(adev); 4728 } 4729 amdgpu_virt_release_full_gpu(adev, true); 4730 4731 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4732 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4733 retry_limit++; 4734 goto retry; 4735 } else 4736 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4737 } 4738 4739 return r; 4740 } 4741 4742 /** 4743 * amdgpu_device_has_job_running - check if there is any job in mirror list 4744 * 4745 * @adev: amdgpu_device pointer 4746 * 4747 * check if there is any job in mirror list 4748 */ 4749 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4750 { 4751 int i; 4752 struct drm_sched_job *job; 4753 4754 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4755 struct amdgpu_ring *ring = adev->rings[i]; 4756 4757 if (!ring || !ring->sched.thread) 4758 continue; 4759 4760 spin_lock(&ring->sched.job_list_lock); 4761 job = list_first_entry_or_null(&ring->sched.pending_list, 4762 struct drm_sched_job, list); 4763 spin_unlock(&ring->sched.job_list_lock); 4764 if (job) 4765 return true; 4766 } 4767 return false; 4768 } 4769 4770 /** 4771 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4772 * 4773 * @adev: amdgpu_device pointer 4774 * 4775 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4776 * a hung GPU. 
4777 */ 4778 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4779 { 4780 4781 if (amdgpu_gpu_recovery == 0) 4782 goto disabled; 4783 4784 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4785 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4786 return false; 4787 } 4788 4789 if (amdgpu_sriov_vf(adev)) 4790 return true; 4791 4792 if (amdgpu_gpu_recovery == -1) { 4793 switch (adev->asic_type) { 4794 #ifdef CONFIG_DRM_AMDGPU_SI 4795 case CHIP_VERDE: 4796 case CHIP_TAHITI: 4797 case CHIP_PITCAIRN: 4798 case CHIP_OLAND: 4799 case CHIP_HAINAN: 4800 #endif 4801 #ifdef CONFIG_DRM_AMDGPU_CIK 4802 case CHIP_KAVERI: 4803 case CHIP_KABINI: 4804 case CHIP_MULLINS: 4805 #endif 4806 case CHIP_CARRIZO: 4807 case CHIP_STONEY: 4808 case CHIP_CYAN_SKILLFISH: 4809 goto disabled; 4810 default: 4811 break; 4812 } 4813 } 4814 4815 return true; 4816 4817 disabled: 4818 dev_info(adev->dev, "GPU recovery disabled.\n"); 4819 return false; 4820 } 4821 4822 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4823 { 4824 u32 i; 4825 int ret = 0; 4826 4827 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4828 4829 dev_info(adev->dev, "GPU mode1 reset\n"); 4830 4831 /* disable BM */ 4832 pci_clear_master(adev->pdev); 4833 4834 amdgpu_device_cache_pci_state(adev->pdev); 4835 4836 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4837 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4838 ret = amdgpu_dpm_mode1_reset(adev); 4839 } else { 4840 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4841 ret = psp_gpu_reset(adev); 4842 } 4843 4844 if (ret) 4845 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4846 4847 amdgpu_device_load_pci_state(adev->pdev); 4848 4849 /* wait for asic to come out of reset */ 4850 for (i = 0; i < adev->usec_timeout; i++) { 4851 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4852 4853 if (memsize != 0xffffffff) 4854 break; 4855 udelay(1); 4856 } 4857 4858 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4859 return ret; 4860 } 4861 4862 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4863 struct amdgpu_reset_context *reset_context) 4864 { 4865 int i, r = 0; 4866 struct amdgpu_job *job = NULL; 4867 bool need_full_reset = 4868 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4869 4870 if (reset_context->reset_req_dev == adev) 4871 job = reset_context->job; 4872 4873 if (amdgpu_sriov_vf(adev)) { 4874 /* stop the data exchange thread */ 4875 amdgpu_virt_fini_data_exchange(adev); 4876 } 4877 4878 amdgpu_fence_driver_isr_toggle(adev, true); 4879 4880 /* block all schedulers and reset given job's ring */ 4881 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4882 struct amdgpu_ring *ring = adev->rings[i]; 4883 4884 if (!ring || !ring->sched.thread) 4885 continue; 4886 4887 /*clear job fence from fence drv to avoid force_completion 4888 *leave NULL and vm flush fence in fence drv */ 4889 amdgpu_fence_driver_clear_job_fences(ring); 4890 4891 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4892 amdgpu_fence_driver_force_completion(ring); 4893 } 4894 4895 amdgpu_fence_driver_isr_toggle(adev, false); 4896 4897 if (job && job->vm) 4898 drm_sched_increase_karma(&job->base); 4899 4900 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4901 /* If reset handler not implemented, continue; otherwise return */ 4902 if (r == -ENOSYS) 4903 r = 0; 4904 else 4905 return r; 4906 4907 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4908 if (!amdgpu_sriov_vf(adev)) { 4909 4910 if (!need_full_reset) 4911 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4912 4913 if (!need_full_reset && amdgpu_gpu_recovery) { 4914 amdgpu_device_ip_pre_soft_reset(adev); 4915 r = amdgpu_device_ip_soft_reset(adev); 4916 amdgpu_device_ip_post_soft_reset(adev); 4917 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4918 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4919 need_full_reset = true; 4920 } 4921 } 4922 4923 if (need_full_reset) 4924 r = amdgpu_device_ip_suspend(adev); 4925 if (need_full_reset) 4926 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4927 else 4928 clear_bit(AMDGPU_NEED_FULL_RESET, 4929 &reset_context->flags); 4930 } 4931 4932 return r; 4933 } 4934 4935 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4936 { 4937 int i; 4938 4939 lockdep_assert_held(&adev->reset_domain->sem); 4940 4941 for (i = 0; i < adev->num_regs; i++) { 4942 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4943 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4944 adev->reset_dump_reg_value[i]); 4945 } 4946 4947 return 0; 4948 } 4949 4950 #ifdef CONFIG_DEV_COREDUMP 4951 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4952 size_t count, void *data, size_t datalen) 4953 { 4954 struct drm_printer p; 4955 struct amdgpu_device *adev = data; 4956 struct drm_print_iterator iter; 4957 int i; 4958 4959 iter.data = buffer; 4960 iter.offset = 0; 4961 iter.start = offset; 4962 iter.remain = count; 4963 4964 p = drm_coredump_printer(&iter); 4965 4966 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4967 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4968 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4969 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4970 if (adev->reset_task_info.pid) 4971 drm_printf(&p, "process_name: %s PID: %d\n", 4972 adev->reset_task_info.process_name, 4973 adev->reset_task_info.pid); 4974 4975 if (adev->reset_vram_lost) 4976 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4977 if (adev->num_regs) { 4978 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4979 4980 for (i = 0; i < adev->num_regs; i++) 4981 drm_printf(&p, "0x%08x: 0x%08x\n", 4982 adev->reset_dump_reg_list[i], 4983 adev->reset_dump_reg_value[i]); 4984 } 4985 4986 return count - iter.remain; 4987 } 4988 4989 static void amdgpu_devcoredump_free(void *data) 4990 { 4991 } 4992 4993 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4994 { 4995 struct drm_device *dev = adev_to_drm(adev); 4996 4997 ktime_get_ts64(&adev->reset_time); 4998 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4999 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5000 } 5001 #endif 5002 5003 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5004 struct amdgpu_reset_context *reset_context) 5005 { 5006 struct amdgpu_device *tmp_adev = NULL; 5007 bool need_full_reset, skip_hw_reset, vram_lost = false; 5008 int r = 0; 5009 bool gpu_reset_for_dev_remove = 0; 5010 5011 /* Try reset handler method first */ 5012 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5013 reset_list); 5014 amdgpu_reset_reg_dumps(tmp_adev); 5015 5016 reset_context->reset_device_list = device_list_handle; 5017 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5018 /* If reset handler not implemented, continue; otherwise return */ 5019 if (r == -ENOSYS) 5020 r = 0; 5021 else 5022 return r; 5023 5024 /* Reset handler not implemented, use the default method */ 5025 
need_full_reset = 5026 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5027 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5028 5029 gpu_reset_for_dev_remove = 5030 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5031 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5032 5033 /* 5034 * ASIC reset has to be done on all XGMI hive nodes ASAP 5035 * to allow proper links negotiation in FW (within 1 sec) 5036 */ 5037 if (!skip_hw_reset && need_full_reset) { 5038 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5039 /* For XGMI run all resets in parallel to speed up the process */ 5040 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5041 tmp_adev->gmc.xgmi.pending_reset = false; 5042 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5043 r = -EALREADY; 5044 } else 5045 r = amdgpu_asic_reset(tmp_adev); 5046 5047 if (r) { 5048 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5049 r, adev_to_drm(tmp_adev)->unique); 5050 break; 5051 } 5052 } 5053 5054 /* For XGMI wait for all resets to complete before proceed */ 5055 if (!r) { 5056 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5057 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5058 flush_work(&tmp_adev->xgmi_reset_work); 5059 r = tmp_adev->asic_reset_res; 5060 if (r) 5061 break; 5062 } 5063 } 5064 } 5065 } 5066 5067 if (!r && amdgpu_ras_intr_triggered()) { 5068 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5069 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5070 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5071 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5072 } 5073 5074 amdgpu_ras_intr_cleared(); 5075 } 5076 5077 /* Since the mode1 reset affects base ip blocks, the 5078 * phase1 ip blocks need to be resumed. Otherwise there 5079 * will be a BIOS signature error and the psp bootloader 5080 * can't load kdb on the next amdgpu install. 5081 */ 5082 if (gpu_reset_for_dev_remove) { 5083 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5084 amdgpu_device_ip_resume_phase1(tmp_adev); 5085 5086 goto end; 5087 } 5088 5089 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5090 if (need_full_reset) { 5091 /* post card */ 5092 r = amdgpu_device_asic_init(tmp_adev); 5093 if (r) { 5094 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5095 } else { 5096 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5097 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5098 if (r) 5099 goto out; 5100 5101 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5102 if (r) 5103 goto out; 5104 5105 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5106 #ifdef CONFIG_DEV_COREDUMP 5107 tmp_adev->reset_vram_lost = vram_lost; 5108 memset(&tmp_adev->reset_task_info, 0, 5109 sizeof(tmp_adev->reset_task_info)); 5110 if (reset_context->job && reset_context->job->vm) 5111 tmp_adev->reset_task_info = 5112 reset_context->job->vm->task_info; 5113 amdgpu_reset_capture_coredumpm(tmp_adev); 5114 #endif 5115 if (vram_lost) { 5116 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5117 amdgpu_inc_vram_lost(tmp_adev); 5118 } 5119 5120 r = amdgpu_device_fw_loading(tmp_adev); 5121 if (r) 5122 return r; 5123 5124 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5125 if (r) 5126 goto out; 5127 5128 if (vram_lost) 5129 amdgpu_device_fill_reset_magic(tmp_adev); 5130 5131 /* 5132 * Add this ASIC as tracked as reset was already 5133 * complete successfully. 
5134 */ 5135 amdgpu_register_gpu_instance(tmp_adev); 5136 5137 if (!reset_context->hive && 5138 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5139 amdgpu_xgmi_add_device(tmp_adev); 5140 5141 r = amdgpu_device_ip_late_init(tmp_adev); 5142 if (r) 5143 goto out; 5144 5145 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5146 5147 /* 5148 * The GPU enters a bad state once the number of faulty pages 5149 * detected by ECC has reached the threshold, and ras 5150 * recovery is scheduled next. So add one check 5151 * here to break recovery if it indeed exceeds the 5152 * bad page threshold, and remind the user to 5153 * retire this GPU or set a bigger 5154 * bad_page_threshold value to fix this once 5155 * probing the driver again. 5156 */ 5157 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5158 /* must succeed. */ 5159 amdgpu_ras_resume(tmp_adev); 5160 } else { 5161 r = -EINVAL; 5162 goto out; 5163 } 5164 5165 /* Update PSP FW topology after reset */ 5166 if (reset_context->hive && 5167 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5168 r = amdgpu_xgmi_update_topology( 5169 reset_context->hive, tmp_adev); 5170 } 5171 } 5172 5173 out: 5174 if (!r) { 5175 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5176 r = amdgpu_ib_ring_tests(tmp_adev); 5177 if (r) { 5178 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5179 need_full_reset = true; 5180 r = -EAGAIN; 5181 goto end; 5182 } 5183 } 5184 5185 if (!r) 5186 r = amdgpu_device_recover_vram(tmp_adev); 5187 else 5188 tmp_adev->asic_reset_res = r; 5189 } 5190 5191 end: 5192 if (need_full_reset) 5193 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5194 else 5195 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5196 return r; 5197 } 5198 5199 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5200 { 5201 5202 switch (amdgpu_asic_reset_method(adev)) { 5203 case AMD_RESET_METHOD_MODE1: 5204 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5205 break; 5206 case AMD_RESET_METHOD_MODE2: 5207 adev->mp1_state = PP_MP1_STATE_RESET; 5208 break; 5209 default: 5210 adev->mp1_state = PP_MP1_STATE_NONE; 5211 break; 5212 } 5213 5214 5215 } 5216 5217 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5218 { 5219 amdgpu_vf_error_trans_all(adev); 5220 adev->mp1_state = PP_MP1_STATE_NONE; 5221 } 5222 5223 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5224 { 5225 STUB(); 5226 #ifdef notyet 5227 struct pci_dev *p = NULL; 5228 5229 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5230 adev->pdev->bus->number, 1); 5231 if (p) { 5232 pm_runtime_enable(&(p->dev)); 5233 pm_runtime_resume(&(p->dev)); 5234 } 5235 #endif 5236 } 5237 5238 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5239 { 5240 enum amd_reset_method reset_method; 5241 struct pci_dev *p = NULL; 5242 u64 expires; 5243 5244 /* 5245 * For now, only BACO and mode1 reset are confirmed 5246 * to suffer the audio issue without being properly suspended.
5247 */ 5248 reset_method = amdgpu_asic_reset_method(adev); 5249 if ((reset_method != AMD_RESET_METHOD_BACO) && 5250 (reset_method != AMD_RESET_METHOD_MODE1)) 5251 return -EINVAL; 5252 5253 STUB(); 5254 return -ENOSYS; 5255 #ifdef notyet 5256 5257 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5258 adev->pdev->bus->number, 1); 5259 if (!p) 5260 return -ENODEV; 5261 5262 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5263 if (!expires) 5264 /* 5265 * If we cannot get the audio device autosuspend delay, 5266 * a fixed 4S interval will be used. Considering 3S is 5267 * the audio controller's default autosuspend delay setting, 5268 * the 4S used here is guaranteed to cover that. 5269 */ 5270 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5271 5272 while (!pm_runtime_status_suspended(&(p->dev))) { 5273 if (!pm_runtime_suspend(&(p->dev))) 5274 break; 5275 5276 if (expires < ktime_get_mono_fast_ns()) { 5277 dev_warn(adev->dev, "failed to suspend display audio\n"); 5278 pci_dev_put(p); 5279 /* TODO: abort the succeeding gpu reset? */ 5280 return -ETIMEDOUT; 5281 } 5282 } 5283 5284 pm_runtime_disable(&(p->dev)); 5285 5286 pci_dev_put(p); 5287 return 0; 5288 #endif 5289 } 5290 5291 static void amdgpu_device_recheck_guilty_jobs( 5292 struct amdgpu_device *adev, struct list_head *device_list_handle, 5293 struct amdgpu_reset_context *reset_context) 5294 { 5295 int i, r = 0; 5296 5297 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5298 struct amdgpu_ring *ring = adev->rings[i]; 5299 int ret = 0; 5300 struct drm_sched_job *s_job; 5301 5302 if (!ring || !ring->sched.thread) 5303 continue; 5304 5305 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5306 struct drm_sched_job, list); 5307 if (s_job == NULL) 5308 continue; 5309 5310 /* clear the job's guilty status and rely on the following step to decide the real one */ 5311 drm_sched_reset_karma(s_job); 5312 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5313 5314 if (!s_job->s_fence->parent) { 5315 DRM_WARN("Failed to get a HW fence for job!"); 5316 continue; 5317 } 5318 5319 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5320 if (ret == 0) { /* timeout */ 5321 DRM_ERROR("Found the real bad job!
ring:%s, job_id:%llx\n", 5322 ring->sched.name, s_job->id); 5323 5324 5325 amdgpu_fence_driver_isr_toggle(adev, true); 5326 5327 /* Clear this failed job from fence array */ 5328 amdgpu_fence_driver_clear_job_fences(ring); 5329 5330 amdgpu_fence_driver_isr_toggle(adev, false); 5331 5332 /* Since the job won't signal and we go for 5333 * another resubmit drop this parent pointer 5334 */ 5335 dma_fence_put(s_job->s_fence->parent); 5336 s_job->s_fence->parent = NULL; 5337 5338 /* set guilty */ 5339 drm_sched_increase_karma(s_job); 5340 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5341 retry: 5342 /* do hw reset */ 5343 if (amdgpu_sriov_vf(adev)) { 5344 amdgpu_virt_fini_data_exchange(adev); 5345 r = amdgpu_device_reset_sriov(adev, false); 5346 if (r) 5347 adev->asic_reset_res = r; 5348 } else { 5349 clear_bit(AMDGPU_SKIP_HW_RESET, 5350 &reset_context->flags); 5351 r = amdgpu_do_asic_reset(device_list_handle, 5352 reset_context); 5353 if (r && r == -EAGAIN) 5354 goto retry; 5355 } 5356 5357 /* 5358 * add reset counter so that the following 5359 * resubmitted job could flush vmid 5360 */ 5361 atomic_inc(&adev->gpu_reset_counter); 5362 continue; 5363 } 5364 5365 /* got the hw fence, signal finished fence */ 5366 atomic_dec(ring->sched.score); 5367 dma_fence_get(&s_job->s_fence->finished); 5368 dma_fence_signal(&s_job->s_fence->finished); 5369 dma_fence_put(&s_job->s_fence->finished); 5370 5371 /* remove node from list and free the job */ 5372 spin_lock(&ring->sched.job_list_lock); 5373 list_del_init(&s_job->list); 5374 spin_unlock(&ring->sched.job_list_lock); 5375 ring->sched.ops->free_job(s_job); 5376 } 5377 } 5378 5379 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5380 { 5381 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5382 5383 #if defined(CONFIG_DEBUG_FS) 5384 if (!amdgpu_sriov_vf(adev)) 5385 cancel_work(&adev->reset_work); 5386 #endif 5387 5388 if (adev->kfd.dev) 5389 cancel_work(&adev->kfd.reset_work); 5390 5391 if (amdgpu_sriov_vf(adev)) 5392 cancel_work(&adev->virt.flr_work); 5393 5394 if (con && adev->ras_enabled) 5395 cancel_work(&con->recovery_work); 5396 5397 } 5398 5399 5400 /** 5401 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5402 * 5403 * @adev: amdgpu_device pointer 5404 * @job: which job trigger hang 5405 * 5406 * Attempt to reset the GPU if it has hung (all asics). 5407 * Attempt to do soft-reset or full-reset and reinitialize Asic 5408 * Returns 0 for success or an error on failure. 5409 */ 5410 5411 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5412 struct amdgpu_job *job, 5413 struct amdgpu_reset_context *reset_context) 5414 { 5415 struct list_head device_list, *device_list_handle = NULL; 5416 bool job_signaled = false; 5417 struct amdgpu_hive_info *hive = NULL; 5418 struct amdgpu_device *tmp_adev = NULL; 5419 int i, r = 0; 5420 bool need_emergency_restart = false; 5421 bool audio_suspended = false; 5422 int tmp_vram_lost_counter; 5423 bool gpu_reset_for_dev_remove = false; 5424 5425 gpu_reset_for_dev_remove = 5426 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5427 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5428 5429 /* 5430 * Special case: RAS triggered and full reset isn't supported 5431 */ 5432 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5433 5434 /* 5435 * Flush RAM to disk so that after reboot 5436 * the user can read log and see why the system rebooted. 
5437 */ 5438 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5439 DRM_WARN("Emergency reboot."); 5440 5441 #ifdef notyet 5442 ksys_sync_helper(); 5443 emergency_restart(); 5444 #else 5445 panic("emergency_restart"); 5446 #endif 5447 } 5448 5449 dev_info(adev->dev, "GPU %s begin!\n", 5450 need_emergency_restart ? "jobs stop":"reset"); 5451 5452 if (!amdgpu_sriov_vf(adev)) 5453 hive = amdgpu_get_xgmi_hive(adev); 5454 if (hive) 5455 mutex_lock(&hive->hive_lock); 5456 5457 reset_context->job = job; 5458 reset_context->hive = hive; 5459 /* 5460 * Build list of devices to reset. 5461 * In case we are in XGMI hive mode, resort the device list 5462 * to put adev in the 1st position. 5463 */ 5464 INIT_LIST_HEAD(&device_list); 5465 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5466 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5467 list_add_tail(&tmp_adev->reset_list, &device_list); 5468 if (gpu_reset_for_dev_remove && adev->shutdown) 5469 tmp_adev->shutdown = true; 5470 } 5471 if (!list_is_first(&adev->reset_list, &device_list)) 5472 list_rotate_to_front(&adev->reset_list, &device_list); 5473 device_list_handle = &device_list; 5474 } else { 5475 list_add_tail(&adev->reset_list, &device_list); 5476 device_list_handle = &device_list; 5477 } 5478 5479 /* We need to lock reset domain only once both for XGMI and single device */ 5480 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5481 reset_list); 5482 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5483 5484 /* block all schedulers and reset given job's ring */ 5485 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5486 5487 amdgpu_device_set_mp1_state(tmp_adev); 5488 5489 /* 5490 * Try to put the audio codec into suspend state 5491 * before the gpu reset starts. 5492 * 5493 * Because the power domain of the graphics device 5494 * is shared with the AZ power domain, without this 5495 * we may change the audio hardware from behind 5496 * the audio driver's back. That will trigger 5497 * some audio codec errors. 5498 */ 5499 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5500 audio_suspended = true; 5501 5502 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5503 5504 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5505 5506 if (!amdgpu_sriov_vf(tmp_adev)) 5507 amdgpu_amdkfd_pre_reset(tmp_adev); 5508 5509 /* 5510 * Mark these ASICs to be reset as untracked first, 5511 * and add them back after the reset has completed. 5512 */ 5513 amdgpu_unregister_gpu_instance(tmp_adev); 5514 5515 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5516 5517 /* disable ras on ALL IPs */ 5518 if (!need_emergency_restart && 5519 amdgpu_device_ip_need_full_reset(tmp_adev)) 5520 amdgpu_ras_suspend(tmp_adev); 5521 5522 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5523 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5524 5525 if (!ring || !ring->sched.thread) 5526 continue; 5527 5528 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5529 5530 if (need_emergency_restart) 5531 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5532 } 5533 atomic_inc(&tmp_adev->gpu_reset_counter); 5534 } 5535 5536 if (need_emergency_restart) 5537 goto skip_sched_resume; 5538 5539 /* 5540 * Must check the guilty signal here since after this point all old 5541 * HW fences are force signaled.
5542 * 5543 * job->base holds a reference to the parent fence 5544 */ 5545 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5546 job_signaled = true; 5547 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5548 goto skip_hw_reset; 5549 } 5550 5551 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5552 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5553 if (gpu_reset_for_dev_remove) { 5554 /* Workaround for ASICs that need to disable SMC first */ 5555 amdgpu_device_smu_fini_early(tmp_adev); 5556 } 5557 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5558 /* TODO: should we stop? */ 5559 if (r) { 5560 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5561 r, adev_to_drm(tmp_adev)->unique); 5562 tmp_adev->asic_reset_res = r; 5563 } 5564 5565 /* 5566 * Drop all pending non scheduler resets. Scheduler resets 5567 * were already dropped during drm_sched_stop 5568 */ 5569 amdgpu_device_stop_pending_resets(tmp_adev); 5570 } 5571 5572 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5573 /* Actual ASIC resets if needed. */ 5574 /* Host driver will handle XGMI hive reset for SRIOV */ 5575 if (amdgpu_sriov_vf(adev)) { 5576 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5577 if (r) 5578 adev->asic_reset_res = r; 5579 5580 /* Aldebaran supports ras in SRIOV, so need to resume ras during reset */ 5581 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5582 amdgpu_ras_resume(adev); 5583 } else { 5584 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5585 if (r && r == -EAGAIN) 5586 goto retry; 5587 5588 if (!r && gpu_reset_for_dev_remove) 5589 goto recover_end; 5590 } 5591 5592 skip_hw_reset: 5593 5594 /* Post ASIC reset for all devs. */ 5595 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5596 5597 /* 5598 * Sometimes a later bad compute job can block a good gfx job since the gfx 5599 * and compute rings share internal GC HW mutually. We add an additional 5600 * guilty jobs recheck step to find the real guilty job: it synchronously 5601 * submits and waits for the first job to be signaled. If that times out, 5602 * we identify it as the real guilty job. 5603 */ 5604 if (amdgpu_gpu_recovery == 2 && 5605 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5606 amdgpu_device_recheck_guilty_jobs( 5607 tmp_adev, device_list_handle, reset_context); 5608 5609 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5610 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5611 5612 if (!ring || !ring->sched.thread) 5613 continue; 5614 5615 /* No point to resubmit jobs if we didn't HW reset */ 5616 if (!tmp_adev->asic_reset_res && !job_signaled) 5617 drm_sched_resubmit_jobs(&ring->sched); 5618 5619 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5620 } 5621 5622 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5623 amdgpu_mes_self_test(tmp_adev); 5624 5625 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5626 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5627 } 5628 5629 if (tmp_adev->asic_reset_res) 5630 r = tmp_adev->asic_reset_res; 5631 5632 tmp_adev->asic_reset_res = 0; 5633 5634 if (r) { 5635 /* bad news, how to tell it to userspace?
*/ 5636 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5637 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5638 } else { 5639 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5640 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5641 DRM_WARN("smart shift update failed\n"); 5642 } 5643 } 5644 5645 skip_sched_resume: 5646 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5647 /* unlock kfd: SRIOV would do it separately */ 5648 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5649 amdgpu_amdkfd_post_reset(tmp_adev); 5650 5651 /* kfd_post_reset will do nothing if the kfd device is not initialized, 5652 * so bring up kfd here if it wasn't initialized before 5653 */ 5654 if (!adev->kfd.init_complete) 5655 amdgpu_amdkfd_device_init(adev); 5656 5657 if (audio_suspended) 5658 amdgpu_device_resume_display_audio(tmp_adev); 5659 5660 amdgpu_device_unset_mp1_state(tmp_adev); 5661 } 5662 5663 recover_end: 5664 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5665 reset_list); 5666 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5667 5668 if (hive) { 5669 mutex_unlock(&hive->hive_lock); 5670 amdgpu_put_xgmi_hive(hive); 5671 } 5672 5673 if (r) 5674 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5675 5676 atomic_set(&adev->reset_domain->reset_res, r); 5677 return r; 5678 } 5679 5680 /** 5681 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 5682 * 5683 * @adev: amdgpu_device pointer 5684 * 5685 * Fetches and stores in the driver the PCIe capabilities (gen speed 5686 * and lanes) of the slot the device is in. Handles APUs and 5687 * virtualized environments where PCIe config space may not be available.
5688 */ 5689 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5690 { 5691 struct pci_dev *pdev; 5692 enum pci_bus_speed speed_cap, platform_speed_cap; 5693 enum pcie_link_width platform_link_width; 5694 5695 if (amdgpu_pcie_gen_cap) 5696 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5697 5698 if (amdgpu_pcie_lane_cap) 5699 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5700 5701 /* covers APUs as well */ 5702 if (pci_is_root_bus(adev->pdev->bus)) { 5703 if (adev->pm.pcie_gen_mask == 0) 5704 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5705 if (adev->pm.pcie_mlw_mask == 0) 5706 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5707 return; 5708 } 5709 5710 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5711 return; 5712 5713 pcie_bandwidth_available(adev->pdev, NULL, 5714 &platform_speed_cap, &platform_link_width); 5715 5716 if (adev->pm.pcie_gen_mask == 0) { 5717 /* asic caps */ 5718 pdev = adev->pdev; 5719 speed_cap = pcie_get_speed_cap(pdev); 5720 if (speed_cap == PCI_SPEED_UNKNOWN) { 5721 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5722 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5723 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5724 } else { 5725 if (speed_cap == PCIE_SPEED_32_0GT) 5726 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5727 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5728 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5729 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5730 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5731 else if (speed_cap == PCIE_SPEED_16_0GT) 5732 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5733 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5734 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5735 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5736 else if (speed_cap == PCIE_SPEED_8_0GT) 5737 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5738 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5739 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5740 else if (speed_cap == PCIE_SPEED_5_0GT) 5741 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5742 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5743 else 5744 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5745 } 5746 /* platform caps */ 5747 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5748 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5749 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5750 } else { 5751 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5752 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5753 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5754 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5755 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5756 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5757 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5758 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5759 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5760 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5761 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5762 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5763 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5764 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5765 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5766 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5767 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5768 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5769 else 5770 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5771 5772 } 5773 } 5774 if (adev->pm.pcie_mlw_mask == 0) { 5775 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5776 adev->pm.pcie_mlw_mask 

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
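
/*
 * Illustrative sketch (kept out of the build): the check above is
 * direction-specific, so a hypothetical caller that wants full bidirectional
 * P2P between two devices would evaluate both orderings before enabling it.
 * amdgpu_example_p2p_both_ways() is not a real driver function.
 */
#if 0
static bool amdgpu_example_p2p_both_ways(struct amdgpu_device *a,
					 struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}
#endif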

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
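
/*
 * Illustrative sketch (kept out of the build): the BACO helpers are meant to
 * be used as a matched pair around a low-power period, e.g. from a runtime-PM
 * style path.  amdgpu_example_baco_cycle() is a hypothetical helper with
 * error handling reduced to the minimum; it is not a real driver entry point.
 */
#if 0
static int amdgpu_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... device sits in BACO (bus active, chip off) ... */

	return amdgpu_device_baco_exit(dev);
}
#endif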

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
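
/*
 * Illustrative sketch (kept out of the build): the four callbacks above only
 * take effect once they are plugged into a struct pci_error_handlers that is
 * referenced from the pci_driver; the real table lives outside this file.
 * amdgpu_example_pci_err_handler is a hypothetical name used here just to
 * show the wiring.
 */
#if 0
static const struct pci_error_handlers amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
#endif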

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
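
/*
 * Illustrative sketch (kept out of the build): the usual ordering around CPU
 * access to VRAM through the BAR is to flush the HDP path after the CPU
 * writes data the GPU will read, and to invalidate it before the CPU reads
 * data the GPU has produced.  The helper names below are hypothetical and the
 * exact call sites in the driver vary.
 */
#if 0
static void amdgpu_example_cpu_write_for_gpu(struct amdgpu_device *adev,
					     void __iomem *vram, u32 val)
{
	writel(val, vram);			/* CPU write through the BAR */
	amdgpu_device_flush_hdp(adev, NULL);	/* make it visible to the GPU */
}

static u32 amdgpu_example_cpu_read_from_gpu(struct amdgpu_device *adev,
					    void __iomem *vram)
{
	amdgpu_device_invalidate_hdp(adev, NULL);	/* drop stale HDP data */
	return readl(vram);				/* CPU read through the BAR */
}
#endif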

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
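
/*
 * Illustrative sketch (kept out of the build): the two helpers above form an
 * index/data pair, so a read-modify-write of a PCIe port register can be
 * composed from them.  Note that each helper takes and releases
 * adev->pcie_idx_lock internally, so the composed sequence is not atomic with
 * respect to other users.  amdgpu_example_pcie_port_rmw() is hypothetical.
 */
#if 0
static void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
					 u32 reg, u32 clr, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v &= ~clr;
	v |= set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}
#endif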

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
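
/*
 * Illustrative sketch (kept out of the build): amdgpu_device_switch_gang()
 * returns NULL on success or a referenced fence for the still-running gang
 * leader, so a hypothetical caller that simply wants to block until the
 * switch succeeds could retry as below, dropping the returned reference each
 * time.  amdgpu_example_wait_and_switch_gang() is not a real driver function;
 * reference handling for @gang itself is left to the caller.
 */
#if 0
static int amdgpu_example_wait_and_switch_gang(struct amdgpu_device *adev,
					       struct dma_fence *gang)
{
	struct dma_fence *old;
	long r;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		r = dma_fence_wait(old, true);	/* interruptible wait */
		dma_fence_put(old);		/* drop the returned reference */
		if (r < 0)
			return r;
	}
	return 0;
}
#endif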